1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25/**
26 * @file
27 * DSP utils
28 */
29
30#include "avcodec.h"
31#include "dsputil.h"
32#include "simple_idct.h"
33#include "faandct.h"
34#include "faanidct.h"
35#include "mathops.h"
36#include "mpegvideo.h"
37#include "config.h"
38#include "lpc.h"
39#include "ac3dec.h"
40#include "vorbis.h"
41#include "png.h"
42
43uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44uint32_t ff_squareTbl[512] = {0, };
45
/* The byte 0x7f (resp. 0x80) replicated into every byte of an unsigned long:
 * 0x7f7f7f7f when unsigned long is 32 bits wide, 0x7f7f7f7f7f7f7f7f when it
 * is 64 bits wide, and so on. ~0UL / 0xff yields 0x0101...01. */
#define pb_7f ((~0UL / 0xff) * 0x7fUL)
#define pb_80 ((~0UL / 0xff) * 0x80UL)
49
50const uint8_t ff_zigzag_direct[64] = {
51    0,   1,  8, 16,  9,  2,  3, 10,
52    17, 24, 32, 25, 18, 11,  4,  5,
53    12, 19, 26, 33, 40, 48, 41, 34,
54    27, 20, 13,  6,  7, 14, 21, 28,
55    35, 42, 49, 56, 57, 50, 43, 36,
56    29, 22, 15, 23, 30, 37, 44, 51,
57    58, 59, 52, 45, 38, 31, 39, 46,
58    53, 60, 61, 54, 47, 55, 62, 63
59};
60
61/* Specific zigzag scan for 248 idct. NOTE that unlike the
62   specification, we interleave the fields */
63const uint8_t ff_zigzag248_direct[64] = {
64     0,  8,  1,  9, 16, 24,  2, 10,
65    17, 25, 32, 40, 48, 56, 33, 41,
66    18, 26,  3, 11,  4, 12, 19, 27,
67    34, 42, 49, 57, 50, 58, 35, 43,
68    20, 28,  5, 13,  6, 14, 21, 29,
69    36, 44, 51, 59, 52, 60, 37, 45,
70    22, 30,  7, 15, 23, 31, 38, 46,
71    53, 61, 54, 62, 39, 47, 55, 63,
72};
73
/* Inverse of zigzag_direct, not permutated, with 1 added to each entry;
 * used by the MMX quantizer. No initializer is given here, so the contents
 * have to be filled in elsewhere before use. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
76
77const uint8_t ff_alternate_horizontal_scan[64] = {
78    0,  1,   2,  3,  8,  9, 16, 17,
79    10, 11,  4,  5,  6,  7, 15, 14,
80    13, 12, 19, 18, 24, 25, 32, 33,
81    26, 27, 20, 21, 22, 23, 28, 29,
82    30, 31, 34, 35, 40, 41, 48, 49,
83    42, 43, 36, 37, 38, 39, 44, 45,
84    46, 47, 50, 51, 56, 57, 58, 59,
85    52, 53, 54, 55, 60, 61, 62, 63,
86};
87
88const uint8_t ff_alternate_vertical_scan[64] = {
89    0,  8,  16, 24,  1,  9,  2, 10,
90    17, 25, 32, 40, 48, 56, 57, 49,
91    41, 33, 26, 18,  3, 11,  4, 12,
92    19, 27, 34, 42, 50, 58, 35, 43,
93    51, 59, 20, 28,  5, 13,  6, 14,
94    21, 29, 36, 44, 52, 60, 37, 45,
95    53, 61, 22, 30,  7, 15, 23, 31,
96    38, 46, 54, 62, 39, 47, 55, 63,
97};
98
99/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
100 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
101const uint32_t ff_inverse[257]={
102         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
103 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
104 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
105 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
106 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
107 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
108  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
109  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
110  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
111  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
112  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
113  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
114  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
115  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
116  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
117  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
118  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
119  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
120  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
121  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
122  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
123  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
124  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
125  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
126  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
127  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
128  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
129  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
130  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
131  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
132  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
133  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
134  16777216
135};
136
137/* Input permutation for the simple_idct_mmx */
138static const uint8_t simple_mmx_permutation[64]={
139        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
140        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
141        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
142        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
143        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
144        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
145        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
146        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
147};
148
149static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
150
151void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
152    int i;
153    int end;
154
155    st->scantable= src_scantable;
156
157    for(i=0; i<64; i++){
158        int j;
159        j = src_scantable[i];
160        st->permutated[i] = permutation[j];
161#if ARCH_PPC
162        st->inverse[j] = i;
163#endif
164    }
165
166    end=-1;
167    for(i=0; i<64; i++){
168        int j;
169        j = st->permutated[i];
170        if(j>end) end=j;
171        st->raster_end[i]= end;
172    }
173}
174
175static int pix_sum_c(uint8_t * pix, int line_size)
176{
177    int s, i, j;
178
179    s = 0;
180    for (i = 0; i < 16; i++) {
181        for (j = 0; j < 16; j += 8) {
182            s += pix[0];
183            s += pix[1];
184            s += pix[2];
185            s += pix[3];
186            s += pix[4];
187            s += pix[5];
188            s += pix[6];
189            s += pix[7];
190            pix += 8;
191        }
192        pix += line_size - 16;
193    }
194    return s;
195}
196
197static int pix_norm1_c(uint8_t * pix, int line_size)
198{
199    int s, i, j;
200    uint32_t *sq = ff_squareTbl + 256;
201
202    s = 0;
203    for (i = 0; i < 16; i++) {
204        for (j = 0; j < 16; j += 8) {
205#if 0
206            s += sq[pix[0]];
207            s += sq[pix[1]];
208            s += sq[pix[2]];
209            s += sq[pix[3]];
210            s += sq[pix[4]];
211            s += sq[pix[5]];
212            s += sq[pix[6]];
213            s += sq[pix[7]];
214#else
215#if LONG_MAX > 2147483647
216            register uint64_t x=*(uint64_t*)pix;
217            s += sq[x&0xff];
218            s += sq[(x>>8)&0xff];
219            s += sq[(x>>16)&0xff];
220            s += sq[(x>>24)&0xff];
221            s += sq[(x>>32)&0xff];
222            s += sq[(x>>40)&0xff];
223            s += sq[(x>>48)&0xff];
224            s += sq[(x>>56)&0xff];
225#else
226            register uint32_t x=*(uint32_t*)pix;
227            s += sq[x&0xff];
228            s += sq[(x>>8)&0xff];
229            s += sq[(x>>16)&0xff];
230            s += sq[(x>>24)&0xff];
231            x=*(uint32_t*)(pix+4);
232            s += sq[x&0xff];
233            s += sq[(x>>8)&0xff];
234            s += sq[(x>>16)&0xff];
235            s += sq[(x>>24)&0xff];
236#endif
237#endif
238            pix += 8;
239        }
240        pix += line_size - 16;
241    }
242    return s;
243}
244
245static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
246    int i;
247
248    for(i=0; i+8<=w; i+=8){
249        dst[i+0]= bswap_32(src[i+0]);
250        dst[i+1]= bswap_32(src[i+1]);
251        dst[i+2]= bswap_32(src[i+2]);
252        dst[i+3]= bswap_32(src[i+3]);
253        dst[i+4]= bswap_32(src[i+4]);
254        dst[i+5]= bswap_32(src[i+5]);
255        dst[i+6]= bswap_32(src[i+6]);
256        dst[i+7]= bswap_32(src[i+7]);
257    }
258    for(;i<w; i++){
259        dst[i+0]= bswap_32(src[i+0]);
260    }
261}
262
263static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
264{
265    int s, i;
266    uint32_t *sq = ff_squareTbl + 256;
267
268    s = 0;
269    for (i = 0; i < h; i++) {
270        s += sq[pix1[0] - pix2[0]];
271        s += sq[pix1[1] - pix2[1]];
272        s += sq[pix1[2] - pix2[2]];
273        s += sq[pix1[3] - pix2[3]];
274        pix1 += line_size;
275        pix2 += line_size;
276    }
277    return s;
278}
279
280static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
281{
282    int s, i;
283    uint32_t *sq = ff_squareTbl + 256;
284
285    s = 0;
286    for (i = 0; i < h; i++) {
287        s += sq[pix1[0] - pix2[0]];
288        s += sq[pix1[1] - pix2[1]];
289        s += sq[pix1[2] - pix2[2]];
290        s += sq[pix1[3] - pix2[3]];
291        s += sq[pix1[4] - pix2[4]];
292        s += sq[pix1[5] - pix2[5]];
293        s += sq[pix1[6] - pix2[6]];
294        s += sq[pix1[7] - pix2[7]];
295        pix1 += line_size;
296        pix2 += line_size;
297    }
298    return s;
299}
300
301static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
302{
303    int s, i;
304    uint32_t *sq = ff_squareTbl + 256;
305
306    s = 0;
307    for (i = 0; i < h; i++) {
308        s += sq[pix1[ 0] - pix2[ 0]];
309        s += sq[pix1[ 1] - pix2[ 1]];
310        s += sq[pix1[ 2] - pix2[ 2]];
311        s += sq[pix1[ 3] - pix2[ 3]];
312        s += sq[pix1[ 4] - pix2[ 4]];
313        s += sq[pix1[ 5] - pix2[ 5]];
314        s += sq[pix1[ 6] - pix2[ 6]];
315        s += sq[pix1[ 7] - pix2[ 7]];
316        s += sq[pix1[ 8] - pix2[ 8]];
317        s += sq[pix1[ 9] - pix2[ 9]];
318        s += sq[pix1[10] - pix2[10]];
319        s += sq[pix1[11] - pix2[11]];
320        s += sq[pix1[12] - pix2[12]];
321        s += sq[pix1[13] - pix2[13]];
322        s += sq[pix1[14] - pix2[14]];
323        s += sq[pix1[15] - pix2[15]];
324
325        pix1 += line_size;
326        pix2 += line_size;
327    }
328    return s;
329}
330
331/* draw the edges of width 'w' of an image of size width, height */
332//FIXME check that this is ok for mpeg4 interlaced
333static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
334{
335    uint8_t *ptr, *last_line;
336    int i;
337
338    last_line = buf + (height - 1) * wrap;
339    for(i=0;i<w;i++) {
340        /* top and bottom */
341        memcpy(buf - (i + 1) * wrap, buf, width);
342        memcpy(last_line + (i + 1) * wrap, last_line, width);
343    }
344    /* left and right */
345    ptr = buf;
346    for(i=0;i<height;i++) {
347        memset(ptr - w, ptr[0], w);
348        memset(ptr + width, ptr[width-1], w);
349        ptr += wrap;
350    }
351    /* corners */
352    for(i=0;i<w;i++) {
353        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
354        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
355        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
356        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
357    }
358}
359
360/**
361 * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
362 * @param buf destination buffer
363 * @param src source buffer
364 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
365 * @param block_w width of block
366 * @param block_h height of block
367 * @param src_x x coordinate of the top left sample of the block in the source buffer
368 * @param src_y y coordinate of the top left sample of the block in the source buffer
369 * @param w width of the source buffer
370 * @param h height of the source buffer
371 */
372void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
373                                    int src_x, int src_y, int w, int h){
374    int x, y;
375    int start_y, start_x, end_y, end_x;
376
377    if(src_y>= h){
378        src+= (h-1-src_y)*linesize;
379        src_y=h-1;
380    }else if(src_y<=-block_h){
381        src+= (1-block_h-src_y)*linesize;
382        src_y=1-block_h;
383    }
384    if(src_x>= w){
385        src+= (w-1-src_x);
386        src_x=w-1;
387    }else if(src_x<=-block_w){
388        src+= (1-block_w-src_x);
389        src_x=1-block_w;
390    }
391
392    start_y= FFMAX(0, -src_y);
393    start_x= FFMAX(0, -src_x);
394    end_y= FFMIN(block_h, h-src_y);
395    end_x= FFMIN(block_w, w-src_x);
396
397    // copy existing part
398    for(y=start_y; y<end_y; y++){
399        for(x=start_x; x<end_x; x++){
400            buf[x + y*linesize]= src[x + y*linesize];
401        }
402    }
403
404    //top
405    for(y=0; y<start_y; y++){
406        for(x=start_x; x<end_x; x++){
407            buf[x + y*linesize]= buf[x + start_y*linesize];
408        }
409    }
410
411    //bottom
412    for(y=end_y; y<block_h; y++){
413        for(x=start_x; x<end_x; x++){
414            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
415        }
416    }
417
418    for(y=0; y<block_h; y++){
419       //left
420        for(x=0; x<start_x; x++){
421            buf[x + y*linesize]= buf[start_x + y*linesize];
422        }
423
424       //right
425        for(x=end_x; x<block_w; x++){
426            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
427        }
428    }
429}
430
431static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
432{
433    int i;
434
435    /* read the pixels */
436    for(i=0;i<8;i++) {
437        block[0] = pixels[0];
438        block[1] = pixels[1];
439        block[2] = pixels[2];
440        block[3] = pixels[3];
441        block[4] = pixels[4];
442        block[5] = pixels[5];
443        block[6] = pixels[6];
444        block[7] = pixels[7];
445        pixels += line_size;
446        block += 8;
447    }
448}
449
450static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
451                          const uint8_t *s2, int stride){
452    int i;
453
454    /* read the pixels */
455    for(i=0;i<8;i++) {
456        block[0] = s1[0] - s2[0];
457        block[1] = s1[1] - s2[1];
458        block[2] = s1[2] - s2[2];
459        block[3] = s1[3] - s2[3];
460        block[4] = s1[4] - s2[4];
461        block[5] = s1[5] - s2[5];
462        block[6] = s1[6] - s2[6];
463        block[7] = s1[7] - s2[7];
464        s1 += stride;
465        s2 += stride;
466        block += 8;
467    }
468}
469
470
471static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
472                                 int line_size)
473{
474    int i;
475    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
476
477    /* read the pixels */
478    for(i=0;i<8;i++) {
479        pixels[0] = cm[block[0]];
480        pixels[1] = cm[block[1]];
481        pixels[2] = cm[block[2]];
482        pixels[3] = cm[block[3]];
483        pixels[4] = cm[block[4]];
484        pixels[5] = cm[block[5]];
485        pixels[6] = cm[block[6]];
486        pixels[7] = cm[block[7]];
487
488        pixels += line_size;
489        block += 8;
490    }
491}
492
493static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
494                                 int line_size)
495{
496    int i;
497    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
498
499    /* read the pixels */
500    for(i=0;i<4;i++) {
501        pixels[0] = cm[block[0]];
502        pixels[1] = cm[block[1]];
503        pixels[2] = cm[block[2]];
504        pixels[3] = cm[block[3]];
505
506        pixels += line_size;
507        block += 8;
508    }
509}
510
511static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
512                                 int line_size)
513{
514    int i;
515    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
516
517    /* read the pixels */
518    for(i=0;i<2;i++) {
519        pixels[0] = cm[block[0]];
520        pixels[1] = cm[block[1]];
521
522        pixels += line_size;
523        block += 8;
524    }
525}
526
527static void put_signed_pixels_clamped_c(const DCTELEM *block,
528                                        uint8_t *restrict pixels,
529                                        int line_size)
530{
531    int i, j;
532
533    for (i = 0; i < 8; i++) {
534        for (j = 0; j < 8; j++) {
535            if (*block < -128)
536                *pixels = 0;
537            else if (*block > 127)
538                *pixels = 255;
539            else
540                *pixels = (uint8_t)(*block + 128);
541            block++;
542            pixels++;
543        }
544        pixels += (line_size - 8);
545    }
546}
547
548static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
549                                    int line_size)
550{
551    int i;
552
553    /* read the pixels */
554    for(i=0;i<8;i++) {
555        pixels[0] = block[0];
556        pixels[1] = block[1];
557        pixels[2] = block[2];
558        pixels[3] = block[3];
559        pixels[4] = block[4];
560        pixels[5] = block[5];
561        pixels[6] = block[6];
562        pixels[7] = block[7];
563
564        pixels += line_size;
565        block += 8;
566    }
567}
568
569static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
570                          int line_size)
571{
572    int i;
573    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
574
575    /* read the pixels */
576    for(i=0;i<8;i++) {
577        pixels[0] = cm[pixels[0] + block[0]];
578        pixels[1] = cm[pixels[1] + block[1]];
579        pixels[2] = cm[pixels[2] + block[2]];
580        pixels[3] = cm[pixels[3] + block[3]];
581        pixels[4] = cm[pixels[4] + block[4]];
582        pixels[5] = cm[pixels[5] + block[5]];
583        pixels[6] = cm[pixels[6] + block[6]];
584        pixels[7] = cm[pixels[7] + block[7]];
585        pixels += line_size;
586        block += 8;
587    }
588}
589
590static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
591                          int line_size)
592{
593    int i;
594    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
595
596    /* read the pixels */
597    for(i=0;i<4;i++) {
598        pixels[0] = cm[pixels[0] + block[0]];
599        pixels[1] = cm[pixels[1] + block[1]];
600        pixels[2] = cm[pixels[2] + block[2]];
601        pixels[3] = cm[pixels[3] + block[3]];
602        pixels += line_size;
603        block += 8;
604    }
605}
606
607static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
608                          int line_size)
609{
610    int i;
611    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
612
613    /* read the pixels */
614    for(i=0;i<2;i++) {
615        pixels[0] = cm[pixels[0] + block[0]];
616        pixels[1] = cm[pixels[1] + block[1]];
617        pixels += line_size;
618        block += 8;
619    }
620}
621
622static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
623{
624    int i;
625    for(i=0;i<8;i++) {
626        pixels[0] += block[0];
627        pixels[1] += block[1];
628        pixels[2] += block[2];
629        pixels[3] += block[3];
630        pixels[4] += block[4];
631        pixels[5] += block[5];
632        pixels[6] += block[6];
633        pixels[7] += block[7];
634        pixels += line_size;
635        block += 8;
636    }
637}
638
639static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
640{
641    int i;
642    for(i=0;i<4;i++) {
643        pixels[0] += block[0];
644        pixels[1] += block[1];
645        pixels[2] += block[2];
646        pixels[3] += block[3];
647        pixels += line_size;
648        block += 4;
649    }
650}
651
652static int sum_abs_dctelem_c(DCTELEM *block)
653{
654    int sum=0, i;
655    for(i=0; i<64; i++)
656        sum+= FFABS(block[i]);
657    return sum;
658}
659
660static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
661{
662    int i;
663
664    for (i = 0; i < h; i++) {
665        memset(block, value, 16);
666        block += line_size;
667    }
668}
669
670static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
671{
672    int i;
673
674    for (i = 0; i < h; i++) {
675        memset(block, value, 8);
676        block += line_size;
677    }
678}
679
680static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
681{
682    int i, j;
683    uint16_t *dst1 = (uint16_t *) dst;
684    uint16_t *dst2 = (uint16_t *)(dst + linesize);
685
686    for (j = 0; j < 8; j++) {
687        for (i = 0; i < 8; i++) {
688            dst1[i] = dst2[i] = src[i] * 0x0101;
689        }
690        src  += 8;
691        dst1 += linesize;
692        dst2 += linesize;
693    }
694}
695
696#if 0
697
/*
 * PIXOP2(OPNAME, OP): generator for block copy/average helpers that work on
 * one 64-bit word (8 samples) per row.
 *
 * OPNAME is the prefix of the generated function names and OP(dst, val) is
 * the store operation (see op_avg below for the averaging store). The
 * generated functions take a destination block, a source, the row stride
 * line_size shared by both, and the number of rows h. The *_x2, *_y2 and
 * *_xy2 variants average each sample with its right, lower, or right+lower
 * neighbours; the "no_rnd" variants use the averaging forms with the smaller
 * rounding constant.
 *
 * The plain copy helper is named OPNAME ## _pixels_c so that it matches the
 * name the CALL_2X_PIXELS() line at the end refers to; it was previously
 * defined as OPNAME ## _pixels, which left that reference dangling.
 *
 * NOTE(review): this variant sits in a preprocessor branch that is compiled
 * out, so none of it is built or tested at the moment.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Averaging store for eight packed bytes: a = (a | b) - (((a ^ b) & 0xFE..FE) >> 1),
 * i.e. the byte-wise average of a and b rounded up. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
838#else // 64 bit variant
839
840#define PIXOP2(OPNAME, OP) \
841static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842    int i;\
843    for(i=0; i<h; i++){\
844        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
845        pixels+=line_size;\
846        block +=line_size;\
847    }\
848}\
849static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
850    int i;\
851    for(i=0; i<h; i++){\
852        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
853        pixels+=line_size;\
854        block +=line_size;\
855    }\
856}\
857static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
858    int i;\
859    for(i=0; i<h; i++){\
860        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
861        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
862        pixels+=line_size;\
863        block +=line_size;\
864    }\
865}\
866static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
867    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
868}\
869\
870static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
871                                                int src_stride1, int src_stride2, int h){\
872    int i;\
873    for(i=0; i<h; i++){\
874        uint32_t a,b;\
875        a= AV_RN32(&src1[i*src_stride1  ]);\
876        b= AV_RN32(&src2[i*src_stride2  ]);\
877        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
878        a= AV_RN32(&src1[i*src_stride1+4]);\
879        b= AV_RN32(&src2[i*src_stride2+4]);\
880        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
881    }\
882}\
883\
884static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
885                                                int src_stride1, int src_stride2, int h){\
886    int i;\
887    for(i=0; i<h; i++){\
888        uint32_t a,b;\
889        a= AV_RN32(&src1[i*src_stride1  ]);\
890        b= AV_RN32(&src2[i*src_stride2  ]);\
891        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
892        a= AV_RN32(&src1[i*src_stride1+4]);\
893        b= AV_RN32(&src2[i*src_stride2+4]);\
894        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
895    }\
896}\
897\
898static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
899                                                int src_stride1, int src_stride2, int h){\
900    int i;\
901    for(i=0; i<h; i++){\
902        uint32_t a,b;\
903        a= AV_RN32(&src1[i*src_stride1  ]);\
904        b= AV_RN32(&src2[i*src_stride2  ]);\
905        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
906    }\
907}\
908\
909static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
910                                                int src_stride1, int src_stride2, int h){\
911    int i;\
912    for(i=0; i<h; i++){\
913        uint32_t a,b;\
914        a= AV_RN16(&src1[i*src_stride1  ]);\
915        b= AV_RN16(&src2[i*src_stride2  ]);\
916        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
917    }\
918}\
919\
920static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
921                                                int src_stride1, int src_stride2, int h){\
922    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
923    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
924}\
925\
926static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
927                                                int src_stride1, int src_stride2, int h){\
928    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
929    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
930}\
931\
932static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
933    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
934}\
935\
936static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
937    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
938}\
939\
940static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
941    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
942}\
943\
944static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
945    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
946}\
947\
948static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
949                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
950    int i;\
951    for(i=0; i<h; i++){\
952        uint32_t a, b, c, d, l0, l1, h0, h1;\
953        a= AV_RN32(&src1[i*src_stride1]);\
954        b= AV_RN32(&src2[i*src_stride2]);\
955        c= AV_RN32(&src3[i*src_stride3]);\
956        d= AV_RN32(&src4[i*src_stride4]);\
957        l0=  (a&0x03030303UL)\
958           + (b&0x03030303UL)\
959           + 0x02020202UL;\
960        h0= ((a&0xFCFCFCFCUL)>>2)\
961          + ((b&0xFCFCFCFCUL)>>2);\
962        l1=  (c&0x03030303UL)\
963           + (d&0x03030303UL);\
964        h1= ((c&0xFCFCFCFCUL)>>2)\
965          + ((d&0xFCFCFCFCUL)>>2);\
966        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
967        a= AV_RN32(&src1[i*src_stride1+4]);\
968        b= AV_RN32(&src2[i*src_stride2+4]);\
969        c= AV_RN32(&src3[i*src_stride3+4]);\
970        d= AV_RN32(&src4[i*src_stride4+4]);\
971        l0=  (a&0x03030303UL)\
972           + (b&0x03030303UL)\
973           + 0x02020202UL;\
974        h0= ((a&0xFCFCFCFCUL)>>2)\
975          + ((b&0xFCFCFCFCUL)>>2);\
976        l1=  (c&0x03030303UL)\
977           + (d&0x03030303UL);\
978        h1= ((c&0xFCFCFCFCUL)>>2)\
979          + ((d&0xFCFCFCFCUL)>>2);\
980        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
981    }\
982}\
983\
984static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
985    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
986}\
987\
988static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
989    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
990}\
991\
992static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
993    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
994}\
995\
996static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
997    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
998}\
999\
1000static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1001                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1002    int i;\
1003    for(i=0; i<h; i++){\
1004        uint32_t a, b, c, d, l0, l1, h0, h1;\
1005        a= AV_RN32(&src1[i*src_stride1]);\
1006        b= AV_RN32(&src2[i*src_stride2]);\
1007        c= AV_RN32(&src3[i*src_stride3]);\
1008        d= AV_RN32(&src4[i*src_stride4]);\
1009        l0=  (a&0x03030303UL)\
1010           + (b&0x03030303UL)\
1011           + 0x01010101UL;\
1012        h0= ((a&0xFCFCFCFCUL)>>2)\
1013          + ((b&0xFCFCFCFCUL)>>2);\
1014        l1=  (c&0x03030303UL)\
1015           + (d&0x03030303UL);\
1016        h1= ((c&0xFCFCFCFCUL)>>2)\
1017          + ((d&0xFCFCFCFCUL)>>2);\
1018        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1019        a= AV_RN32(&src1[i*src_stride1+4]);\
1020        b= AV_RN32(&src2[i*src_stride2+4]);\
1021        c= AV_RN32(&src3[i*src_stride3+4]);\
1022        d= AV_RN32(&src4[i*src_stride4+4]);\
1023        l0=  (a&0x03030303UL)\
1024           + (b&0x03030303UL)\
1025           + 0x01010101UL;\
1026        h0= ((a&0xFCFCFCFCUL)>>2)\
1027          + ((b&0xFCFCFCFCUL)>>2);\
1028        l1=  (c&0x03030303UL)\
1029           + (d&0x03030303UL);\
1030        h1= ((c&0xFCFCFCFCUL)>>2)\
1031          + ((d&0xFCFCFCFCUL)>>2);\
1032        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1033    }\
1034}\
1035static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1036                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1037    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1038    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1039}\
1040static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1041                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1042    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1043    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1044}\
1045\
1046static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1047{\
1048        int i, a0, b0, a1, b1;\
1049        a0= pixels[0];\
1050        b0= pixels[1] + 2;\
1051        a0 += b0;\
1052        b0 += pixels[2];\
1053\
1054        pixels+=line_size;\
1055        for(i=0; i<h; i+=2){\
1056            a1= pixels[0];\
1057            b1= pixels[1];\
1058            a1 += b1;\
1059            b1 += pixels[2];\
1060\
1061            block[0]= (a1+a0)>>2; /* FIXME non put */\
1062            block[1]= (b1+b0)>>2;\
1063\
1064            pixels+=line_size;\
1065            block +=line_size;\
1066\
1067            a0= pixels[0];\
1068            b0= pixels[1] + 2;\
1069            a0 += b0;\
1070            b0 += pixels[2];\
1071\
1072            block[0]= (a1+a0)>>2;\
1073            block[1]= (b1+b0)>>2;\
1074            pixels+=line_size;\
1075            block +=line_size;\
1076        }\
1077}\
1078\
1079static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1080{\
1081        int i;\
1082        const uint32_t a= AV_RN32(pixels  );\
1083        const uint32_t b= AV_RN32(pixels+1);\
1084        uint32_t l0=  (a&0x03030303UL)\
1085                    + (b&0x03030303UL)\
1086                    + 0x02020202UL;\
1087        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1088                   + ((b&0xFCFCFCFCUL)>>2);\
1089        uint32_t l1,h1;\
1090\
1091        pixels+=line_size;\
1092        for(i=0; i<h; i+=2){\
1093            uint32_t a= AV_RN32(pixels  );\
1094            uint32_t b= AV_RN32(pixels+1);\
1095            l1=  (a&0x03030303UL)\
1096               + (b&0x03030303UL);\
1097            h1= ((a&0xFCFCFCFCUL)>>2)\
1098              + ((b&0xFCFCFCFCUL)>>2);\
1099            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100            pixels+=line_size;\
1101            block +=line_size;\
1102            a= AV_RN32(pixels  );\
1103            b= AV_RN32(pixels+1);\
1104            l0=  (a&0x03030303UL)\
1105               + (b&0x03030303UL)\
1106               + 0x02020202UL;\
1107            h0= ((a&0xFCFCFCFCUL)>>2)\
1108              + ((b&0xFCFCFCFCUL)>>2);\
1109            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1110            pixels+=line_size;\
1111            block +=line_size;\
1112        }\
1113}\
1114\
1115static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1116{\
1117    int j;\
1118    for(j=0; j<2; j++){\
1119        int i;\
1120        const uint32_t a= AV_RN32(pixels  );\
1121        const uint32_t b= AV_RN32(pixels+1);\
1122        uint32_t l0=  (a&0x03030303UL)\
1123                    + (b&0x03030303UL)\
1124                    + 0x02020202UL;\
1125        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1126                   + ((b&0xFCFCFCFCUL)>>2);\
1127        uint32_t l1,h1;\
1128\
1129        pixels+=line_size;\
1130        for(i=0; i<h; i+=2){\
1131            uint32_t a= AV_RN32(pixels  );\
1132            uint32_t b= AV_RN32(pixels+1);\
1133            l1=  (a&0x03030303UL)\
1134               + (b&0x03030303UL);\
1135            h1= ((a&0xFCFCFCFCUL)>>2)\
1136              + ((b&0xFCFCFCFCUL)>>2);\
1137            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1138            pixels+=line_size;\
1139            block +=line_size;\
1140            a= AV_RN32(pixels  );\
1141            b= AV_RN32(pixels+1);\
1142            l0=  (a&0x03030303UL)\
1143               + (b&0x03030303UL)\
1144               + 0x02020202UL;\
1145            h0= ((a&0xFCFCFCFCUL)>>2)\
1146              + ((b&0xFCFCFCFCUL)>>2);\
1147            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1148            pixels+=line_size;\
1149            block +=line_size;\
1150        }\
1151        pixels+=4-line_size*(h+1);\
1152        block +=4-line_size*h;\
1153    }\
1154}\
1155\
1156static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1157{\
1158    int j;\
1159    for(j=0; j<2; j++){\
1160        int i;\
1161        const uint32_t a= AV_RN32(pixels  );\
1162        const uint32_t b= AV_RN32(pixels+1);\
1163        uint32_t l0=  (a&0x03030303UL)\
1164                    + (b&0x03030303UL)\
1165                    + 0x01010101UL;\
1166        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1167                   + ((b&0xFCFCFCFCUL)>>2);\
1168        uint32_t l1,h1;\
1169\
1170        pixels+=line_size;\
1171        for(i=0; i<h; i+=2){\
1172            uint32_t a= AV_RN32(pixels  );\
1173            uint32_t b= AV_RN32(pixels+1);\
1174            l1=  (a&0x03030303UL)\
1175               + (b&0x03030303UL);\
1176            h1= ((a&0xFCFCFCFCUL)>>2)\
1177              + ((b&0xFCFCFCFCUL)>>2);\
1178            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1179            pixels+=line_size;\
1180            block +=line_size;\
1181            a= AV_RN32(pixels  );\
1182            b= AV_RN32(pixels+1);\
1183            l0=  (a&0x03030303UL)\
1184               + (b&0x03030303UL)\
1185               + 0x01010101UL;\
1186            h0= ((a&0xFCFCFCFCUL)>>2)\
1187              + ((b&0xFCFCFCFCUL)>>2);\
1188            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189            pixels+=line_size;\
1190            block +=line_size;\
1191        }\
1192        pixels+=4-line_size*(h+1);\
1193        block +=4-line_size*h;\
1194    }\
1195}\
1196\
1197CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1198CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1199CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1200CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1201CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1202CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1203CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1204CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1205
1206#define op_avg(a, b) a = rnd_avg32(a, b)
1207#endif
/* "put" operation: store the computed value b into destination a unchanged. */
#define op_put(a, b) a = b

/* Instantiate the PIXOP2 function family twice: once with the "avg" name
 * prefix and op_avg, once with the "put" name prefix and op_put.  The helper
 * op macros are undefined afterwards so the names can be reused later. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1214
/* Rounded integer averages: avg2 adds 1 before halving, avg4 adds 2 before
 * quartering.  Every argument is parenthesised so that expressions with
 * low-precedence operators (|, <<, ?:) can be passed safely. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1217
/* Single-stride entry point for put_no_rnd_pixels16_l2(): the one stride
 * argument is used for the destination and for both source planes. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride,  /* dst_stride  */
                           stride,  /* src_stride1 */
                           stride,  /* src_stride2 */
                           h);
}
1221
/* Single-stride entry point for put_no_rnd_pixels8_l2(): the one stride
 * argument is used for the destination and for both source planes. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride,  /* dst_stride  */
                          stride,  /* src_stride1 */
                          stride,  /* src_stride2 */
                          h);
}
1225
/**
 * Bilinear blend of an 8-pixel-wide, h-row block.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour and the two pixels one stride below them.  The four weights are
 * built from x16 and y16 and always add up to 256, hence the final >>8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes are read from each of rows i and i+1
 * @param stride  byte distance between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples (out of 16)
 * @param y16     vertical weight of the lower samples (out of 16)
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16-x16)*(16-y16);
    const int w_tr = (   x16)*(16-y16);
    const int w_bl = (16-x16)*(   y16);
    const int w_br = (   x16)*(   y16);
    int row;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl*src[col]   + w_tr*src[col+1] +
                        w_bl*below[col] + w_br*below[col+1] + rounder) >> 8;
        }
    }
}
1248
/**
 * Warp an 8-pixel-wide, h-row block from src into dst.
 *
 * A source position (vx, vy) is tracked per output pixel: it starts at
 * (ox, oy), advances by (dxx, dyx) per column and the row start advances by
 * (dxy, dyy) per row.  After dropping the low 16 bits, the lowest `shift`
 * bits of each coordinate are the fraction and the remaining bits the
 * integer sample position.
 *
 * Positions with 0 <= x < width-1 and 0 <= y < height-1 are bilinearly
 * interpolated in both directions.  If only one coordinate is in that range,
 * interpolation is done along that axis only and the other coordinate is
 * clamped with av_clip().  If neither is, the clamped sample is copied.
 *
 * @param dst    destination block
 * @param src    source plane
 * @param stride byte distance between rows, shared by dst and src
 * @param h      number of rows to produce
 * @param shift  number of fractional bits in the coordinates
 * @param r      value added before the final >> (2*shift)
 * @param width  source width used for the range test and clamping
 * @param height source height used for the range test and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1<<shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int y;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + y*stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            int px = vx>>16;
            int py = vy>>16;
            const int fx = px & frac_mask;
            const int fy = py & frac_mask;
            int x_inside, y_inside, index;

            px >>= shift;
            py >>= shift;

            /* the unsigned compare also rejects negative positions */
            x_inside = (unsigned)px < max_x;
            y_inside = (unsigned)py < max_y;

            if (x_inside && y_inside) {
                int top, bottom;

                index  = px + py*stride;
                top    = src[index         ]*(s-fx) + src[index       +1]*fx;
                bottom = src[index+stride  ]*(s-fx) + src[index+stride+1]*fx;
                out[x] = (top*(s-fy) + bottom*fy + r)>>(shift*2);
            } else if (x_inside) {
                /* horizontal interpolation only, row clamped */
                index  = px + av_clip(py, 0, max_y)*stride;
                out[x] = ((src[index]*(s-fx) + src[index+1]*fx)*s + r)>>(shift*2);
            } else if (y_inside) {
                /* vertical interpolation only, column clamped */
                index  = av_clip(px, 0, max_x) + py*stride;
                out[x] = ((src[index]*(s-fy) + src[index+stride]*fy)*s + r)>>(shift*2);
            } else {
                /* both clamped: plain copy of the nearest sample */
                index  = av_clip(px, 0, max_x) + av_clip(py, 0, max_y)*stride;
                out[x] = src[index];
            }
        }
    }
}
1306
/* Full-pel "put": dispatch on block width to the matching put_pixelsN_c()
 * copy routine.  Widths other than 2, 4, 8 and 16 do nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1315
/* Horizontal blend, "put": weights 2:1 for the pixel and its right
 * neighbour, +1 rounding; 683/2048 is the fixed-point stand-in for 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1326
/* Horizontal blend, "put": weights 1:2 for the pixel and its right
 * neighbour, +1 rounding; 683/2048 is the fixed-point stand-in for 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1337
/* Vertical blend, "put": weights 2:1 for the pixel and the one a stride
 * below it, +1 rounding; 683/2048 is the fixed-point stand-in for 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1348
/* Two-dimensional blend, "put": weights 4/3/3/2 for the pixel, its right
 * neighbour, the pixel below and the lower-right one, +6 rounding;
 * 2731/32768 is the fixed-point stand-in for 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1] +
                                 3*src[col+stride] + 2*src[col+stride+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1359
/* Two-dimensional blend, "put": weights 3/2/4/3 for the pixel, its right
 * neighbour, the pixel below and the lower-right one, +6 rounding;
 * 2731/32768 is the fixed-point stand-in for 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1] +
                                 4*src[col+stride] + 3*src[col+stride+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1370
/* Vertical blend, "put": weights 1:2 for the pixel and the one a stride
 * below it, +1 rounding; 683/2048 is the fixed-point stand-in for 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;

            dst[col] = (683*weighted) >> 11;
        }
    }
}
1381
/* Two-dimensional blend, "put": weights 3/4/2/3 for the pixel, its right
 * neighbour, the pixel below and the lower-right one, +6 rounding;
 * 2731/32768 is the fixed-point stand-in for 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1] +
                                 2*src[col+stride] + 3*src[col+stride+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1392
/* Two-dimensional blend, "put": weights 2/3/3/4 for the pixel, its right
 * neighbour, the pixel below and the lower-right one, +6 rounding;
 * 2731/32768 is the fixed-point stand-in for 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1] +
                                 3*src[col+stride] + 4*src[col+stride+1] + 6;

            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1403
/* Full-pel "avg": dispatch on block width to the matching avg_pixelsN_c()
 * routine.  Widths other than 2, 4, 8 and 16 do nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1412
/* Horizontal blend, "avg": interpolates with weights 2:1 (pixel : right
 * neighbour, 683/2048 standing in for 1/3), then stores the rounded-up mean
 * of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1423
/* Horizontal blend, "avg": interpolates with weights 1:2 (pixel : right
 * neighbour, 683/2048 standing in for 1/3), then stores the rounded-up mean
 * of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1434
/* Vertical blend, "avg": interpolates with weights 2:1 (pixel : pixel one
 * stride below, 683/2048 standing in for 1/3), then stores the rounded-up
 * mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1445
/* Two-dimensional blend, "avg": interpolates with weights 4/3/3/2 (pixel,
 * right, below, lower-right; 2731/32768 standing in for 1/12), then stores
 * the rounded-up mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1] +
                                 3*src[col+stride] + 2*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1456
/* Two-dimensional blend, "avg": interpolates with weights 3/2/4/3 (pixel,
 * right, below, lower-right; 2731/32768 standing in for 1/12), then stores
 * the rounded-up mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1] +
                                 4*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1467
/* Vertical blend, "avg": interpolates with weights 1:2 (pixel : pixel one
 * stride below, 683/2048 standing in for 1/3), then stores the rounded-up
 * mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1478
/* Two-dimensional blend, "avg": interpolates with weights 3/4/2/3 (pixel,
 * right, below, lower-right; 2731/32768 standing in for 1/12), then stores
 * the rounded-up mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1] +
                                 2*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1489
/* Two-dimensional blend, "avg": interpolates with weights 2/3/3/4 (pixel,
 * right, below, lower-right; 2731/32768 standing in for 1/12), then stores
 * the rounded-up mean of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1] +
                                 3*src[col+stride] + 4*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/*
 * Disabled generator for fixed-width "put" wrappers: TPEL_WIDTH(N) defines
 * put_tpel_pixelsN_mcXY_c(dst, src, stride, height) for each of the nine
 * put_tpel_pixels_mcXY_c() routines and forwards to it with width = N.
 * Each body is a plain call; a leading "void" there would turn the statement
 * into an invalid declaration and the macro would not compile when enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1521
/*
 * Generator for the bilinear chroma motion-compensation functions
 * OPNAME##h264_chroma_mc{2,4,8}_c (block widths 2, 4 and 8).
 *
 * Parameters of every generated function:
 *   dst     destination block
 *   src     source block
 *   stride  byte distance between rows, shared by dst and src
 *   h       number of rows to produce
 *   x, y    fractional offsets; asserted to lie in 0..7
 *
 * The four weights A, B, C, D apply to the sample, its right neighbour, the
 * sample one stride below and the lower-right one; they always add up to 64.
 * OP(dst_pixel, sum) receives the unscaled weighted sum and is responsible
 * for rounding, scaling and storing.
 *
 * Three paths are used so that only samples with a non-zero weight are read:
 *   D != 0        full two-dimensional blend
 *   B + C != 0    one-dimensional blend (horizontal if C == 0, else vertical)
 *   otherwise     x == 0 and y == 0: only A*src[k] is used, so no sample to
 *                 the right of or below the block is touched
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            OP(dst[2], A*src[2]);\
            OP(dst[3], A*src[3]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        for(i=0; i<h; i++){\
            OP(dst[0], A*src[0]);\
            OP(dst[1], A*src[1]);\
            OP(dst[2], A*src[2]);\
            OP(dst[3], A*src[3]);\
            OP(dst[4], A*src[4]);\
            OP(dst[5], A*src[5]);\
            OP(dst[6], A*src[6]);\
            OP(dst[7], A*src[7]);\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* The weighted sum is scaled by 64: add 32 and shift right by 6 to round to
 * nearest.  op_avg additionally takes the rounded-up mean with the pixel
 * already present in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1630
/**
 * 8-pixel-wide bilinear chroma blend, "put" variant with reduced rounding.
 *
 * The four weights are built from x and y (each asserted to lie in 0..7)
 * and add up to 64.  The bias added before >>6 is 32 - 4 instead of the
 * round-to-nearest value 32.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes are read from each of rows i and i+1
 * @param stride  byte distance between rows, shared by dst and src
 * @param h       number of rows to produce
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8-x)*(8-y);
    const int w_tr = (  x)*(8-y);
    const int w_bl = (8-x)*(  y);
    const int w_br = (  x)*(  y);
    const int bias = 32 - 4;
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl*src[col]   + w_tr*src[col+1] +
                        w_bl*below[col] + w_br*below[col+1] + bias) >> 6;
        }
    }
}
1654
/**
 * 8-pixel-wide bilinear chroma blend, "avg" variant with reduced rounding.
 *
 * Computes the same weighted sum as the "put" variant (weights from x and y,
 * each asserted to lie in 0..7; bias 32 - 4 before >>6) and then stores
 * avg2() of that value and the pixel already present in dst.
 *
 * @param dst     destination, 8 bytes read and written per row
 * @param src     source; 9 bytes are read from each of rows i and i+1
 * @param stride  byte distance between rows, shared by dst and src
 * @param h       number of rows to produce
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8-x)*(8-y);
    const int w_tr = (  x)*(8-y);
    const int w_bl = (8-x)*(  y);
    const int w_br = (  x)*(  y);
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = avg2(dst[col], ((w_tl*src[col]   + w_tr*src[col+1] +
                                        w_bl*below[col] + w_br*below[col+1] + 32 - 4) >> 6));
        }
    }
}
1678
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates one family of quarter-sample
 * motion compensation routines.
 *
 *   r       not referenced by the macro body
 *   OPNAME  prefix pasted onto every generated function name; it also
 *           selects the pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers used
 *           for the final store
 *   RND     pasted between "put" and the helper name, selecting the "put"
 *           flavour used to fill the intermediate buffers
 *   OP      store operator, invoked as OP(destination pixel, filter sum);
 *           it may refer to the local table pointer named cm
 *
 * Generated functions:
 *   - OPNAME mpeg4_qpel{8,16}_{h,v}_lowpass: an 8-tap filter with taps
 *     (-1, 3, -6, 20, 20, -6, 3, -1), i.e. a gain of 32 that OP has to
 *     remove. 9 (resp. 17) input samples produce 8 (resp. 16) outputs;
 *     taps that would fall outside that window are folded back onto
 *     samples inside it.
 *   - OPNAME qpel{8,16}_mcXY_c: entry points built from the low-pass
 *     filters and the pixel averaging helpers. mc20 applies only the
 *     horizontal filter, mc02 only the vertical one, mc22 both.
 *   - ff_ OPNAME qpel{8,16}_mcXY_old_c: non-static variants that blend
 *     four (or two) separately filtered planes.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4]));\
        OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5]));\
        OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6]));\
        OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7]));\
        OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8]));\
        OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8]));\
        OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7]));\
        OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col;\
    for (col = 0; col < 8; col++) {\
        /* fetch the whole 9-sample column before anything is stored */\
        const int s0 = src[0 * srcStride];\
        const int s1 = src[1 * srcStride];\
        const int s2 = src[2 * srcStride];\
        const int s3 = src[3 * srcStride];\
        const int s4 = src[4 * srcStride];\
        const int s5 = src[5 * srcStride];\
        const int s6 = src[6 * srcStride];\
        const int s7 = src[7 * srcStride];\
        const int s8 = src[8 * srcStride];\
        OP(dst[0 * dstStride], (s0 + s1) * 20 - (s0 + s2) * 6 + (s1 + s3) * 3 - (s2 + s4));\
        OP(dst[1 * dstStride], (s1 + s2) * 20 - (s0 + s3) * 6 + (s0 + s4) * 3 - (s1 + s5));\
        OP(dst[2 * dstStride], (s2 + s3) * 20 - (s1 + s4) * 6 + (s0 + s5) * 3 - (s0 + s6));\
        OP(dst[3 * dstStride], (s3 + s4) * 20 - (s2 + s5) * 6 + (s1 + s6) * 3 - (s0 + s7));\
        OP(dst[4 * dstStride], (s4 + s5) * 20 - (s3 + s6) * 6 + (s2 + s7) * 3 - (s1 + s8));\
        OP(dst[5 * dstStride], (s5 + s6) * 20 - (s4 + s7) * 6 + (s3 + s8) * 3 - (s2 + s8));\
        OP(dst[6 * dstStride], (s6 + s7) * 20 - (s5 + s8) * 6 + (s4 + s8) * 3 - (s3 + s7));\
        OP(dst[7 * dstStride], (s7 + s8) * 20 - (s6 + s8) * 6 + (s5 + s7) * 3 - (s4 + s6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int row;\
    \
    for (row = 0; row < h; row++) {\
        OP(dst[ 0], (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]));\
        OP(dst[ 1], (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]));\
        OP(dst[ 2], (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]));\
        OP(dst[ 3], (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]));\
        OP(dst[ 4], (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]));\
        OP(dst[ 5], (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]));\
        OP(dst[ 6], (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]));\
        OP(dst[ 7], (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]));\
        OP(dst[ 8], (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]));\
        OP(dst[ 9], (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]));\
        OP(dst[10], (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]));\
        OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]));\
        OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[ 9] + src[16]));\
        OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16]));\
        OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15]));\
        OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col;\
    for (col = 0; col < 16; col++) {\
        /* fetch the whole 17-sample column before anything is stored */\
        const int s0  = src[ 0 * srcStride];\
        const int s1  = src[ 1 * srcStride];\
        const int s2  = src[ 2 * srcStride];\
        const int s3  = src[ 3 * srcStride];\
        const int s4  = src[ 4 * srcStride];\
        const int s5  = src[ 5 * srcStride];\
        const int s6  = src[ 6 * srcStride];\
        const int s7  = src[ 7 * srcStride];\
        const int s8  = src[ 8 * srcStride];\
        const int s9  = src[ 9 * srcStride];\
        const int s10 = src[10 * srcStride];\
        const int s11 = src[11 * srcStride];\
        const int s12 = src[12 * srcStride];\
        const int s13 = src[13 * srcStride];\
        const int s14 = src[14 * srcStride];\
        const int s15 = src[15 * srcStride];\
        const int s16 = src[16 * srcStride];\
        OP(dst[ 0 * dstStride], (s0  + s1 ) * 20 - (s0  + s2 ) * 6 + (s1  + s3 ) * 3 - (s2  + s4 ));\
        OP(dst[ 1 * dstStride], (s1  + s2 ) * 20 - (s0  + s3 ) * 6 + (s0  + s4 ) * 3 - (s1  + s5 ));\
        OP(dst[ 2 * dstStride], (s2  + s3 ) * 20 - (s1  + s4 ) * 6 + (s0  + s5 ) * 3 - (s0  + s6 ));\
        OP(dst[ 3 * dstStride], (s3  + s4 ) * 20 - (s2  + s5 ) * 6 + (s1  + s6 ) * 3 - (s0  + s7 ));\
        OP(dst[ 4 * dstStride], (s4  + s5 ) * 20 - (s3  + s6 ) * 6 + (s2  + s7 ) * 3 - (s1  + s8 ));\
        OP(dst[ 5 * dstStride], (s5  + s6 ) * 20 - (s4  + s7 ) * 6 + (s3  + s8 ) * 3 - (s2  + s9 ));\
        OP(dst[ 6 * dstStride], (s6  + s7 ) * 20 - (s5  + s8 ) * 6 + (s4  + s9 ) * 3 - (s3  + s10));\
        OP(dst[ 7 * dstStride], (s7  + s8 ) * 20 - (s6  + s9 ) * 6 + (s5  + s10) * 3 - (s4  + s11));\
        OP(dst[ 8 * dstStride], (s8  + s9 ) * 20 - (s7  + s10) * 6 + (s6  + s11) * 3 - (s5  + s12));\
        OP(dst[ 9 * dstStride], (s9  + s10) * 20 - (s8  + s11) * 6 + (s7  + s12) * 3 - (s6  + s13));\
        OP(dst[10 * dstStride], (s10 + s11) * 20 - (s9  + s12) * 6 + (s8  + s13) * 3 - (s7  + s14));\
        OP(dst[11 * dstStride], (s11 + s12) * 20 - (s10 + s13) * 6 + (s9  + s14) * 3 - (s8  + s15));\
        OP(dst[12 * dstStride], (s12 + s13) * 20 - (s11 + s14) * 6 + (s10 + s15) * 3 - (s9  + s16));\
        OP(dst[13 * dstStride], (s13 + s14) * 20 - (s12 + s15) * 6 + (s11 + s16) * 3 - (s10 + s16));\
        OP(dst[14 * dstStride], (s14 + s15) * 20 - (s13 + s16) * 6 + (s12 + s16) * 3 - (s11 + s15));\
        OP(dst[15 * dstStride], (s15 + s16) * 20 - (s14 + s16) * 6 + (s13 + s15) * 3 - (s12 + s14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 entry points ---- */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[8 * 8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, lp, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[8 * 8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src + 1, lp, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lp[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lp, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk, lp, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    copy_block9(blk, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, blk, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lp[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lp, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk + 16, lp, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk, lpH, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk + 1, lpH, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk + 1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk + 16, lpH + 8, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH + 8, lpHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk + 17, lpH + 8, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk + 1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH + 8, lpHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[9 * 8];\
    uint8_t lpHV[8 * 8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH + 8, lpHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpV, lpHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    uint8_t lpV[8 * 8];\
    uint8_t lpHV[8 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, blk + 1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpV, lpHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[9 * 16];\
    uint8_t lpH[9 * 8];\
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, blk + 1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[9 * 8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
/* ---- 16x16 entry points ---- */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[16 * 16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lp, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, lp, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[16 * 16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lp, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src + 1, lp, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lp[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lp, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk, lp, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    copy_block17(blk, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, blk, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lp[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lp, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk + 24, lp, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk, lpH, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk + 1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk + 1, lpH, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk + 1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk + 24, lpH + 16, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH + 16, lpHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk + 1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk + 25, lpH + 16, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk + 1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH + 16, lpHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[17 * 16];\
    uint8_t lpHV[16 * 16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH + 16, lpHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpV, lpHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    uint8_t lpV[16 * 16];\
    uint8_t lpHV[16 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, blk + 1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpV, lpHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[17 * 24];\
    uint8_t lpH[17 * 16];\
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, blk + 1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[17 * 16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}
2161
2162#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2163#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2164#define op_put(a, b) a = cm[((b) + 16)>>5]
2165#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2166
2167QPEL_MC(0, put_       , _       , op_put)
2168QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2169QPEL_MC(0, avg_       , _       , op_avg)
2170//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2171#undef op_avg
2172#undef op_avg_no_rnd
2173#undef op_put
2174#undef op_put_no_rnd
2175
2176#if 1
2177#define H264_LOWPASS(OPNAME, OP, OP2) \
2178static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2179    const int h=2;\
2180    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181    int i;\
2182    for(i=0; i<h; i++)\
2183    {\
2184        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2185        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2186        dst+=dstStride;\
2187        src+=srcStride;\
2188    }\
2189}\
2190\
2191static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2192    const int w=2;\
2193    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2194    int i;\
2195    for(i=0; i<w; i++)\
2196    {\
2197        const int srcB= src[-2*srcStride];\
2198        const int srcA= src[-1*srcStride];\
2199        const int src0= src[0 *srcStride];\
2200        const int src1= src[1 *srcStride];\
2201        const int src2= src[2 *srcStride];\
2202        const int src3= src[3 *srcStride];\
2203        const int src4= src[4 *srcStride];\
2204        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2205        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2206        dst++;\
2207        src++;\
2208    }\
2209}\
2210\
2211static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212    const int h=2;\
2213    const int w=2;\
2214    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215    int i;\
2216    src -= 2*srcStride;\
2217    for(i=0; i<h+5; i++)\
2218    {\
2219        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2220        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2221        tmp+=tmpStride;\
2222        src+=srcStride;\
2223    }\
2224    tmp -= tmpStride*(h+5-2);\
2225    for(i=0; i<w; i++)\
2226    {\
2227        const int tmpB= tmp[-2*tmpStride];\
2228        const int tmpA= tmp[-1*tmpStride];\
2229        const int tmp0= tmp[0 *tmpStride];\
2230        const int tmp1= tmp[1 *tmpStride];\
2231        const int tmp2= tmp[2 *tmpStride];\
2232        const int tmp3= tmp[3 *tmpStride];\
2233        const int tmp4= tmp[4 *tmpStride];\
2234        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2235        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2236        dst++;\
2237        tmp++;\
2238    }\
2239}\
2240static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2241    const int h=4;\
2242    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2243    int i;\
2244    for(i=0; i<h; i++)\
2245    {\
2246        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2247        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2248        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2249        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2250        dst+=dstStride;\
2251        src+=srcStride;\
2252    }\
2253}\
2254\
2255static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2256    const int w=4;\
2257    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2258    int i;\
2259    for(i=0; i<w; i++)\
2260    {\
2261        const int srcB= src[-2*srcStride];\
2262        const int srcA= src[-1*srcStride];\
2263        const int src0= src[0 *srcStride];\
2264        const int src1= src[1 *srcStride];\
2265        const int src2= src[2 *srcStride];\
2266        const int src3= src[3 *srcStride];\
2267        const int src4= src[4 *srcStride];\
2268        const int src5= src[5 *srcStride];\
2269        const int src6= src[6 *srcStride];\
2270        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2271        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2272        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2273        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2274        dst++;\
2275        src++;\
2276    }\
2277}\
2278\
2279static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2280    const int h=4;\
2281    const int w=4;\
2282    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2283    int i;\
2284    src -= 2*srcStride;\
2285    for(i=0; i<h+5; i++)\
2286    {\
2287        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2288        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2289        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2290        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2291        tmp+=tmpStride;\
2292        src+=srcStride;\
2293    }\
2294    tmp -= tmpStride*(h+5-2);\
2295    for(i=0; i<w; i++)\
2296    {\
2297        const int tmpB= tmp[-2*tmpStride];\
2298        const int tmpA= tmp[-1*tmpStride];\
2299        const int tmp0= tmp[0 *tmpStride];\
2300        const int tmp1= tmp[1 *tmpStride];\
2301        const int tmp2= tmp[2 *tmpStride];\
2302        const int tmp3= tmp[3 *tmpStride];\
2303        const int tmp4= tmp[4 *tmpStride];\
2304        const int tmp5= tmp[5 *tmpStride];\
2305        const int tmp6= tmp[6 *tmpStride];\
2306        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2307        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2308        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2309        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2310        dst++;\
2311        tmp++;\
2312    }\
2313}\
2314\
2315static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2316    const int h=8;\
2317    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2318    int i;\
2319    for(i=0; i<h; i++)\
2320    {\
2321        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2322        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2323        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2324        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2325        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2326        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2327        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2328        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2329        dst+=dstStride;\
2330        src+=srcStride;\
2331    }\
2332}\
2333\
2334static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2335    const int w=8;\
2336    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2337    int i;\
2338    for(i=0; i<w; i++)\
2339    {\
2340        const int srcB= src[-2*srcStride];\
2341        const int srcA= src[-1*srcStride];\
2342        const int src0= src[0 *srcStride];\
2343        const int src1= src[1 *srcStride];\
2344        const int src2= src[2 *srcStride];\
2345        const int src3= src[3 *srcStride];\
2346        const int src4= src[4 *srcStride];\
2347        const int src5= src[5 *srcStride];\
2348        const int src6= src[6 *srcStride];\
2349        const int src7= src[7 *srcStride];\
2350        const int src8= src[8 *srcStride];\
2351        const int src9= src[9 *srcStride];\
2352        const int src10=src[10*srcStride];\
2353        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2354        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2355        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2356        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2357        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2358        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2359        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2360        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2361        dst++;\
2362        src++;\
2363    }\
2364}\
2365\
2366static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2367    const int h=8;\
2368    const int w=8;\
2369    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2370    int i;\
2371    src -= 2*srcStride;\
2372    for(i=0; i<h+5; i++)\
2373    {\
2374        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2375        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2376        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2377        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2378        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2379        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2380        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2381        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2382        tmp+=tmpStride;\
2383        src+=srcStride;\
2384    }\
2385    tmp -= tmpStride*(h+5-2);\
2386    for(i=0; i<w; i++)\
2387    {\
2388        const int tmpB= tmp[-2*tmpStride];\
2389        const int tmpA= tmp[-1*tmpStride];\
2390        const int tmp0= tmp[0 *tmpStride];\
2391        const int tmp1= tmp[1 *tmpStride];\
2392        const int tmp2= tmp[2 *tmpStride];\
2393        const int tmp3= tmp[3 *tmpStride];\
2394        const int tmp4= tmp[4 *tmpStride];\
2395        const int tmp5= tmp[5 *tmpStride];\
2396        const int tmp6= tmp[6 *tmpStride];\
2397        const int tmp7= tmp[7 *tmpStride];\
2398        const int tmp8= tmp[8 *tmpStride];\
2399        const int tmp9= tmp[9 *tmpStride];\
2400        const int tmp10=tmp[10*tmpStride];\
2401        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2402        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2403        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2404        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2405        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2406        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2407        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2408        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2409        dst++;\
2410        tmp++;\
2411    }\
2412}\
2413\
2414static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2415    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2416    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2417    src += 8*srcStride;\
2418    dst += 8*dstStride;\
2419    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2420    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2421}\
2422\
2423static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2424    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2425    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2426    src += 8*srcStride;\
2427    dst += 8*dstStride;\
2428    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2429    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2430}\
2431\
2432static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2433    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2434    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2435    src += 8*srcStride;\
2436    dst += 8*dstStride;\
2437    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2438    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2439}\
2440
/**
 * Generate the 16 functions OPNAME##h264_qpel##SIZE##_mcXY_c (X, Y in 0..3)
 * for a SIZE x SIZE block.
 *
 * Reading the bodies below, X selects the horizontal and Y the vertical
 * treatment of the source:
 *  - mc00 forwards to OPNAME##pixels##SIZE##_c;
 *  - mc20, mc02 and mc22 store the h, v and hv lowpass result directly;
 *  - every other position combines two planes with OPNAME##pixels##SIZE##_l2
 *    (defined elsewhere; presumably an average of its two inputs - confirm),
 *    the planes being chosen among the source pixels (optionally shifted by
 *    one column or one row) and the h / v / hv lowpass outputs.
 *
 * Intermediate planes are always produced with the put_ lowpass variants into
 * stack buffers of stride SIZE; OPNAME is only applied on the final store to
 * dst. Vertical filtering operates on "full", a SIZE wide copy of SIZE+5
 * source rows starting two rows above src, with full_mid pointing at the row
 * that corresponds to src. The hv filter uses an int16_t scratch plane "tmp"
 * of SIZE*(SIZE+5) entries.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2577
/*
 * Store operators handed to H264_LOWPASS. Each one indexes a table through a
 * pointer named cm, which must exist in the scope where the macro expands.
 *   op_put  / op_avg  : the filter sum b is rounded and scaled as (b + 16) >> 5.
 *   op2_put / op2_avg : rounded and scaled as (b + 512) >> 10; these are the
 *                       OP2 operators used by the hv_lowpass functions on the
 *                       int16_t intermediate plane.
 * The avg forms additionally average the looked-up value with the value
 * already stored in a, rounding up.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the put_ and avg_ lowpass filters, then the per-size mcXY
 * functions. Note that no avg_ set is generated for SIZE 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operators are only meant for the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2598#endif
2599
2600static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2601    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2602    int i;
2603
2604    for(i=0; i<h; i++){
2605        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2606        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2607        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2608        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2609        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2610        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2611        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2612        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2613        dst+=dstStride;
2614        src+=srcStride;
2615    }
2616}
2617
2618#if CONFIG_CAVS_DECODER
2619/* AVS specific */
/** CAVS mc00 for an 8x8 block: forwards to put_pixels8_c() with 8 rows. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_c(dst, src, stride, ROWS);
}
/** CAVS mc00 for an 8x8 block: forwards to avg_pixels8_c() with 8 rows. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    avg_pixels8_c(dst, src, stride, ROWS);
}
/** CAVS mc00 for a 16x16 block: forwards to put_pixels16_c() with 16 rows. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    put_pixels16_c(dst, src, stride, ROWS);
}
/** CAVS mc00 for a 16x16 block: forwards to avg_pixels16_c() with 16 rows. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    avg_pixels16_c(dst, src, stride, ROWS);
}
2632#endif /* CONFIG_CAVS_DECODER */
2633
2634#if CONFIG_VC1_DECODER
2635/* VC-1 specific */
/**
 * VC-1 mc00 for an 8x8 block: forwards to put_pixels8_c() with 8 rows.
 * The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    enum { ROWS = 8 };

    put_pixels8_c(dst, src, stride, ROWS);
}
/**
 * VC-1 mc00 for an 8x8 block: forwards to avg_pixels8_c() with 8 rows.
 * The rnd argument is accepted but not used here.
 */
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd)
{
    enum { ROWS = 8 };

    avg_pixels8_c(dst, src, stride, ROWS);
}
2642#endif /* CONFIG_VC1_DECODER */
2643
2644#if CONFIG_RV40_DECODER
/** RV40 mc33 for a 16x16 block: forwards to put_pixels16_xy2_c() with 16 rows. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    put_pixels16_xy2_c(dst, src, stride, ROWS);
}
/** RV40 mc33 for a 16x16 block: forwards to avg_pixels16_xy2_c() with 16 rows. */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    avg_pixels16_xy2_c(dst, src, stride, ROWS);
}
/** RV40 mc33 for an 8x8 block: forwards to put_pixels8_xy2_c() with 8 rows. */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_xy2_c(dst, src, stride, ROWS);
}
/** RV40 mc33 for an 8x8 block: forwards to avg_pixels8_xy2_c() with 8 rows. */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    avg_pixels8_xy2_c(dst, src, stride, ROWS);
}
2657#endif /* CONFIG_RV40_DECODER */
2658
2659static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2660    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2661    int i;
2662
2663    for(i=0; i<w; i++){
2664        const int src_1= src[ -srcStride];
2665        const int src0 = src[0          ];
2666        const int src1 = src[  srcStride];
2667        const int src2 = src[2*srcStride];
2668        const int src3 = src[3*srcStride];
2669        const int src4 = src[4*srcStride];
2670        const int src5 = src[5*srcStride];
2671        const int src6 = src[6*srcStride];
2672        const int src7 = src[7*srcStride];
2673        const int src8 = src[8*srcStride];
2674        const int src9 = src[9*srcStride];
2675        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2676        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2677        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2678        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2679        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2680        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2681        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2682        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2683        src++;
2684        dst++;
2685    }
2686}
2687
/** mspel mc00 for an 8x8 block: forwards to put_pixels8_c() with 8 rows. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_c(dst, src, stride, ROWS);
}
2691
/**
 * mspel mc10 for an 8x8 block: the horizontally filtered block is combined
 * with the unshifted source through put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2697
/** mspel mc20 for an 8x8 block: horizontal filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, ROWS);
}
2701
/**
 * mspel mc30 for an 8x8 block: the horizontally filtered block is combined
 * with the source shifted one pixel to the right through put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, filtered, stride, stride, 8, 8);
}
2707
/** mspel mc02 for an 8x8 block: vertical filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { COLS = 8 };

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, COLS);
}
2711
/**
 * mspel mc12 for an 8x8 block: combines, through put_pixels8_l2(), the
 * vertically filtered source with the block filtered in both directions.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8];     /* h-filtered rows, starting one row above src */
    uint8_t only_v[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_v_lowpass(only_v, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    /* skip the extra top row so the vertical filter is centred on src */
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, only_v, both, stride, 8, 8, 8);
}
/**
 * mspel mc32 for an 8x8 block: like mc12, but the vertically filtered plane
 * is taken from the source shifted one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8];     /* h-filtered rows, starting one row above src */
    uint8_t only_v[8 * 8];
    uint8_t both[8 * 8];

    wmv2_mspel8_v_lowpass(only_v, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    /* skip the extra top row so the vertical filter is centred on src */
    wmv2_mspel8_v_lowpass(both, rows_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, only_v, both, stride, 8, 8, 8);
}
/**
 * mspel mc22 for an 8x8 block: horizontal filter into an 11 row scratch
 * buffer, then vertical filter from its second row into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11 * 8];     /* h-filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h + 8, stride, 8, 8);
}
2735
2736static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2737    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2738    int x;
2739    const int strength= ff_h263_loop_filter_strength[qscale];
2740
2741    for(x=0; x<8; x++){
2742        int d1, d2, ad1;
2743        int p0= src[x-2*stride];
2744        int p1= src[x-1*stride];
2745        int p2= src[x+0*stride];
2746        int p3= src[x+1*stride];
2747        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2748
2749        if     (d<-2*strength) d1= 0;
2750        else if(d<-  strength) d1=-2*strength - d;
2751        else if(d<   strength) d1= d;
2752        else if(d< 2*strength) d1= 2*strength - d;
2753        else                   d1= 0;
2754
2755        p1 += d1;
2756        p2 -= d1;
2757        if(p1&256) p1= ~(p1>>31);
2758        if(p2&256) p2= ~(p2>>31);
2759
2760        src[x-1*stride] = p1;
2761        src[x+0*stride] = p2;
2762
2763        ad1= FFABS(d1)>>1;
2764
2765        d2= av_clip((p0-p3)/4, -ad1, ad1);
2766
2767        src[x-2*stride] = p0 - d2;
2768        src[x+  stride] = p3 + d2;
2769    }
2770    }
2771}
2772
2773static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2774    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2775    int y;
2776    const int strength= ff_h263_loop_filter_strength[qscale];
2777
2778    for(y=0; y<8; y++){
2779        int d1, d2, ad1;
2780        int p0= src[y*stride-2];
2781        int p1= src[y*stride-1];
2782        int p2= src[y*stride+0];
2783        int p3= src[y*stride+1];
2784        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2785
2786        if     (d<-2*strength) d1= 0;
2787        else if(d<-  strength) d1=-2*strength - d;
2788        else if(d<   strength) d1= d;
2789        else if(d< 2*strength) d1= 2*strength - d;
2790        else                   d1= 0;
2791
2792        p1 += d1;
2793        p2 -= d1;
2794        if(p1&256) p1= ~(p1>>31);
2795        if(p2&256) p2= ~(p2>>31);
2796
2797        src[y*stride-1] = p1;
2798        src[y*stride+0] = p2;
2799
2800        ad1= FFABS(d1)>>1;
2801
2802        d2= av_clip((p0-p3)/4, -ad1, ad1);
2803
2804        src[y*stride-2] = p0 - d2;
2805        src[y*stride+1] = p3 + d2;
2806    }
2807    }
2808}
2809
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * The vertical pass stores unnormalised sums (weight 4) in a scratch array;
 * the first and last row are not filtered vertically and are just scaled by 4.
 * The horizontal pass then writes back to src: the first and last column are
 * only normalised by 4, all other columns are filtered and normalised by 16.
 * Both normalisations round to nearest.
 *
 * @param src    top-left sample of the 8x8 block, modified in place
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    /* vertical pass: src -> vert */
    for(row = 0; row < 8; row++){
        const uint8_t *line = src + row*stride;

        for(col = 0; col < 8; col++){
            if(row == 0 || row == 7)
                vert[row][col] = 4*line[col];
            else
                vert[row][col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass: vert -> src */
    for(row = 0; row < 8; row++){
        uint8_t   *line = src + row*stride;
        const int *v    = vert[row];

        line[0] = (v[0] + 2) >> 2;
        line[7] = (v[7] + 2) >> 2;
        for(col = 1; col < 7; col++)
            line[col] = (v[col - 1] + 2*v[col] + v[col + 1] + 8) >> 4;
    }
}
2836
/**
 * Sum of absolute differences between two 16 pixel wide blocks of h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
    }
    return sad;
}
2864
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which each sample is avg2() of itself and its right neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 17 bytes per row are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
    }
    return sad;
}
2892
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which each sample is avg2() of itself and the sample one row below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        const uint8_t *below = pix2 + line_size;

        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
    }
    return sad;
}
2922
/**
 * Sum of absolute differences between a 16 pixel wide block and a reference
 * in which each sample is avg4() of the 2x2 neighbourhood whose top-left
 * corner is that sample.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 17 bytes per row and h+1 rows are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        const uint8_t *below = pix2 + line_size;

        for(x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
    }
    return sad;
}
2952
/**
 * Sum of absolute differences between two 8 pixel wide blocks of h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
    }
    return sad;
}
2972
/**
 * Sum of absolute differences between an 8 pixel wide block and a reference
 * in which each sample is avg2() of itself and its right neighbour.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 bytes per row are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
    }
    return sad;
}
2992
/**
 * Sum of absolute differences between an 8 pixel wide block and a reference
 * in which each sample is avg2() of itself and the sample one row below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        const uint8_t *below = pix2 + line_size;

        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
    }
    return sad;
}
3014
/**
 * Sum of absolute differences between an 8 pixel wide block and a reference
 * in which each sample is avg4() of the 2x2 neighbourhood whose top-left
 * corner is that sample.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 bytes per row and h+1 rows are read
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return          accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row = 0; row < h; row++, pix1 += line_size, pix2 += line_size){
        const uint8_t *below = pix2 + line_size;

        for(x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
    }
    return sad;
}
3036
3037static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3038    MpegEncContext *c = v;
3039    int score1=0;
3040    int score2=0;
3041    int x,y;
3042
3043    for(y=0; y<h; y++){
3044        for(x=0; x<16; x++){
3045            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3046        }
3047        if(y+1<h){
3048            for(x=0; x<15; x++){
3049                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3050                             - s1[x+1] + s1[x+1+stride])
3051                        -FFABS(  s2[x  ] - s2[x  +stride]
3052                             - s2[x+1] + s2[x+1+stride]);
3053            }
3054        }
3055        s1+= stride;
3056        s2+= stride;
3057    }
3058
3059    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3060    else  return score1 + FFABS(score2)*8;
3061}
3062
3063static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3064    MpegEncContext *c = v;
3065    int score1=0;
3066    int score2=0;
3067    int x,y;
3068
3069    for(y=0; y<h; y++){
3070        for(x=0; x<8; x++){
3071            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3072        }
3073        if(y+1<h){
3074            for(x=0; x<7; x++){
3075                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3076                             - s1[x+1] + s1[x+1+stride])
3077                        -FFABS(  s2[x  ] - s2[x  +stride]
3078                             - s2[x+1] + s2[x+1+stride]);
3079            }
3080        }
3081        s1+= stride;
3082        s2+= stride;
3083    }
3084
3085    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3086    else  return score1 + FFABS(score2)*8;
3087}
3088
3089static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3090    int i;
3091    unsigned int sum=0;
3092
3093    for(i=0; i<8*8; i++){
3094        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3095        int w= weight[i];
3096        b>>= RECON_SHIFT;
3097        assert(-512<b && b<512);
3098
3099        sum += (w*b)*(w*b)>>4;
3100    }
3101    return sum>>2;
3102}
3103
3104static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3105    int i;
3106
3107    for(i=0; i<8*8; i++){
3108        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3109    }
3110}
3111
3112/**
3113 * permutes an 8x8 block.
3114 * @param block the block which will be permuted according to the given permutation vector
3115 * @param permutation the permutation vector
3116 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3117 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3118 *                  (inverse) permutated to scantable order!
3119 */
3120void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3121{
3122    int i;
3123    DCTELEM temp[64];
3124
3125    if(last<=0) return;
3126    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3127
3128    for(i=0; i<=last; i++){
3129        const int j= scantable[i];
3130        temp[j]= block[j];
3131        block[j]=0;
3132    }
3133
3134    for(i=0; i<=last; i++){
3135        const int j= scantable[i];
3136        const int perm_j= permutation[j];
3137        block[perm_j]= temp[j];
3138    }
3139}
3140
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3144
3145void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3146    int i;
3147
3148    memset(cmp, 0, sizeof(void*)*6);
3149
3150    for(i=0; i<6; i++){
3151        switch(type&0xFF){
3152        case FF_CMP_SAD:
3153            cmp[i]= c->sad[i];
3154            break;
3155        case FF_CMP_SATD:
3156            cmp[i]= c->hadamard8_diff[i];
3157            break;
3158        case FF_CMP_SSE:
3159            cmp[i]= c->sse[i];
3160            break;
3161        case FF_CMP_DCT:
3162            cmp[i]= c->dct_sad[i];
3163            break;
3164        case FF_CMP_DCT264:
3165            cmp[i]= c->dct264_sad[i];
3166            break;
3167        case FF_CMP_DCTMAX:
3168            cmp[i]= c->dct_max[i];
3169            break;
3170        case FF_CMP_PSNR:
3171            cmp[i]= c->quant_psnr[i];
3172            break;
3173        case FF_CMP_BIT:
3174            cmp[i]= c->bit[i];
3175            break;
3176        case FF_CMP_RD:
3177            cmp[i]= c->rd[i];
3178            break;
3179        case FF_CMP_VSAD:
3180            cmp[i]= c->vsad[i];
3181            break;
3182        case FF_CMP_VSSE:
3183            cmp[i]= c->vsse[i];
3184            break;
3185        case FF_CMP_ZERO:
3186            cmp[i]= zero_cmp;
3187            break;
3188        case FF_CMP_NSSE:
3189            cmp[i]= c->nsse[i];
3190            break;
3191#if CONFIG_DWT
3192        case FF_CMP_W53:
3193            cmp[i]= c->w53[i];
3194            break;
3195        case FF_CMP_W97:
3196            cmp[i]= c->w97[i];
3197            break;
3198#endif
3199        default:
3200            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3201        }
3202    }
3203}
3204
3205static void clear_block_c(DCTELEM *block)
3206{
3207    memset(block, 0, sizeof(DCTELEM)*64);
3208}
3209
3210/**
3211 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3212 */
3213static void clear_blocks_c(DCTELEM *blocks)
3214{
3215    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3216}
3217
3218static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3219    long i;
3220    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3221        long a = *(long*)(src+i);
3222        long b = *(long*)(dst+i);
3223        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3224    }
3225    for(; i<w; i++)
3226        dst[i+0] += src[i+0];
3227}
3228
3229static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3230    long i;
3231    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3232        long a = *(long*)(src1+i);
3233        long b = *(long*)(src2+i);
3234        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3235    }
3236    for(; i<w; i++)
3237        dst[i] = src1[i]+src2[i];
3238}
3239
3240static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3241    long i;
3242#if !HAVE_FAST_UNALIGNED
3243    if((long)src2 & (sizeof(long)-1)){
3244        for(i=0; i+7<w; i+=8){
3245            dst[i+0] = src1[i+0]-src2[i+0];
3246            dst[i+1] = src1[i+1]-src2[i+1];
3247            dst[i+2] = src1[i+2]-src2[i+2];
3248            dst[i+3] = src1[i+3]-src2[i+3];
3249            dst[i+4] = src1[i+4]-src2[i+4];
3250            dst[i+5] = src1[i+5]-src2[i+5];
3251            dst[i+6] = src1[i+6]-src2[i+6];
3252            dst[i+7] = src1[i+7]-src2[i+7];
3253        }
3254    }else
3255#endif
3256    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3257        long a = *(long*)(src1+i);
3258        long b = *(long*)(src2+i);
3259        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3260    }
3261    for(; i<w; i++)
3262        dst[i+0] = src1[i+0]-src2[i+0];
3263}
3264
/**
 * Reconstruct a line from median-prediction residuals.
 *
 * Each output byte is the residual diff[i] plus mid_pred() of the previous
 * output, the byte src1[i], and (previous + src1[i] - previous src1 byte)
 * masked to 8 bits.
 *
 * @param dst      reconstructed line
 * @param src1     reference line used for prediction
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: initial left value, out: last reconstructed byte
 * @param left_top in: initial top-left value, out: last byte of src1 used
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int top = src1[pos];

        cur_left     = mid_pred(cur_left, top, (cur_left + top - cur_top_left) & 0xFF) + diff[pos];
        cur_top_left = top;
        dst[pos]     = cur_left;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
3281
/**
 * Compute median-prediction residuals for a line.
 *
 * Each output byte is src2[i] minus mid_pred() of the previous src2 byte,
 * the byte src1[i], and (previous + src1[i] - previous src1 byte) masked
 * to 8 bits.
 *
 * @param dst      residuals
 * @param src1     reference line used for prediction
 * @param src2     line to encode
 * @param w        number of bytes
 * @param left     in: initial left value, out: last byte of src2 used
 * @param left_top in: initial top-left value, out: last byte of src1 used
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int top  = src1[pos];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_top_left) & 0xFF);

        cur_top_left = top;
        cur_left     = src2[pos];
        dst[pos]     = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
3299
/**
 * Running byte sum: dst[i] receives the low 8 bits of acc plus all of
 * src[0..i].
 *
 * @param dst destination line
 * @param src residuals to accumulate
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the final accumulator (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
3318
/* byte offsets of the colour components inside one 32-bit pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-component running byte sum over w 4-byte pixels.
 *
 * @param dst   destination pixels
 * @param src   residual pixels
 * @param w     number of pixels
 * @param red   in: initial accumulator, out: final accumulator (likewise
 *              for green, blue and alpha); accumulators are not truncated
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3359
/* Out-of-place butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two statements; i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * Expands to a braced block declaring locals named a and b, so x and y
 * must not refer to variables with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of the absolute values of both butterfly outputs, |x+y| + |x-y|,
 * without storing them. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3374
/**
 * Sum of absolute values of the unnormalized 8x8 Hadamard transform of the
 * pixel difference src - dst.
 *
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each line of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: two in-place stages per column; the third stage is
     * folded into the absolute-value accumulation by BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    /* disabled debugging aid that printed the largest sum seen so far */
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3426
/**
 * Sum of absolute values of the unnormalized 8x8 Hadamard transform of the
 * pixels of src themselves, with the absolute DC term subtracted.
 *
 * @param s      unused
 * @param src    8x8 block to transform
 * @param dummy  unused
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each line of pixels */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: two in-place stages per column; the third stage is
     * folded into the absolute-value accumulation by BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0] + temp[32] is the first output of the final column-0
     * butterfly, i.e. the DC coefficient; remove its contribution */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3474
3475static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3476    MpegEncContext * const s= (MpegEncContext *)c;
3477    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3478
3479    assert(h==8);
3480
3481    s->dsp.diff_pixels(temp, src1, src2, stride);
3482    s->dsp.fdct(temp);
3483    return s->dsp.sum_abs_dctelem(temp);
3484}
3485
#if CONFIG_GPL
/* One 8-point integer transform pass built only from additions,
 * subtractions and shifts. The user must define SRC(x) to read input x
 * and DST(x,v) to consume output x with value v before expanding it.
 * Every SRC(x) is read before the first DST() is issued, so SRC and DST
 * may address the same storage. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute transform coefficients of the difference of two 8x8
 * blocks, using DCT8_1D on the rows and then on the columns.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param stride distance in bytes between consecutive lines
 * @param h      not examined by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform each column, accumulating |output| instead of
     * storing it */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3538
3539static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3540    MpegEncContext * const s= (MpegEncContext *)c;
3541    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3542    int sum=0, i;
3543
3544    assert(h==8);
3545
3546    s->dsp.diff_pixels(temp, src1, src2, stride);
3547    s->dsp.fdct(temp);
3548
3549    for(i=0; i<64; i++)
3550        sum= FFMAX(sum, FFABS(temp[i]));
3551
3552    return sum;
3553}
3554
3555static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3556    MpegEncContext * const s= (MpegEncContext *)c;
3557    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3558    DCTELEM * const bak = temp+64;
3559    int sum=0, i;
3560
3561    assert(h==8);
3562    s->mb_intra=0;
3563
3564    s->dsp.diff_pixels(temp, src1, src2, stride);
3565
3566    memcpy(bak, temp, 64*sizeof(DCTELEM));
3567
3568    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3569    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3570    ff_simple_idct(temp); //FIXME
3571
3572    for(i=0; i<64; i++)
3573        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3574
3575    return sum;
3576}
3577
/**
 * Combined distortion and bit-cost score for coding the difference of two
 * 8x8 blocks.
 *
 * The difference is quantized, its bit cost is estimated from the VLC
 * length tables in the context, and the block is reconstructed to measure
 * the squared error against src1.
 *
 * Side effect: overwrites block_last_index[0] in the context.
 *
 * @param c      MpegEncContext providing quantizer state, VLC length
 *               tables and dsp functions
 * @param src1   block to be coded
 * @param src2   prediction
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies with a line size of 8; lsrc2 is later modified
     * by idct_add */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantizer's overflow output here */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* choose the AC length tables; for intra blocks the DC coefficient is
     * costed separately and the AC scan starts at index 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one: (run, level) pairs */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* bias by 64 so that levels in [-64, 63] map to [0, 127]
                 * and can be looked up; anything else costs an escape */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        /* the coefficient at index 'last' must be non-zero */
        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize (only if there is any coefficient) and add
     * the inverse transform onto the prediction copy */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3653
/**
 * Estimated number of bits needed to code the quantized difference of two
 * 8x8 blocks, using the VLC length tables in the context.
 *
 * Side effect: overwrites block_last_index[0] in the context.
 *
 * @param c      MpegEncContext providing quantizer state, VLC length
 *               tables and dsp functions
 * @param stride distance in bytes between consecutive lines
 * @param h      must be 8 (asserted)
 * @return accumulated bit count
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's overflow output here */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* choose the AC length tables; for intra blocks the DC coefficient is
     * costed separately and the AC scan starts at index 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one: (run, level) pairs */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* bias by 64 so that levels in [-64, 63] map to [0, 127]
                 * and can be looked up; anything else costs an escape */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        /* the coefficient at index 'last' must be non-zero */
        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3712
/* Generates vsad_intra<size>_c(): the sum of absolute differences between
 * every pixel of a <size>-wide block and the pixel directly below it,
 * over lines 0..h-2. The inner loop handles four columns per iteration, so
 * size is expected to be a multiple of 4. The c and dummy arguments of the
 * generated function are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
/* instantiate vsad_intra8_c() and vsad_intra16_c() */
VSAD_INTRA(8)
VSAD_INTRA(16)
3730
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block: for each pixel on lines 0..h-2, the difference on
 * that line minus the difference on the line below.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(d);
        }
    }

    return total;
}
3745
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum of squared differences between
 * every pixel of a <size>-wide block and the pixel directly below it,
 * over lines 0..h-2. The inner loop handles four columns per iteration, so
 * size is expected to be a multiple of 4. The c and dummy arguments of the
 * generated function are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
/* instantiate vsse_intra8_c() and vsse_intra16_c() */
VSSE_INTRA(8)
VSSE_INTRA(16)
3764
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block: for each pixel on lines 0..h-2, the difference on
 * that line minus the difference on the line below, squared.
 *
 * @param c      unused
 * @param stride distance in bytes between consecutive lines
 * @param h      number of lines
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += d * d;
        }
    }

    return total;
}
3779
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }
    return total;
}
3788
/* Define the *16_c variants named in the second argument from the 8x8
 * functions named in the first.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this chunk; presumably it
 * calls the 8x8 function on each 8x8 quadrant of a 16-wide block and sums
 * the scores - confirm against its definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists in GPL builds */
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3799
/**
 * In-place element-wise product: dst[i] = dst[i] * src[i] for len elements.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;

    for (k = 0; k < len; k++) {
        dst[k] = dst[k] * src[k];
    }
}
3805
/**
 * Element-wise product of src0 with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i] for len elements.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        dst[k] = src0[k] * src1[len - 1 - k];
    }
}
3812
/**
 * Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for len
 * elements.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3818
/**
 * Windowed combination of two half-blocks into 2*len output samples.
 *
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k] + add_bias
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m] + add_bias
 *
 * @param dst      output, 2*len elements
 * @param src0     first input, len elements
 * @param src1     second input, len elements (read backwards)
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half of the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int lo = k;               /* walks up from the start */
        const int hi = 2*len - 1 - k;   /* walks down from the end */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[lo];
        const float wj = win[hi];

        dst[lo] = s0*wj - s1*wi + add_bias;
        dst[hi] = s0*wi + s1*wj + add_bias;
    }
}
3833
/**
 * Scale a vector: dst[i] = src[i] * mul for len elements.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3841
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Output pair n (elements 2n and 2n+1) is src times the two entries of
 * sv[n], times mul. Elements are processed two at a time, so an odd len
 * is effectively rounded up.
 *
 * @param sv array of pointers to 2-element vectors, one per output pair
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *pair = *sv++;

        dst[pos    ] = src[pos    ] * pair[0] * mul;
        dst[pos + 1] = src[pos + 1] * pair[1] * mul;
    }
}
3851
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Output quad n (elements 4n..4n+3) is src times the four entries of
 * sv[n], times mul. Elements are processed four at a time, so a len that
 * is not a multiple of 4 is effectively rounded up.
 *
 * @param sv array of pointers to 4-element vectors, one per output quad
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *quad = *sv++;

        dst[pos    ] = src[pos    ] * quad[0] * mul;
        dst[pos + 1] = src[pos + 1] * quad[1] * mul;
        dst[pos + 2] = src[pos + 2] * quad[2] * mul;
        dst[pos + 3] = src[pos + 3] * quad[3] * mul;
    }
}
3863
/**
 * Expand a sequence of 2-element vectors scaled by mul into dst.
 *
 * Output pair n (elements 2n and 2n+1) is the two entries of sv[n] times
 * mul. Elements are written two at a time, so an odd len is effectively
 * rounded up.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 2) {
        const float *pair = *sv++;

        dst[pos    ] = pair[0] * mul;
        dst[pos + 1] = pair[1] * mul;
    }
}
3873
/**
 * Expand a sequence of 4-element vectors scaled by mul into dst.
 *
 * Output quad n (elements 4n..4n+3) is the four entries of sv[n] times
 * mul. Elements are written four at a time, so a len that is not a
 * multiple of 4 is effectively rounded up.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int pos;

    for (pos = 0; pos < len; pos += 4) {
        const float *quad = *sv++;

        dst[pos    ] = quad[0] * mul;
        dst[pos + 1] = quad[1] * mul;
        dst[pos + 2] = quad[2] * mul;
        dst[pos + 3] = quad[3] * mul;
    }
}
3885
/**
 * In-place butterfly on two vectors: v1[i] becomes v1[i] + v2[i] and
 * v2[i] becomes v1[i] - v2[i] (using the old values) for len elements.
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
3896
/**
 * Dot product of two float vectors of len elements, accumulated in order
 * in single precision.
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;

    for (; len > 0; len--)
        acc += *v1++ * *v2++;

    return acc;
}
3907
/**
 * Convert integers to float and scale them: dst[i] = src[i] * mul for len
 * elements.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3913
/**
 * Clip one value using unsigned comparisons on 32-bit patterns.
 *
 * @param a        value to clip
 * @param mini     lower bound
 * @param maxi     upper bound
 * @param maxisign maxi with bit 31 flipped
 * @return mini if a > mini (unsigned), else maxi if a with bit 31 flipped
 *         exceeds maxisign, else a
 *
 * NOTE(review): the arguments are presumably IEEE-754 binary32 bit
 * patterns with a negative minimum and a positive maximum, as the caller
 * in this file passes them - confirm before reusing elsewhere.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    /* 1U: shifting a signed 1 into the sign bit of a 32-bit int is
     * undefined behaviour */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
3922
/**
 * Clip a float vector to [*min, *max] by comparing the raw 32-bit
 * patterns with clipf_c_one().
 *
 * Elements are processed in groups of 8, so a len that is not a multiple
 * of 8 causes reads and writes beyond len elements.
 *
 * NOTE(review): relies on float being a 32-bit IEEE-754 type and on
 * accessing float storage through uint32_t pointers - confirm this is
 * acceptable for the target compilers.
 *
 * @param dst destination
 * @param src source
 * @param min pointer to the lower bound (the caller in this file only
 *            takes this path when it is negative)
 * @param max pointer to the upper bound (positive on that same path)
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U: shifting a signed 1 into the sign bit of a 32-bit int is
     * undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min is negative and max is positive the integer-comparison variant
 * is used; otherwise av_clipf() is applied per element. Elements are
 * processed in groups of 8, so a len that is not a multiple of 8 causes
 * reads and writes beyond len elements.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3958
/**
 * Convert one float to a 16-bit sample using only integer operations on
 * its 32-bit pattern.
 *
 * If any of bits 16-19 of the pattern is set, the pattern is replaced by
 * the sign of (0x43c0ffff - pattern) spread by the right shift; finally
 * 0x8000 is subtracted.
 *
 * NOTE(review): 0x43c08000 is the pattern of 385.0f, so this presumably
 * expects inputs pre-biased to 385 + sample/32768, with the low 16 bits
 * holding the offset sample and the branch handling out-of-range values -
 * confirm against the callers.
 */
static av_always_inline int float_to_int16_one(const float *src){
    /* reinterpret the float's storage as a signed 32-bit integer */
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3969
/**
 * Convert len floats to 16-bit samples with float_to_int16_one().
 *
 * @param dst destination samples
 * @param src source floats
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int n;

    for (n = 0; n < len; n++) {
        dst[n] = float_to_int16_one(&src[n]);
    }
}
3975
/**
 * Convert planar float channels to interleaved 16-bit samples with
 * float_to_int16_one().
 *
 * Sample i of channel c is written to dst[i*channels + c]. Two channels
 * are handled by a dedicated loop; any other count is converted one
 * channel at a time.
 *
 * @param dst      interleaved output, len*channels samples
 * @param src      array of per-channel input pointers
 * @param len      number of samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int n, ch;

    if (channels == 2) {
        for (n = 0; n < len; n++) {
            dst[2*n    ] = float_to_int16_one(&src[0][n]);
            dst[2*n + 1] = float_to_int16_one(&src[1][n]);
        }
        return;
    }

    for (ch = 0; ch < channels; ch++) {
        for (n = 0; n < len; n++)
            dst[n*channels + ch] = float_to_int16_one(&src[ch][n]);
    }
}
3989
/**
 * Dot product of two int16_t vectors where every individual product is
 * shifted right by 'shift' before being accumulated.
 *
 * @param order number of elements
 * @param shift right shift applied to each product
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
3999
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For every element the product v1[i]*v2[i] is accumulated using the old
 * v1[i], then v1[i] is increased by mul*v3[i].
 *
 * @param order number of elements
 * @param mul   multiplier for v3
 * @return the accumulated dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int acc = 0;

    for (; order; order--, v1++, v2++, v3++) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
    }
    return acc;
}
4009
/* Fixed-point cosine table for the WMV2 IDCT: Wk = 2048*sqrt(2)*cos(k*pi/16) */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT, done in place on the 8 consecutive
 * coefficients b[0..7]. Every output is rounded with 1<<7 and shifted
 * right by 8.
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: rotations of the coefficient pairs (1,7), (5,3), (2,6), (0,4) */
    const int r1 = W1*b[1] + W7*b[7];
    const int r7 = W7*b[1] - W1*b[7];
    const int r5 = W5*b[5] + W3*b[3];
    const int r3 = W3*b[5] - W5*b[3];
    const int r2 = W2*b[2] + W6*b[6];
    const int r6 = W6*b[2] - W2*b[6];
    const int r0 = W0*b[0] + W0*b[4];
    const int r4 = W0*b[0] - W0*b[4];
    /* step 2: the two odd terms that need the extra 181/256 scaling */
    const int s1 = (181*(r1-r5+r7-r3)+128)>>8;
    const int s2 = (181*(r1-r5-r7+r3)+128)>>8;
    /* step 3: even/odd butterflies; output k and output 7-k share a pair */
    const int e0 = r0 + r2, e1 = r4 + r6, e2 = r4 - r6, e3 = r0 - r2;
    const int o0 = r1 + r5, o3 = r7 + r3;
    const int rnd = 1<<7;

    b[0] = (e0 + o0 + rnd)>>8;
    b[7] = (e0 - o0 + rnd)>>8;
    b[1] = (e1 + s1 + rnd)>>8;
    b[6] = (e1 - s1 + rnd)>>8;
    b[2] = (e2 + s2 + rnd)>>8;
    b[5] = (e2 - s2 + rnd)>>8;
    b[3] = (e3 + o3 + rnd)>>8;
    b[4] = (e3 - o3 + rnd)>>8;
}
4045static void wmv2_idct_col(short * b)
4046{
4047    int s1,s2;
4048    int a0,a1,a2,a3,a4,a5,a6,a7;
4049    /*step 1, with extended precision*/
4050    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4051    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4052    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4053    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4054    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4055    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4056    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4057    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4058    /*step 2*/
4059    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4060    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4061    /*step 3*/
4062    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4063    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4064    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4065    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4066
4067    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4068    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4069    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4070    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4071}
/**
 * In-place WMV2 inverse DCT of a block of 64 coefficients: the row pass is
 * applied to each group of 8 consecutive coefficients, then the column pass
 * to each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    short *p;

    for (p = block; p != block + 64; p += 8)
        wmv2_idct_row(p);

    for (p = block; p != block + 8; p++)
        wmv2_idct_col(p);
}
/* XXX: these functions should be removed as soon as all IDCTs are
 converted */
/**
 * "put" variant of the WMV2 IDCT: transforms block in place with
 * ff_wmv2_idct_c(), then hands the result to put_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * "add" variant of the WMV2 IDCT: transforms block in place with
 * ff_wmv2_idct_c(), then hands the result to add_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * "put" wrapper around j_rev_dct(): runs the transform on block, then passes
 * the same block buffer to put_pixels_clamped_c() with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * "add" wrapper around j_rev_dct(): runs the transform on block, then passes
 * the same block buffer to add_pixels_clamped_c() with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4104
/**
 * "put" wrapper around j_rev_dct4(): runs the transform on block, then passes
 * the same block buffer to put_pixels_clamped4_c() with dest and line_size.
 * Installed by dsputil_init() for lowres==1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * "add" wrapper around j_rev_dct4(): runs the transform on block, then passes
 * the same block buffer to add_pixels_clamped4_c() with dest and line_size.
 * Installed by dsputil_init() for lowres==1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4115
/**
 * "put" wrapper around j_rev_dct2(): runs the transform on block, then passes
 * the same block buffer to put_pixels_clamped2_c() with dest and line_size.
 * Installed by dsputil_init() for lowres==2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * "add" wrapper around j_rev_dct2(): runs the transform on block, then passes
 * the same block buffer to add_pixels_clamped2_c() with dest and line_size.
 * Installed by dsputil_init() for lowres==2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4126
4127static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4128{
4129    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4130
4131    dest[0] = cm[(block[0] + 4)>>3];
4132}
4133static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4134{
4135    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4136
4137    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4138}
4139
4140static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4141
4142/* init static data */
4143av_cold void dsputil_static_init(void)
4144{
4145    int i;
4146
4147    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4148    for(i=0;i<MAX_NEG_CROP;i++) {
4149        ff_cropTbl[i] = 0;
4150        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4151    }
4152
4153    for(i=0;i<512;i++) {
4154        ff_squareTbl[i] = (i - 256) * (i - 256);
4155    }
4156
4157    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4158}
4159
4160int ff_check_alignment(void){
4161    static int did_fail=0;
4162    DECLARE_ALIGNED(16, int, aligned);
4163
4164    if((intptr_t)&aligned & 15){
4165        if(!did_fail){
4166#if HAVE_MMX || HAVE_ALTIVEC
4167            av_log(NULL, AV_LOG_ERROR,
4168                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4169                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4170                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4171                "Do not report crashes to FFmpeg developers.\n");
4172#endif
4173            did_fail=1;
4174        }
4175        return -1;
4176    }
4177    return 0;
4178}
4179
/**
 * Fill a DSPContext with the C implementations of the DSP routines, then
 * call the enabled architecture-specific initializers on the same context.
 *
 * The forward DCT (encoder builds only) is chosen from avctx->dct_algo and
 * the IDCT from avctx->lowres and avctx->idct_algo. The IDCT choice also sets
 * c->idct_permutation_type, from which c->idct_permutation[] is built at the
 * very end, after the architecture-specific initializers have been called.
 *
 * @param c     context to fill
 * @param avctx codec context supplying dct_algo, idct_algo and lowres; it is
 *              also passed on to the per-codec and per-architecture
 *              initializers
 */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* return value ignored: called for its one-time diagnostic only */
    ff_check_alignment();

    /* forward DCT selection, encoder builds only */
#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres 1..3 pick the j_rev_dct4/2/1 family
     * regardless of idct_algo (except the H.264 lowres put/add pair below);
     * otherwise idct_algo decides */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): only idct_put is assigned in this branch; idct
             * and idct_add keep whatever they held on entry -- presumably
             * unused by the EA decoders, confirm */
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* basic pixel/block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* dspfunc(PFX, IDX, NUM) fills row IDX of c->PFX_pixels_tab with the
     * four PFX_pixelsNUM{,_x2,_y2,_xy2}_c functions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: the index is x + 4*y for the mcXY function, so entries
     * 3 and 7 are not assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc is redefined for the qpel tables: it fills the 16 entries of
     * row IDX of c->PFX_pixels_tab with PFXNUM_mcXY_c at index x + 4*y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    /* note: put_h264_qpel gets rows 0..3, avg_h264_qpel only rows 0..2 */
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    c->draw_edges = draw_edges_c;

    /* per-codec DSP initializers, compiled in only for enabled decoders */
#if CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_VC1_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    /* the mc33 entries are assigned here, after ff_rv40dsp_init() */
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) sets c->name[0] to name16_c and c->name[1] to
     * name8x8_c; the macro already ends in ';', hence no ';' at the uses */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* byte-wise helpers and lossless prediction */
    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    /* loop filters; the conditions are build-time constants */
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }
    if (CONFIG_VP6_DECODER) {
        c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* audio and float vector helpers */
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#if CONFIG_LPC
    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* default prefetch is a no-op */
    c->prefetch= just_return;

    /* clear the 2-tap qpel tables so that entries still NULL after the
     * architecture-specific initializers can be detected and filled below */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific initializers, called after all C defaults are in
     * place; each condition is a build-time constant */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* 2-tap qpel entries that are still NULL fall back to the H.264 qpel
     * function at the same position.
     * NOTE(review): [0][i] is indexed with i up to 63, i.e. the tables are
     * walked as one flat run of 64 entries -- presumably they are 4 x 16,
     * confirm against the declarations. The C code above fills only rows
     * 0..2 of avg_h264_qpel_pixels_tab, so entries 48..63 of the avg fallback
     * copy whatever that table's last row holds. */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the selected IDCT; i is the
     * position inside the 8x8 block, bits 3..5 and bits 0..2 being the two
     * coordinates */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* bits 3..5 kept; the low three bits are rotated right by one */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swap the low and high three bits: full transpose */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        /* bits 2 and 5 kept; bits 0..1 and bits 3..4 are exchanged */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        /* bits 3..5 kept; the low three bits are remapped through a table */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        /* unknown type: only logged, idct_permutation[] is left untouched */
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4566
4567