1/*
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22/**
23 * @file
24 * H.264 / AVC / MPEG4 part10 prediction functions.
25 * @author Michael Niedermayer <michaelni@gmx.at>
26 */
27
28#include "libavutil/intreadwrite.h"
29
30#include "mathops.h"
31
32#include "bit_depth_template.c"
33
34static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
35                                    ptrdiff_t _stride)
36{
37    pixel *src = (pixel*)_src;
38    int stride = _stride>>(sizeof(pixel)-1);
39    const pixel4 a= AV_RN4PA(src-stride);
40
41    AV_WN4PA(src+0*stride, a);
42    AV_WN4PA(src+1*stride, a);
43    AV_WN4PA(src+2*stride, a);
44    AV_WN4PA(src+3*stride, a);
45}
46
47static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
48                                      ptrdiff_t _stride)
49{
50    pixel *src = (pixel*)_src;
51    int stride = _stride>>(sizeof(pixel)-1);
52    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
53    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
54    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
55    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
56}
57
58static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
59                              ptrdiff_t _stride)
60{
61    pixel *src = (pixel*)_src;
62    int stride = _stride>>(sizeof(pixel)-1);
63    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
64                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
65    const pixel4 a = PIXEL_SPLAT_X4(dc);
66
67    AV_WN4PA(src+0*stride, a);
68    AV_WN4PA(src+1*stride, a);
69    AV_WN4PA(src+2*stride, a);
70    AV_WN4PA(src+3*stride, a);
71}
72
73static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
74                                   ptrdiff_t _stride)
75{
76    pixel *src = (pixel*)_src;
77    int stride = _stride>>(sizeof(pixel)-1);
78    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
79    const pixel4 a = PIXEL_SPLAT_X4(dc);
80
81    AV_WN4PA(src+0*stride, a);
82    AV_WN4PA(src+1*stride, a);
83    AV_WN4PA(src+2*stride, a);
84    AV_WN4PA(src+3*stride, a);
85}
86
87static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
88                                  ptrdiff_t _stride)
89{
90    pixel *src = (pixel*)_src;
91    int stride = _stride>>(sizeof(pixel)-1);
92    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
93    const pixel4 a = PIXEL_SPLAT_X4(dc);
94
95    AV_WN4PA(src+0*stride, a);
96    AV_WN4PA(src+1*stride, a);
97    AV_WN4PA(src+2*stride, a);
98    AV_WN4PA(src+3*stride, a);
99}
100
101static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
102                                  ptrdiff_t _stride)
103{
104    pixel *src = (pixel*)_src;
105    int stride = _stride>>(sizeof(pixel)-1);
106    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
107
108    AV_WN4PA(src+0*stride, a);
109    AV_WN4PA(src+1*stride, a);
110    AV_WN4PA(src+2*stride, a);
111    AV_WN4PA(src+3*stride, a);
112}
113
114static void FUNCC(pred4x4_127_dc)(uint8_t *_src, const uint8_t *topright,
115                                  ptrdiff_t _stride)
116{
117    pixel *src = (pixel*)_src;
118    int stride = _stride>>(sizeof(pixel)-1);
119    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))-1);
120
121    AV_WN4PA(src+0*stride, a);
122    AV_WN4PA(src+1*stride, a);
123    AV_WN4PA(src+2*stride, a);
124    AV_WN4PA(src+3*stride, a);
125}
126
127static void FUNCC(pred4x4_129_dc)(uint8_t *_src, const uint8_t *topright,
128                                  ptrdiff_t _stride)
129{
130    pixel *src = (pixel*)_src;
131    int stride = _stride>>(sizeof(pixel)-1);
132    const pixel4 a = PIXEL_SPLAT_X4((1<<(BIT_DEPTH-1))+1);
133
134    AV_WN4PA(src+0*stride, a);
135    AV_WN4PA(src+1*stride, a);
136    AV_WN4PA(src+2*stride, a);
137    AV_WN4PA(src+3*stride, a);
138}
139
140
/* Helpers that load neighbouring decoded samples into local variables.
 * av_unused silences warnings when a predictor does not use all of them.
 * Note: each macro's final backslash splices the following blank line
 * into the expansion, so keep those blank lines intact. */
#define LOAD_TOP_RIGHT_EDGE\
    const unsigned av_unused t4 = topright[0];\
    const unsigned av_unused t5 = topright[1];\
    const unsigned av_unused t6 = topright[2];\
    const unsigned av_unused t7 = topright[3];\

/* Left-column samples of the rows below the 4x4 block (rows 4..7). */
#define LOAD_DOWN_LEFT_EDGE\
    const unsigned av_unused l4 = src[-1+4*stride];\
    const unsigned av_unused l5 = src[-1+5*stride];\
    const unsigned av_unused l6 = src[-1+6*stride];\
    const unsigned av_unused l7 = src[-1+7*stride];\

/* Left-column samples adjacent to the block (rows 0..3). */
#define LOAD_LEFT_EDGE\
    const unsigned av_unused l0 = src[-1+0*stride];\
    const unsigned av_unused l1 = src[-1+1*stride];\
    const unsigned av_unused l2 = src[-1+2*stride];\
    const unsigned av_unused l3 = src[-1+3*stride];\

/* Top-row samples directly above the block (columns 0..3). */
#define LOAD_TOP_EDGE\
    const unsigned av_unused t0 = src[ 0-1*stride];\
    const unsigned av_unused t1 = src[ 1-1*stride];\
    const unsigned av_unused t2 = src[ 2-1*stride];\
    const unsigned av_unused t3 = src[ 3-1*stride];\

164
/*
 * Diagonal down-right 4x4 prediction: each down-right diagonal is set
 * to one [1 2 1]/4 filtered value taken from the left column, the
 * top-left corner and the top row.
 */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                      ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    const int lt= src[-1-1*stride];          /* top-left corner sample */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    /* Chained assignments fill one diagonal per filtered value. */
    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}
191
/*
 * Diagonal down-left 4x4 prediction: each down-left diagonal is set to
 * one [1 2 1]/4 filtered value taken from the top and top-right rows.
 * The bottom-right corner uses a (t6 + 3*t7) tap as there is no t8.
 */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
                                     ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
//    LOAD_LEFT_EDGE

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
}
219
/*
 * Vertical-right 4x4 prediction: even rows use 2-tap averages of the
 * top edge, odd rows use 3-tap [1 2 1]/4 filters; the left column of
 * rows 2-3 comes from the filtered left edge.
 */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                          const uint8_t *topright,
                                          ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    const int lt= src[-1-1*stride];          /* top-left corner sample */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}
247
/*
 * Vertical-left 4x4 prediction: even rows use 2-tap averages, odd rows
 * 3-tap [1 2 1]/4 filters, of the top and top-right edges.
 */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
                                         const uint8_t *_topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4+ 1)>>1;
    src[3+2*stride]=(t4 + t5+ 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}
275
/*
 * Horizontal-up 4x4 prediction: interpolates down the left edge with
 * alternating 2-tap and 3-tap filters; positions past the last left
 * sample are clamped to l3.
 */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
    /* Remaining positions extend beyond the left edge: replicate l3. */
    src[3+2*stride]=
    src[1+3*stride]=
    src[0+3*stride]=
    src[2+2*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;
}
300
/*
 * Horizontal-down 4x4 prediction: mixes 2-tap averages down the left
 * edge with 3-tap [1 2 1]/4 filters across the corner and top edge.
 */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                           const uint8_t *topright,
                                           ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    const int lt= src[-1-1*stride];          /* top-left corner sample */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2+ 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
328
329static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
330{
331    int i;
332    pixel *src = (pixel*)_src;
333    int stride = _stride>>(sizeof(pixel)-1);
334    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
335    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
336    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
337    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
338
339    for(i=0; i<16; i++){
340        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
341        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
342        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
343        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
344    }
345}
346
347static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
348{
349    int i;
350    pixel *src = (pixel*)_src;
351    stride >>= sizeof(pixel)-1;
352
353    for(i=0; i<16; i++){
354        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
355
356        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
357        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
358        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
359        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
360    }
361}
362
/* Write the packed value v to every position of a 16x16 block.
 * Uses the caller's i, src and stride; advances src past the block. */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
        src += stride;\
    }
371
372static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
373{
374    int i, dc=0;
375    pixel *src = (pixel*)_src;
376    pixel4 dcsplat;
377    stride >>= sizeof(pixel)-1;
378
379    for(i=0;i<16; i++){
380        dc+= src[-1+i*stride];
381    }
382
383    for(i=0;i<16; i++){
384        dc+= src[i-stride];
385    }
386
387    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
388    PREDICT_16x16_DC(dcsplat);
389}
390
391static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
392{
393    int i, dc=0;
394    pixel *src = (pixel*)_src;
395    pixel4 dcsplat;
396    stride >>= sizeof(pixel)-1;
397
398    for(i=0;i<16; i++){
399        dc+= src[-1+i*stride];
400    }
401
402    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
403    PREDICT_16x16_DC(dcsplat);
404}
405
406static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
407{
408    int i, dc=0;
409    pixel *src = (pixel*)_src;
410    pixel4 dcsplat;
411    stride >>= sizeof(pixel)-1;
412
413    for(i=0;i<16; i++){
414        dc+= src[i-stride];
415    }
416
417    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
418    PREDICT_16x16_DC(dcsplat);
419}
420
/* Generate pred16x16_{127,128,129}_dc: fill the block with constant v. */
#define PRED16x16_X(n, v) \
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
}

/* Mid-grey and mid-grey +/- 1 constant fills. */
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
433
/*
 * Plane (gradient) 16x16 prediction shared by several codecs.
 * H and V are weighted sums of sample differences across the top and
 * left edges; svq3/rv40 select alternative gradient scalings.
 */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
                                                 ptrdiff_t _stride,
                                                 const int svq3,
                                                 const int rv40)
{
  int i, j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);     /* byte stride -> pixel stride */
  const pixel * const src0 = src +7-stride;    /* centre of the top edge */
  const pixel *       src1 = src +8*stride-1;  /* walks down the left edge */
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
  int H = src0[1] - src0[-1];                  /* horizontal gradient accum. */
  int V = src1[0] - src2[ 0];                  /* vertical gradient accum. */
  for(k=2; k<=8; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);               /* weight grows with distance */
    V += k*(src1[0] - src2[ 0]);
  }
  if(svq3){
    H = ( 5*(H/4) ) / 16;
    V = ( 5*(V/4) ) / 16;

    /* required for 100% accuracy */
    i = H; H = V; V = i;
  }else if(rv40){
    H = ( H + (H>>2) ) >> 4;
    V = ( V + (V>>2) ) >> 4;
  }else{
    H = ( 5*H+32 ) >> 6;
    V = ( 5*V+32 ) >> 6;
  }

  /* a is the plane value at the top-left, scaled by 32 (>>5 below). */
  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
  for(j=16; j>0; --j) {
    int b = a;
    a += V;
    for(i=-16; i<0; i+=4) {
      src[16+i] = CLIP((b    ) >> 5);
      src[17+i] = CLIP((b+  H) >> 5);
      src[18+i] = CLIP((b+2*H) >> 5);
      src[19+i] = CLIP((b+3*H) >> 5);
      b += 4*H;
    }
    src += stride;
  }
}
482
/* H.264 plane mode: the compat helper with both codec flags cleared. */
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
}
487
488static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
489{
490    int i;
491    pixel *src = (pixel*)_src;
492    int stride = _stride>>(sizeof(pixel)-1);
493    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
494    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
495
496    for(i=0; i<8; i++){
497        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
498        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
499    }
500}
501
502static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
503{
504    int i;
505    pixel *src = (pixel*)_src;
506    int stride = _stride>>(sizeof(pixel)-1);
507    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
508    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
509
510    for(i=0; i<16; i++){
511        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
512        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
513    }
514}
515
516static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
517{
518    int i;
519    pixel *src = (pixel*)_src;
520    stride >>= sizeof(pixel)-1;
521
522    for(i=0; i<8; i++){
523        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
524        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
525        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
526    }
527}
528
529static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
530{
531    int i;
532    pixel *src = (pixel*)_src;
533    stride >>= sizeof(pixel)-1;
534    for(i=0; i<16; i++){
535        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
536        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
537        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
538    }
539}
540
/* Generate pred8x8_{127,128,129}_dc: fill the 8x8 block with constant v. */
#define PRED8x8_X(n, v)\
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    const pixel4 a = PIXEL_SPLAT_X4(v);\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    for(i=0; i<8; i++){\
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
    }\
}

/* Mid-grey and mid-grey +/- 1 constant fills. */
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
557
/* 8x16 mid-grey fill: two stacked 8x8 fills (stride is in bytes, so
 * 8*stride addresses the row 8 of the block). */
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_128_dc)(_src, stride);
    FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
}
563
564static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
565{
566    int i;
567    int dc0, dc2;
568    pixel4 dc0splat, dc2splat;
569    pixel *src = (pixel*)_src;
570    stride >>= sizeof(pixel)-1;
571
572    dc0=dc2=0;
573    for(i=0;i<4; i++){
574        dc0+= src[-1+i*stride];
575        dc2+= src[-1+(i+4)*stride];
576    }
577    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
578    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
579
580    for(i=0; i<4; i++){
581        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
582        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
583    }
584    for(i=4; i<8; i++){
585        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
586        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
587    }
588}
589
/* 8x16 left DC: two stacked 8x8 left-DC predictions (stride in bytes). */
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(_src, stride);
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
}
595
596static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
597{
598    int i;
599    int dc0, dc1;
600    pixel4 dc0splat, dc1splat;
601    pixel *src = (pixel*)_src;
602    stride >>= sizeof(pixel)-1;
603
604    dc0=dc1=0;
605    for(i=0;i<4; i++){
606        dc0+= src[i-stride];
607        dc1+= src[4+i-stride];
608    }
609    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
610    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
611
612    for(i=0; i<4; i++){
613        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
614        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
615    }
616    for(i=4; i<8; i++){
617        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
618        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
619    }
620}
621
622static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
623{
624    int i;
625    int dc0, dc1;
626    pixel4 dc0splat, dc1splat;
627    pixel *src = (pixel*)_src;
628    stride >>= sizeof(pixel)-1;
629
630    dc0=dc1=0;
631    for(i=0;i<4; i++){
632        dc0+= src[i-stride];
633        dc1+= src[4+i-stride];
634    }
635    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
636    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
637
638    for(i=0; i<16; i++){
639        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
640        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
641    }
642}
643
644static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
645{
646    int i;
647    int dc0, dc1, dc2;
648    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
649    pixel *src = (pixel*)_src;
650    stride >>= sizeof(pixel)-1;
651
652    dc0=dc1=dc2=0;
653    for(i=0;i<4; i++){
654        dc0+= src[-1+i*stride] + src[i-stride];
655        dc1+= src[4+i-stride];
656        dc2+= src[-1+(i+4)*stride];
657    }
658    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
659    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
660    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
661    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
662
663    for(i=0; i<4; i++){
664        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
665        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
666    }
667    for(i=4; i<8; i++){
668        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
669        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
670    }
671}
672
/*
 * 8x16 DC prediction: the block is split into four 8x4 bands of two
 * 4x4 halves each. The left halves use their own left-column sums
 * (dc0 also folds in the top row); the right halves of the lower
 * bands have no top neighbours and average dc1 with the band's
 * left-column sum.
 */
static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
{
    int i;
    int dc0, dc1, dc2, dc3, dc4;
    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
    pixel *src = (pixel*)_src;
    stride >>= sizeof(pixel)-1; /* byte stride -> pixel stride */

    dc0=dc1=dc2=dc3=dc4=0;
    for(i=0;i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride]; /* top-left: left + top  */
        dc1+= src[4+i-stride];                  /* top-right: top only   */
        dc2+= src[-1+(i+4)*stride];             /* left, rows 4..7       */
        dc3+= src[-1+(i+8)*stride];             /* left, rows 8..11      */
        dc4+= src[-1+(i+12)*stride];            /* left, rows 12..15     */
    }
    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);

    for(i=0; i<4; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    }
    for(i=4; i<8; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    }
    for(i=8; i<12; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
    }
    for(i=12; i<16; i++){
        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
    }
}
715
//the following functions should not be optimized!
/* Predict the whole 8x8 from the top edge, then overwrite the top-left
 * 4x4 with its combined left+top DC. */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
722
/* 8x16 variant: top-edge DC everywhere, then full DC on the top-left 4x4. */
static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}
728
/* Full 8x8 DC, then redo the top-left 4x4 from its top edge only. */
static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
734
/* 8x16 variant: full DC, then top-only DC on the top-left 4x4. */
static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}
740
/* Left-edge DC everywhere, then reset the bottom two 4x4s to mid-grey.
 * Offsets are in bytes: 4*stride = 4 rows down, 4*sizeof(pixel) = 4
 * pixels right. */
static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
747
/* 8x16 variant: left-edge DC, then mid-grey on rows 4-7 (both 4x4s). */
static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}
754
/* Left-edge DC everywhere, then reset the top two 4x4s to mid-grey. */
static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
761
/* 8x16 variant: left-edge DC, then mid-grey on the top two 4x4s. */
static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
768
/*
 * Plane (gradient) 8x8 prediction: H and V are distance-weighted sums
 * of differences across the top and left edges; each output sample is
 * the clipped plane value (b + x*H) >> 5 advancing by V per row.
 */
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
{
  int j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);     /* byte stride -> pixel stride */
  const pixel * const src0 = src +3-stride;    /* centre of the top edge */
  const pixel *       src1 = src +4*stride-1;  /* walks down the left edge */
  const pixel *       src2 = src1-2*stride;    // == src+2*stride-1;
  int H = src0[1] - src0[-1];                  /* horizontal gradient accum. */
  int V = src1[0] - src2[ 0];                  /* vertical gradient accum. */
  for(k=2; k<=4; ++k) {
    src1 += stride; src2 -= stride;
    H += k*(src0[k] - src0[-k]);
    V += k*(src1[0] - src2[ 0]);
  }
  H = ( 17*H+16 ) >> 5;
  V = ( 17*V+16 ) >> 5;

  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
  for(j=8; j>0; --j) {
    int b = a;
    a += V;
    src[0] = CLIP((b    ) >> 5);
    src[1] = CLIP((b+  H) >> 5);
    src[2] = CLIP((b+2*H) >> 5);
    src[3] = CLIP((b+3*H) >> 5);
    src[4] = CLIP((b+4*H) >> 5);
    src[5] = CLIP((b+5*H) >> 5);
    src[6] = CLIP((b+6*H) >> 5);
    src[7] = CLIP((b+7*H) >> 5);
    src += stride;
  }
}
804
/*
 * Plane (gradient) 8x16 prediction: like pred8x8_plane but the
 * vertical gradient is accumulated over 8 edge pairs (two loops) and
 * scaled for the 16-row height.
 */
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
{
  int j, k;
  int a;
  INIT_CLIP
  pixel *src = (pixel*)_src;
  int stride = _stride>>(sizeof(pixel)-1);     /* byte stride -> pixel stride */
  const pixel * const src0 = src +3-stride;    /* centre of the top edge */
  const pixel *       src1 = src +8*stride-1;  /* walks down the left edge */
  const pixel *       src2 = src1-2*stride;    // == src+6*stride-1;
  int H = src0[1] - src0[-1];                  /* horizontal gradient accum. */
  int V = src1[0] - src2[ 0];                  /* vertical gradient accum. */

  for (k = 2; k <= 4; ++k) {
      src1 += stride; src2 -= stride;
      H += k*(src0[k] - src0[-k]);
      V += k*(src1[0] - src2[ 0]);
  }
  /* Top edge is only 8 wide: continue accumulating V alone. */
  for (; k <= 8; ++k) {
      src1 += stride; src2 -= stride;
      V += k*(src1[0] - src2[0]);
  }

  H = (17*H+16) >> 5;
  V = (5*V+32) >> 6;

  a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
  for(j=16; j>0; --j) {
    int b = a;
    a += V;
    src[0] = CLIP((b    ) >> 5);
    src[1] = CLIP((b+  H) >> 5);
    src[2] = CLIP((b+2*H) >> 5);
    src[3] = CLIP((b+3*H) >> 5);
    src[4] = CLIP((b+4*H) >> 5);
    src[5] = CLIP((b+5*H) >> 5);
    src[6] = CLIP((b+6*H) >> 5);
    src[7] = CLIP((b+7*H) >> 5);
    src += stride;
  }
}
846
/* Sample accessor relative to the 8x8 block origin. */
#define SRC(x,y) src[(x)+(y)*stride]
/* One [1 2 1]/4-filtered left-edge sample for row y. */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Load filtered left-edge samples l0..l7; l0 falls back to SRC(-1,0)
 * when the top-left sample is unavailable, l7 uses a 2-sample tap. */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* One [1 2 1]/4-filtered top-edge sample for column x. */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Load filtered top-edge samples t0..t7, with edge fallbacks when the
 * top-left or top-right neighbours are unavailable. */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* One filtered top-right-edge sample for column x. */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Load t8..t15; when the top-right block is unavailable, all eight are
 * replicated from the last top sample. */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* Filtered top-left corner sample. */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fill the 8x8 block with packed value v; advances src past the block. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
        src += stride; \
    }
884
/* 8x8 luma fill with mid-grey; the edge-availability flags are unused. */
static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */

    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
}
/* 8x8 luma DC computed from the filtered left edge only. */
static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
                                    int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */

    PREDICT_8x8_LOAD_LEFT;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma DC computed from the filtered top edge only. */
static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
                                   int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */

    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma DC: average of all 16 filtered left and top edge samples. */
static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
                               int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */

    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
    PREDICT_8x8_DC(dc);
}
/* 8x8 luma horizontal: each row is filled with its filtered left sample. */
static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    pixel4 a;

    PREDICT_8x8_LOAD_LEFT;
#define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
               AV_WN4PA(src+y*stride, a); \
               AV_WN4PA(src+y*stride+4, a);
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
/* 8x8 luma vertical: the filtered top edge is written into row 0, then
 * re-read as two packed words and copied to the remaining rows. */
static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
                                     int has_topright, ptrdiff_t _stride)
{
    int y;
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    pixel4 a, b;

    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    a = AV_RN4PA(((pixel4*)src)+0);
    b = AV_RN4PA(((pixel4*)src)+1);
    for( y = 1; y < 8; y++ ) {
        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
    }
}
/* 8x8 luma diagonal down-left: each down-left diagonal gets one
 * [1 2 1]/4 filtered value from the filtered top/top-right samples
 * t0..t15; the last corner uses a (t14 + 3*t15) tap. */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                      int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1); /* byte stride -> pixel stride */
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction: diagonal down-right mode.
 * Each diagonal (constant x-y) receives one value: a (1,2,1)/4 lowpass
 * over the filtered neighbour sequence l7..l0, lt (top-left), t0..t7.
 * Diagonals below the main one come from the left column, the main
 * diagonal is centered on lt, and those above come from the top row.
 */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    /* main diagonal, centered on the top-left corner sample */
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction: vertical-right mode.
 * The prediction direction is between vertical and down-right, so each
 * value repeats on a knight-move pattern (x+1, y+2).  Positions reached
 * an even number of half-steps from the top row use 2-tap averages
 * ((a+b+1)>>1) of adjacent top neighbours; the interleaved positions use
 * 3-tap (1,2,1)/4 filters.  The leftmost columns that the top row cannot
 * reach are filled from the left neighbours l0..l6 and lt.
 */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * 8x8 luma intra prediction: horizontal-down mode.
 * Mirror of vertical-right: the prediction direction lies between
 * horizontal and down-right, so values repeat on an (x+2, y+1) pattern.
 * Even positions use 2-tap averages ((a+b+1)>>1) of adjacent left
 * neighbours, interleaved with 3-tap (1,2,1)/4 filters; the top rows
 * that the left column cannot reach are filled from lt and t0..t6.
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction: vertical-left mode.
 * The prediction direction lies between vertical and down-left.  Even
 * rows use 2-tap averages ((a+b+1)>>1) of adjacent top / top-right
 * neighbours, odd rows use 3-tap (1,2,1)/4 filters, with the sampling
 * window shifting one neighbour to the right every two rows.
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 luma intra prediction: horizontal-up mode.
 * Interpolates upward along the left neighbours l0..l7, alternating
 * 2-tap averages and 3-tap (1,2,1)/4 filters; each value repeats on an
 * (x+2, y+1) pattern.  Positions that would sample below l7 saturate:
 * the last filtered value weights l7 triple, and the remaining
 * bottom-right region is flat-filled with l7.
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything "below" the left edge extrapolates flat to l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
1127
1128static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1129                                     int has_topright, ptrdiff_t _stride)
1130{
1131    int i;
1132    pixel *src = (pixel*)_src;
1133    const dctcoef *block = (const dctcoef*)_block;
1134    pixel pix[8];
1135    int stride = _stride>>(sizeof(pixel)-1);
1136    PREDICT_8x8_LOAD_TOP;
1137
1138    pix[0] = t0;
1139    pix[1] = t1;
1140    pix[2] = t2;
1141    pix[3] = t3;
1142    pix[4] = t4;
1143    pix[5] = t5;
1144    pix[6] = t6;
1145    pix[7] = t7;
1146
1147    for(i=0; i<8; i++){
1148        pixel v = pix[i];
1149        src[0*stride]= v += block[0];
1150        src[1*stride]= v += block[8];
1151        src[2*stride]= v += block[16];
1152        src[3*stride]= v += block[24];
1153        src[4*stride]= v += block[32];
1154        src[5*stride]= v += block[40];
1155        src[6*stride]= v += block[48];
1156        src[7*stride]= v +  block[56];
1157        src++;
1158        block++;
1159    }
1160
1161    memset(_block, 0, sizeof(dctcoef) * 64);
1162}
1163
1164static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
1165                               int has_topright, ptrdiff_t _stride)
1166{
1167    int i;
1168    pixel *src = (pixel*)_src;
1169    const dctcoef *block = (const dctcoef*)_block;
1170    pixel pix[8];
1171    int stride = _stride>>(sizeof(pixel)-1);
1172    PREDICT_8x8_LOAD_LEFT;
1173
1174    pix[0] = l0;
1175    pix[1] = l1;
1176    pix[2] = l2;
1177    pix[3] = l3;
1178    pix[4] = l4;
1179    pix[5] = l5;
1180    pix[6] = l6;
1181    pix[7] = l7;
1182
1183    for(i=0; i<8; i++){
1184        pixel v = pix[i];
1185        src[0]= v += block[0];
1186        src[1]= v += block[1];
1187        src[2]= v += block[2];
1188        src[3]= v += block[3];
1189        src[4]= v += block[4];
1190        src[5]= v += block[5];
1191        src[6]= v += block[6];
1192        src[7]= v +  block[7];
1193        src+= stride;
1194        block+= 8;
1195    }
1196
1197    memset(_block, 0, sizeof(dctcoef) * 64);
1198}
1199
1200#undef PREDICT_8x8_LOAD_LEFT
1201#undef PREDICT_8x8_LOAD_TOP
1202#undef PREDICT_8x8_LOAD_TOPLEFT
1203#undef PREDICT_8x8_LOAD_TOPRIGHT
1204#undef PREDICT_8x8_DC
1205#undef PTR
1206#undef PT
1207#undef PL
1208#undef SRC
1209
1210static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
1211                                        ptrdiff_t stride)
1212{
1213    int i;
1214    pixel *pix = (pixel*)_pix;
1215    const dctcoef *block = (const dctcoef*)_block;
1216    stride >>= sizeof(pixel)-1;
1217    pix -= stride;
1218    for(i=0; i<4; i++){
1219        pixel v = pix[0];
1220        pix[1*stride]= v += block[0];
1221        pix[2*stride]= v += block[4];
1222        pix[3*stride]= v += block[8];
1223        pix[4*stride]= v +  block[12];
1224        pix++;
1225        block++;
1226    }
1227
1228    memset(_block, 0, sizeof(dctcoef) * 16);
1229}
1230
1231static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
1232                                          ptrdiff_t stride)
1233{
1234    int i;
1235    pixel *pix = (pixel*)_pix;
1236    const dctcoef *block = (const dctcoef*)_block;
1237    stride >>= sizeof(pixel)-1;
1238    for(i=0; i<4; i++){
1239        pixel v = pix[-1];
1240        pix[0]= v += block[0];
1241        pix[1]= v += block[1];
1242        pix[2]= v += block[2];
1243        pix[3]= v +  block[3];
1244        pix+= stride;
1245        block+= 4;
1246    }
1247
1248    memset(_block, 0, sizeof(dctcoef) * 16);
1249}
1250
1251static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
1252                                         ptrdiff_t stride)
1253{
1254    int i;
1255    pixel *pix = (pixel*)_pix;
1256    const dctcoef *block = (const dctcoef*)_block;
1257    stride >>= sizeof(pixel)-1;
1258    pix -= stride;
1259    for(i=0; i<8; i++){
1260        pixel v = pix[0];
1261        pix[1*stride]= v += block[0];
1262        pix[2*stride]= v += block[8];
1263        pix[3*stride]= v += block[16];
1264        pix[4*stride]= v += block[24];
1265        pix[5*stride]= v += block[32];
1266        pix[6*stride]= v += block[40];
1267        pix[7*stride]= v += block[48];
1268        pix[8*stride]= v +  block[56];
1269        pix++;
1270        block++;
1271    }
1272
1273    memset(_block, 0, sizeof(dctcoef) * 64);
1274}
1275
1276static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
1277                                           ptrdiff_t stride)
1278{
1279    int i;
1280    pixel *pix = (pixel*)_pix;
1281    const dctcoef *block = (const dctcoef*)_block;
1282    stride >>= sizeof(pixel)-1;
1283    for(i=0; i<8; i++){
1284        pixel v = pix[-1];
1285        pix[0]= v += block[0];
1286        pix[1]= v += block[1];
1287        pix[2]= v += block[2];
1288        pix[3]= v += block[3];
1289        pix[4]= v += block[4];
1290        pix[5]= v += block[5];
1291        pix[6]= v += block[6];
1292        pix[7]= v +  block[7];
1293        pix+= stride;
1294        block+= 8;
1295    }
1296
1297    memset(_block, 0, sizeof(dctcoef) * 64);
1298}
1299
1300static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
1301                                          int16_t *block,
1302                                          ptrdiff_t stride)
1303{
1304    int i;
1305    for(i=0; i<16; i++)
1306        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1307}
1308
1309static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
1310                                            const int *block_offset,
1311                                            int16_t *block,
1312                                            ptrdiff_t stride)
1313{
1314    int i;
1315    for(i=0; i<16; i++)
1316        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1317}
1318
1319static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
1320                                        int16_t *block, ptrdiff_t stride)
1321{
1322    int i;
1323    for(i=0; i<4; i++)
1324        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1325}
1326
1327static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
1328                                         int16_t *block, ptrdiff_t stride)
1329{
1330    int i;
1331    for(i=0; i<4; i++)
1332        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1333    for(i=4; i<8; i++)
1334        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1335}
1336
1337static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
1338                                          int16_t *block,
1339                                          ptrdiff_t stride)
1340{
1341    int i;
1342    for(i=0; i<4; i++)
1343        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1344}
1345
1346static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
1347                                           const int *block_offset,
1348                                           int16_t *block, ptrdiff_t stride)
1349{
1350    int i;
1351    for(i=0; i<4; i++)
1352        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
1353    for(i=4; i<8; i++)
1354        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
1355}
1356