1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file libpostproc/postprocess.c
25 * postprocessing.
26 */
27
28/*
29                        C       MMX     MMX2    3DNow   AltiVec
30isVertDC                Ec      Ec                      Ec
31isVertMinMaxOk          Ec      Ec                      Ec
32doVertLowPass           E               e       e       Ec
33doVertDefFilter         Ec      Ec      e       e       Ec
34isHorizDC               Ec      Ec                      Ec
35isHorizMinMaxOk         a       E                       Ec
36doHorizLowPass          E               e       e       Ec
37doHorizDefFilter        Ec      Ec      e       e       Ec
38do_a_deblock            Ec      E       Ec      E
39deRing                  E               e       e*      Ecp
40Vertical RKAlgo1        E               a       a
41Horizontal RKAlgo1                      a       a
42Vertical X1#            a               E       E
43Horizontal X1#          a               E       E
44LinIpolDeinterlace      e               E       E*
45CubicIpolDeinterlace    a               e       e*
46LinBlendDeinterlace     e               E       E*
47MedianDeinterlace#      E       Ec      Ec
48TempDeNoiser#           E               e       e       Ec
49
50* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
# more or less self-invented filters, so the exactness is not too meaningful
52E = Exact implementation
53e = almost exact implementation (slightly different rounding,...)
54a = alternative / approximate impl
55c = checked against the other implementations (-vo md5)
56p = partially optimized, still some work to do
57*/
58
59/*
60TODO:
61reduce the time wasted on the mem transfer
62unroll stuff if instructions depend too much on the prior one
63move YScale thing to the end instead of fixing QP
64write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once);
        the if/else stuff per block is slowing things down
67compare the quality & speed of all filters
68split this huge file
69optimize c versions
70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71...
72*/
73
74//Changelog: use the Subversion log
75
76#include "config.h"
77#include "libavutil/avutil.h"
78#include <inttypes.h>
79#include <stdio.h>
80#include <stdlib.h>
81#include <string.h>
82//#undef HAVE_MMX2
83//#define HAVE_AMD3DNOW
84//#undef HAVE_MMX
85//#undef ARCH_X86
86//#define DEBUG_BRIGHTNESS
87#include "postprocess.h"
88#include "postprocess_internal.h"
89
90unsigned postproc_version(void)
91{
92    return LIBPOSTPROC_VERSION_INT;
93}
94
95#if HAVE_ALTIVEC_H
96#include <altivec.h>
97#endif
98
// Size of the scratch buffer the mode string is copied into by
// pp_get_mode_by_name_and_quality().
#define GET_MODE_BUFFER_SIZE 500
// Capacity of the per-filter array of option pointers in the mode parser.
#define OPTIONS_ARRAY_SIZE 10
// Edge length, in pixels, of the blocks the C filters below iterate over.
#define BLOCK_SIZE 8
// NOTE(review): not referenced in this part of the file; presumably used by
// the included templates - confirm.
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

// 64-bit constants for x86 builds. The wNN values repeat a 16-bit value in
// each of the four words, the bNN values repeat a byte in each of the eight
// bytes.
// NOTE(review): presumably consumed by the inline-asm code in
// postprocess_template.c, and the first macro argument presumably is the
// alignment - confirm against the DECLARE_ASM_CONST definition.
#if ARCH_X86
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

// NOTE(review): not referenced in this part of the file; presumably the
// threshold of the dering filter in the templates - confirm.
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
117
118
/*
 * Table of every filter that can be named in a mode string.
 *
 * The mode parser walks this table until it hits the entry whose shortName
 * is NULL and compares the user supplied name against both shortName and
 * longName with strcmp(), so the names are case sensitive. On a match it
 * reads chromDefault, minLumQuality, minChromQuality and mask of the entry.
 *
 * Each entry is { shortName, longName, <three integers>, mask }.
 * NOTE(review): the three integers presumably are chromDefault,
 * minLumQuality and minChromQuality in that order - confirm against the
 * declaration of struct PPFilter in postprocess_internal.h.
 */
static struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {NULL, NULL,0,0,0,0} //End Marker
};
141
/*
 * Aliases for whole filter chains, stored as consecutive
 * (alias, replacement) string pairs.
 *
 * The mode parser compares a filter name against the even entries and, on a
 * match, splices the following odd entry into its working copy of the mode
 * string. The single NULL terminates the scan, which only ever tests even
 * indices, so the table must always hold complete pairs.
 */
static const char *replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
151
152
153#if ARCH_X86
/**
 * Execute the x86 "prefetchnta" instruction on the address p.
 * The asm statement has no outputs and declares no clobbers.
 *
 * @param p address handed to the instruction in a general purpose register
 */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}
160
/**
 * Execute the x86 "prefetcht0" instruction on the address p.
 * The asm statement has no outputs and declares no clobbers.
 *
 * @param p address handed to the instruction in a general purpose register
 */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}
167
/**
 * Execute the x86 "prefetcht1" instruction on the address p.
 * The asm statement has no outputs and declares no clobbers.
 *
 * @param p address handed to the instruction in a general purpose register
 */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}
174
/**
 * Execute the x86 "prefetcht2" instruction on the address p.
 * The asm statement has no outputs and declares no clobbers.
 *
 * @param p address handed to the instruction in a general purpose register
 */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
181#endif
182
183/* The horizontal functions exist only in C because the MMX
184 * code is faster with vertical filters and transposing. */
185
186/**
187 * Check if the given 8x8 Block is mostly "flat"
188 */
189static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
190{
191    int numEq= 0;
192    int y;
193    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
194    const int dcThreshold= dcOffset*2 + 1;
195
196    for(y=0; y<BLOCK_SIZE; y++){
197        if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
198        if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
199        if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
200        if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
201        if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
202        if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
203        if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
204        src+= stride;
205    }
206    return numEq > c->ppMode.flatnessThreshold;
207}
208
209/**
210 * Check if the middle 8x8 Block in the given 8x16 block is flat
211 */
212static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
213{
214    int numEq= 0;
215    int y;
216    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
217    const int dcThreshold= dcOffset*2 + 1;
218
219    src+= stride*4; // src points to begin of the 8x8 Block
220    for(y=0; y<BLOCK_SIZE-1; y++){
221        if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
222        if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
223        if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
224        if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
225        if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
226        if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
227        if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
228        if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
229        src+= stride;
230    }
231    return numEq > c->ppMode.flatnessThreshold;
232}
233
/**
 * Cheap approximation of a horizontal min/max range check on 8 rows.
 *
 * Only one pair of pixels is sampled per row. The column pairs
 * (0,5), (2,7), (4,1), (6,3) are used for rows 0..3 and again for rows 4..7.
 *
 * @param src    first pixel of the first row
 * @param stride offset between two consecutive rows
 * @param QP     quantizer; a sampled pair may differ by at most 2*QP
 * @return 1 if every sampled pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    static const uint8_t probe[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const int bias  = 2 * QP;
    const int limit = 4 * QP;
    int row;

    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        const uint8_t *pair = probe[row & 3];

        // the unsigned cast turns |difference| > 2*QP into a single compare
        if ((unsigned)(line[pair[0]] - line[pair[1]] + bias) > limit)
            return 0;
    }
    return 1;
}
256
257static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
258{
259#if 1
260#if 1
261    int x;
262    src+= stride*4;
263    for(x=0; x<BLOCK_SIZE; x+=4){
264        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
265        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
266        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
267        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
268    }
269#else
270    int x;
271    src+= stride*3;
272    for(x=0; x<BLOCK_SIZE; x++){
273        if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
274    }
275#endif
276    return 1;
277#else
278    int x;
279    src+= stride*4;
280    for(x=0; x<BLOCK_SIZE; x++){
281        int min=255;
282        int max=0;
283        int y;
284        for(y=0; y<8; y++){
285            int v= src[x + y*stride];
286            if(v>max) max=v;
287            if(v<min) min=v;
288        }
289        if(max-min > 2*QP) return 0;
290    }
291    return 1;
292#endif
293}
294
295static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
296{
297    if( isHorizDC_C(src, stride, c) ){
298        if( isHorizMinMaxOk_C(src, stride, c->QP) )
299            return 1;
300        else
301            return 0;
302    }else{
303        return 2;
304    }
305}
306
307static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
308{
309    if( isVertDC_C(src, stride, c) ){
310        if( isVertMinMaxOk_C(src, stride, c->QP) )
311            return 1;
312        else
313            return 0;
314    }else{
315        return 2;
316    }
317}
318
319static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
320{
321    int y;
322    for(y=0; y<BLOCK_SIZE; y++){
323        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
324
325        if(FFABS(middleEnergy) < 8*c->QP){
326            const int q=(dst[3] - dst[4])/2;
327            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
328            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
329
330            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
331            d= FFMAX(d, 0);
332
333            d= (5*d + 32) >> 6;
334            d*= FFSIGN(-middleEnergy);
335
336            if(q>0)
337            {
338                d= d<0 ? 0 : d;
339                d= d>q ? q : d;
340            }
341            else
342            {
343                d= d>0 ? 0 : d;
344                d= d<q ? q : d;
345            }
346
347            dst[3]-= d;
348            dst[4]+= d;
349        }
350        dst+= stride;
351    }
352}
353
354/**
355 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
356 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
357 */
358static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
359{
360    int y;
361    for(y=0; y<BLOCK_SIZE; y++){
362        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
363        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
364
365        int sums[10];
366        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
367        sums[1] = sums[0] - first  + dst[3];
368        sums[2] = sums[1] - first  + dst[4];
369        sums[3] = sums[2] - first  + dst[5];
370        sums[4] = sums[3] - first  + dst[6];
371        sums[5] = sums[4] - dst[0] + dst[7];
372        sums[6] = sums[5] - dst[1] + last;
373        sums[7] = sums[6] - dst[2] + last;
374        sums[8] = sums[7] - dst[3] + last;
375        sums[9] = sums[8] - dst[4] + last;
376
377        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
378        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
379        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
380        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
381        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
382        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
383        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
384        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
385
386        dst+= stride;
387    }
388}
389
390/**
391 * Experimental Filter 1 (Horizontal)
392 * will not damage linear gradients
393 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
394 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
395 * MMX2 version does correct clipping C version does not
396 * not identical with the vertical one
397 */
398static inline void horizX1Filter(uint8_t *src, int stride, int QP)
399{
400    int y;
401    static uint64_t *lut= NULL;
402    if(lut==NULL)
403    {
404        int i;
405        lut = av_malloc(256*8);
406        for(i=0; i<256; i++)
407        {
408            int v= i < 128 ? 2*i : 2*(i-256);
409/*
410//Simulate 112242211 9-Tap filter
411            uint64_t a= (v/16)  & 0xFF;
412            uint64_t b= (v/8)   & 0xFF;
413            uint64_t c= (v/4)   & 0xFF;
414            uint64_t d= (3*v/8) & 0xFF;
415*/
416//Simulate piecewise linear interpolation
417            uint64_t a= (v/16)   & 0xFF;
418            uint64_t b= (v*3/16) & 0xFF;
419            uint64_t c= (v*5/16) & 0xFF;
420            uint64_t d= (7*v/16) & 0xFF;
421            uint64_t A= (0x100 - a)&0xFF;
422            uint64_t B= (0x100 - b)&0xFF;
423            uint64_t C= (0x100 - c)&0xFF;
424            uint64_t D= (0x100 - c)&0xFF;
425
426            lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
427                       (D<<24) | (C<<16) | (B<<8)  | (A);
428            //lut[i] = (v<<32) | (v<<24);
429        }
430    }
431
432    for(y=0; y<BLOCK_SIZE; y++){
433        int a= src[1] - src[2];
434        int b= src[3] - src[4];
435        int c= src[5] - src[6];
436
437        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
438
439        if(d < QP){
440            int v = d * FFSIGN(-b);
441
442            src[1] +=v/8;
443            src[2] +=v/4;
444            src[3] +=3*v/8;
445            src[4] -=3*v/8;
446            src[5] -=v/4;
447            src[6] -=v/8;
448        }
449        src+=stride;
450    }
451}
452
453/**
454 * accurate deblock filter
455 */
456static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
457    int y;
458    const int QP= c->QP;
459    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
460    const int dcThreshold= dcOffset*2 + 1;
461//START_TIMER
462    src+= step*4; // src points to begin of the 8x8 Block
463    for(y=0; y<8; y++){
464        int numEq= 0;
465
466        if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
467        if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
468        if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
469        if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
470        if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
471        if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
472        if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
473        if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
474        if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
475        if(numEq > c->ppMode.flatnessThreshold){
476            int min, max, x;
477
478            if(src[0] > src[step]){
479                max= src[0];
480                min= src[step];
481            }else{
482                max= src[step];
483                min= src[0];
484            }
485            for(x=2; x<8; x+=2){
486                if(src[x*step] > src[(x+1)*step]){
487                        if(src[x    *step] > max) max= src[ x   *step];
488                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
489                }else{
490                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
491                        if(src[ x   *step] < min) min= src[ x   *step];
492                }
493            }
494            if(max-min < 2*QP){
495                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
496                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
497
498                int sums[10];
499                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
500                sums[1] = sums[0] - first       + src[3*step];
501                sums[2] = sums[1] - first       + src[4*step];
502                sums[3] = sums[2] - first       + src[5*step];
503                sums[4] = sums[3] - first       + src[6*step];
504                sums[5] = sums[4] - src[0*step] + src[7*step];
505                sums[6] = sums[5] - src[1*step] + last;
506                sums[7] = sums[6] - src[2*step] + last;
507                sums[8] = sums[7] - src[3*step] + last;
508                sums[9] = sums[8] - src[4*step] + last;
509
510                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
511                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
512                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
513                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
514                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
515                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
516                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
517                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
518            }
519        }else{
520            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
521
522            if(FFABS(middleEnergy) < 8*QP){
523                const int q=(src[3*step] - src[4*step])/2;
524                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
525                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
526
527                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
528                d= FFMAX(d, 0);
529
530                d= (5*d + 32) >> 6;
531                d*= FFSIGN(-middleEnergy);
532
533                if(q>0){
534                    d= d<0 ? 0 : d;
535                    d= d>q ? q : d;
536                }else{
537                    d= d>0 ? 0 : d;
538                    d= d<q ? q : d;
539                }
540
541                src[3*step]-= d;
542                src[4*step]+= d;
543            }
544        }
545
546        src += stride;
547    }
548/*if(step==16){
549    STOP_TIMER("step16")
550}else{
551    STOP_TIMER("stepX")
552}*/
553}
554
// Template instantiation section.
// postprocess_template.c is included once per CPU flavour. Before each
// include, RENAME() is redefined to append a flavour suffix (_C, _altivec,
// _MMX, _MMX2, _3DNow) and the HAVE_* macros are forced to the values of
// that flavour.
// NOTE(review): the template itself is not visible here; presumably it wraps
// every function name in RENAME() and selects its code paths with HAVE_*.
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
//Plain C versions
// The C flavour is built when neither MMX nor AltiVec is available at compile
// time, or always when the CPU is detected at run time.
#if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_C
#endif

#if HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC

#if ARCH_X86

// Without run-time detection exactly one x86 flavour is selected, with MMX2
// taking precedence over 3DNow, and 3DNow over plain MMX. With run-time
// detection all three are built.
#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_MMX
#endif

#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_MMX2
#endif

#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_3DNOW
#endif
#endif /* ARCH_X86 */

// Start from "no SIMD at all"; each flavour below switches on what it needs.
#undef HAVE_MMX
#define HAVE_MMX 0
#undef HAVE_MMX2
#define HAVE_MMX2 0
#undef HAVE_AMD3DNOW
#define HAVE_AMD3DNOW 0
#undef HAVE_ALTIVEC
#define HAVE_ALTIVEC 0

#ifdef COMPILE_C
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

// AltiVec versions
// HAVE_ALTIVEC is not reset to 0 after this instantiation.
#ifdef COMPILE_ALTIVEC
#undef RENAME
#undef HAVE_ALTIVEC
#define HAVE_ALTIVEC 1
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#undef HAVE_MMX
#define HAVE_MMX 1
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#define HAVE_MMX 1
#define HAVE_MMX2 1
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_AMD3DNOW
#define HAVE_MMX 1
#define HAVE_MMX2 0
#define HAVE_AMD3DNOW 1
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// Minor note: from here on the HAVE_xyz macros hold whatever the last
// instantiation above defined, not the values from config.h, so do not rely
// on them as configuration switches.
637
/**
 * Run the postprocessing filters on one plane by dispatching to the
 * implementation matching the CPU.
 *
 * The mode is copied into the context, then all arguments are forwarded
 * unchanged to one of the postProcess_<flavour>() functions instantiated from
 * the template. With CONFIG_RUNTIME_CPUDETECT the flavour is chosen from
 * c->cpuCaps at run time, otherwise it is fixed by the preprocessor.
 *
 * NOTE(review): the HAVE_* macros tested below were redefined by the template
 * instantiation section above, so they reflect the last instantiated flavour
 * rather than config.h.
 *
 * @param vm mode, cast to PPMode
 * @param vc context, cast to PPContext
 */
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // Using ifs here as they are faster than function pointers although the
    // difference would not be measurable here but it is much better because
    // someone might exchange the CPU without restarting MPlayer ;)
#if CONFIG_RUNTIME_CPUDETECT
#if ARCH_X86
    // ordered per speed fastest first
    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
#if HAVE_ALTIVEC
    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
#endif
            // also the target of the dangling "else" above when AltiVec is built
            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#else //CONFIG_RUNTIME_CPUDETECT
#if   HAVE_MMX2
            postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_AMD3DNOW
            postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_MMX
            postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_ALTIVEC
            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#endif //!CONFIG_RUNTIME_CPUDETECT
}
681
682//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
683//        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
684
/* -pp Command line Help
 *
 * The short and long filter names listed here must match the entries of the
 * filters[] table exactly: the mode parser compares names with strcmp(), so
 * a name that differs in spelling or case is not recognised.
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
734
735pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
736{
737    char temp[GET_MODE_BUFFER_SIZE];
738    char *p= temp;
739    static const char filterDelimiters[] = ",/";
740    static const char optionDelimiters[] = ":";
741    struct PPMode *ppMode;
742    char *filterToken;
743
744    ppMode= av_malloc(sizeof(PPMode));
745
746    ppMode->lumMode= 0;
747    ppMode->chromMode= 0;
748    ppMode->maxTmpNoise[0]= 700;
749    ppMode->maxTmpNoise[1]= 1500;
750    ppMode->maxTmpNoise[2]= 3000;
751    ppMode->maxAllowedY= 234;
752    ppMode->minAllowedY= 16;
753    ppMode->baseDcDiff= 256/8;
754    ppMode->flatnessThreshold= 56-16-1;
755    ppMode->maxClippedThreshold= 0.01;
756    ppMode->error=0;
757
758    strncpy(temp, name, GET_MODE_BUFFER_SIZE);
759
760    av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
761
762    for(;;){
763        char *filterName;
764        int q= 1000000; //PP_QUALITY_MAX;
765        int chrom=-1;
766        int luma=-1;
767        char *option;
768        char *options[OPTIONS_ARRAY_SIZE];
769        int i;
770        int filterNameOk=0;
771        int numOfUnknownOptions=0;
772        int enable=1; //does the user want us to enabled or disabled the filter
773
774        filterToken= strtok(p, filterDelimiters);
775        if(filterToken == NULL) break;
776        p+= strlen(filterToken) + 1; // p points to next filterToken
777        filterName= strtok(filterToken, optionDelimiters);
778        av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
779
780        if(*filterName == '-'){
781            enable=0;
782            filterName++;
783        }
784
785        for(;;){ //for all options
786            option= strtok(NULL, optionDelimiters);
787            if(option == NULL) break;
788
789            av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
790            if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
791            else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
792            else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
793            else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
794            else{
795                options[numOfUnknownOptions] = option;
796                numOfUnknownOptions++;
797            }
798            if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
799        }
800        options[numOfUnknownOptions] = NULL;
801
802        /* replace stuff from the replace Table */
803        for(i=0; replaceTable[2*i]!=NULL; i++){
804            if(!strcmp(replaceTable[2*i], filterName)){
805                int newlen= strlen(replaceTable[2*i + 1]);
806                int plen;
807                int spaceLeft;
808
809                if(p==NULL) p= temp, *p=0;      //last filter
810                else p--, *p=',';               //not last filter
811
812                plen= strlen(p);
813                spaceLeft= p - temp + plen;
814                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE){
815                    ppMode->error++;
816                    break;
817                }
818                memmove(p + newlen, p, plen+1);
819                memcpy(p, replaceTable[2*i + 1], newlen);
820                filterNameOk=1;
821            }
822        }
823
824        for(i=0; filters[i].shortName!=NULL; i++){
825            if(   !strcmp(filters[i].longName, filterName)
826               || !strcmp(filters[i].shortName, filterName)){
827                ppMode->lumMode &= ~filters[i].mask;
828                ppMode->chromMode &= ~filters[i].mask;
829
830                filterNameOk=1;
831                if(!enable) break; // user wants to disable it
832
833                if(q >= filters[i].minLumQuality && luma)
834                    ppMode->lumMode|= filters[i].mask;
835                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
836                    if(q >= filters[i].minChromQuality)
837                            ppMode->chromMode|= filters[i].mask;
838
839                if(filters[i].mask == LEVEL_FIX){
840                    int o;
841                    ppMode->minAllowedY= 16;
842                    ppMode->maxAllowedY= 234;
843                    for(o=0; options[o]!=NULL; o++){
844                        if(  !strcmp(options[o],"fullyrange")
845                           ||!strcmp(options[o],"f")){
846                            ppMode->minAllowedY= 0;
847                            ppMode->maxAllowedY= 255;
848                            numOfUnknownOptions--;
849                        }
850                    }
851                }
852                else if(filters[i].mask == TEMP_NOISE_FILTER)
853                {
854                    int o;
855                    int numOfNoises=0;
856
857                    for(o=0; options[o]!=NULL; o++){
858                        char *tail;
859                        ppMode->maxTmpNoise[numOfNoises]=
860                            strtol(options[o], &tail, 0);
861                        if(tail!=options[o]){
862                            numOfNoises++;
863                            numOfUnknownOptions--;
864                            if(numOfNoises >= 3) break;
865                        }
866                    }
867                }
868                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
869                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
870                    int o;
871
872                    for(o=0; options[o]!=NULL && o<2; o++){
873                        char *tail;
874                        int val= strtol(options[o], &tail, 0);
875                        if(tail==options[o]) break;
876
877                        numOfUnknownOptions--;
878                        if(o==0) ppMode->baseDcDiff= val;
879                        else ppMode->flatnessThreshold= val;
880                    }
881                }
882                else if(filters[i].mask == FORCE_QUANT){
883                    int o;
884                    ppMode->forcedQuant= 15;
885
886                    for(o=0; options[o]!=NULL && o<1; o++){
887                        char *tail;
888                        int val= strtol(options[o], &tail, 0);
889                        if(tail==options[o]) break;
890
891                        numOfUnknownOptions--;
892                        ppMode->forcedQuant= val;
893                    }
894                }
895            }
896        }
897        if(!filterNameOk) ppMode->error++;
898        ppMode->error += numOfUnknownOptions;
899    }
900
901    av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
902    if(ppMode->error){
903        av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
904        av_free(ppMode);
905        return NULL;
906    }
907    return ppMode;
908}
909
910void pp_free_mode(pp_mode *mode){
911    av_free(mode);
912}
913
/**
 * Discard the buffer referenced by *p and replace it with a new zero-filled
 * buffer of the requested size.
 *
 * The previous contents are not carried over. The alignment argument is
 * accepted but not used by this function; the memory comes from av_mallocz().
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment unused
 * @param size      size in bytes of the new buffer
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
918
919static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
920    int mbWidth = (width+15)>>4;
921    int mbHeight= (height+15)>>4;
922    int i;
923
924    c->stride= stride;
925    c->qpStride= qpStride;
926
927    reallocAlign((void **)&c->tempDst, 8, stride*24);
928    reallocAlign((void **)&c->tempSrc, 8, stride*24);
929    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
930    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
931    for(i=0; i<256; i++)
932            c->yHistogram[i]= width*height/64*15/256;
933
934    for(i=0; i<3; i++){
935        //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
936        reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
937        reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
938    }
939
940    reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
941    reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
942    reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
943    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
944}
945
/**
 * Name callback referenced by the context class below.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char * context_to_name(void * ptr)
{
    (void)ptr;
    return "postproc";
}
949
950static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
951
952pp_context *pp_get_context(int width, int height, int cpuCaps){
953    PPContext *c= av_malloc(sizeof(PPContext));
954    int stride= (width+15)&(~15);    //assumed / will realloc if needed
955    int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
956
957    memset(c, 0, sizeof(PPContext));
958    c->av_class = &av_codec_context_class;
959    c->cpuCaps= cpuCaps;
960    if(cpuCaps&PP_FORMAT){
961        c->hChromaSubSample= cpuCaps&0x3;
962        c->vChromaSubSample= (cpuCaps>>4)&0x3;
963    }else{
964        c->hChromaSubSample= 1;
965        c->vChromaSubSample= 1;
966    }
967
968    reallocBuffers(c, width, height, stride, qpStride);
969
970    c->frameNum=-1;
971
972    return c;
973}
974
975void pp_free_context(void *vc){
976    PPContext *c = (PPContext*)vc;
977    int i;
978
979    for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
980    for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
981
982    av_free(c->tempBlocks);
983    av_free(c->yHistogram);
984    av_free(c->tempDst);
985    av_free(c->tempSrc);
986    av_free(c->deintTemp);
987    av_free(c->stdQPTable);
988    av_free(c->nonBQPTable);
989    av_free(c->forcedQPTable);
990
991    memset(c, 0, sizeof(PPContext));
992
993    av_free(c);
994}
995
996void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
997                     uint8_t * dst[3], const int dstStride[3],
998                     int width, int height,
999                     const QP_STORE_T *QP_store,  int QPStride,
1000                     pp_mode *vm,  void *vc, int pict_type)
1001{
1002    int mbWidth = (width+15)>>4;
1003    int mbHeight= (height+15)>>4;
1004    PPMode *mode = (PPMode*)vm;
1005    PPContext *c = (PPContext*)vc;
1006    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1007    int absQPStride = FFABS(QPStride);
1008
1009    // c->stride and c->QPStride are always positive
1010    if(c->stride < minStride || c->qpStride < absQPStride)
1011        reallocBuffers(c, width, height,
1012                       FFMAX(minStride, c->stride),
1013                       FFMAX(c->qpStride, absQPStride));
1014
1015    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
1016        int i;
1017        QP_store= c->forcedQPTable;
1018        absQPStride = QPStride = 0;
1019        if(mode->lumMode & FORCE_QUANT)
1020            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1021        else
1022            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1023    }
1024
1025    if(pict_type & PP_PICT_TYPE_QP2){
1026        int i;
1027        const int count= mbHeight * absQPStride;
1028        for(i=0; i<(count>>2); i++){
1029            ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1030        }
1031        for(i<<=2; i<count; i++){
1032            c->stdQPTable[i] = QP_store[i]>>1;
1033        }
1034        QP_store= c->stdQPTable;
1035        QPStride= absQPStride;
1036    }
1037
1038    if(0){
1039        int x,y;
1040        for(y=0; y<mbHeight; y++){
1041            for(x=0; x<mbWidth; x++){
1042                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1043            }
1044            av_log(c, AV_LOG_INFO, "\n");
1045        }
1046        av_log(c, AV_LOG_INFO, "\n");
1047    }
1048
1049    if((pict_type&7)!=3){
1050        if (QPStride >= 0){
1051            int i;
1052            const int count= mbHeight * QPStride;
1053            for(i=0; i<(count>>2); i++){
1054                ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1055            }
1056            for(i<<=2; i<count; i++){
1057                c->nonBQPTable[i] = QP_store[i] & 0x3F;
1058            }
1059        } else {
1060            int i,j;
1061            for(i=0; i<mbHeight; i++) {
1062                for(j=0; j<absQPStride; j++) {
1063                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1064                }
1065            }
1066        }
1067    }
1068
1069    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1070           mode->lumMode, mode->chromMode);
1071
1072    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1073                width, height, QP_store, QPStride, 0, mode, c);
1074
1075    width  = (width )>>c->hChromaSubSample;
1076    height = (height)>>c->vChromaSubSample;
1077
1078    if(mode->chromMode){
1079        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1080                    width, height, QP_store, QPStride, 1, mode, c);
1081        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1082                    width, height, QP_store, QPStride, 2, mode, c);
1083    }
1084    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1085        linecpy(dst[1], src[1], height, srcStride[1]);
1086        linecpy(dst[2], src[2], height, srcStride[2]);
1087    }else{
1088        int y;
1089        for(y=0; y<height; y++){
1090            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1091            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1092        }
1093    }
1094}
1095
1096