1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of Libav.
7 *
8 * Libav is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/**
24 * @file
25 * postprocessing.
26 */
27
28/*
29                        C       MMX     MMX2    3DNow   AltiVec
30isVertDC                Ec      Ec                      Ec
31isVertMinMaxOk          Ec      Ec                      Ec
32doVertLowPass           E               e       e       Ec
33doVertDefFilter         Ec      Ec      e       e       Ec
34isHorizDC               Ec      Ec                      Ec
35isHorizMinMaxOk         a       E                       Ec
36doHorizLowPass          E               e       e       Ec
37doHorizDefFilter        Ec      Ec      e       e       Ec
38do_a_deblock            Ec      E       Ec      E
39deRing                  E               e       e*      Ecp
40Vertical RKAlgo1        E               a       a
41Horizontal RKAlgo1                      a       a
42Vertical X1#            a               E       E
43Horizontal X1#          a               E       E
44LinIpolDeinterlace      e               E       E*
45CubicIpolDeinterlace    a               e       e*
46LinBlendDeinterlace     e               E       E*
47MedianDeinterlace#      E       Ec      Ec
48TempDeNoiser#           E               e       e       Ec
49
50* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51# more or less selfinvented filters so the exactness is not too meaningful
52E = Exact implementation
53e = almost exact implementation (slightly different rounding,...)
54a = alternative / approximate impl
55c = checked against the other implementations (-vo md5)
56p = partially optimized, still some work to do
57*/
58
59/*
60TODO:
61reduce the time wasted on the mem transfer
62unroll stuff if instructions depend too much on the prior one
63move YScale thing to the end instead of fixing QP
64write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once;
        the if/else stuff per block is slowing things down)
67compare the quality & speed of all filters
68split this huge file
69optimize c versions
70try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71...
72*/
73
74//Changelog: use git log
75
76#include "config.h"
77#include "libavutil/avutil.h"
78#include <inttypes.h>
79#include <stdio.h>
80#include <stdlib.h>
81#include <string.h>
82//#undef HAVE_MMX2
83//#define HAVE_AMD3DNOW
84//#undef HAVE_MMX
85//#undef ARCH_X86
86//#define DEBUG_BRIGHTNESS
87#include "postprocess.h"
88#include "postprocess_internal.h"
89#include "libavutil/avstring.h"
90
91unsigned postproc_version(void)
92{
93    return LIBPOSTPROC_VERSION_INT;
94}
95
96const char *postproc_configuration(void)
97{
98    return LIBAV_CONFIGURATION;
99}
100
101const char *postproc_license(void)
102{
103#define LICENSE_PREFIX "libpostproc license: "
104    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
105}
106
107#if HAVE_ALTIVEC_H
108#include <altivec.h>
109#endif
110
/* Size in bytes of the scratch buffer into which
 * pp_get_mode_by_name_and_quality() copies the mode string before
 * tokenizing it. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter options[] pointer array used while parsing. */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows/columns the C filters in this file iterate over per block. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in the visible part of this file; presumably
 * used by the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if ARCH_X86
/* 64-bit constant patterns:
 *   wXX repeats the 16-bit value 0x00XX in each of the four words,
 *   bXX repeats the byte 0xXX in each of the eight bytes.
 * NOTE(review): not referenced in the visible part of this file; presumably
 * consumed by the inline-asm templates included further down -- confirm. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): presumably the threshold used by the deRing filter in the
 * included templates; not referenced in the visible part of this file. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
129
130
/*
 * Table of the filters that can be named in a mode string.
 * pp_get_mode_by_name_and_quality() matches a filter token against both the
 * short and the long name of each entry and then reads the entry's
 * chromDefault, minLumQuality, minChromQuality and mask fields.
 * The table is terminated by an entry whose shortName is NULL.
 *
 * NOTE(review): struct PPFilter is declared elsewhere; the positional
 * initializers below presumably follow the order
 * {shortName, longName, chromDefault, minLumQuality, minChromQuality, mask}
 * -- confirm against postprocess_internal.h.
 */
static struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {NULL, NULL,0,0,0,0} //End Marker
};
153
/*
 * Alias table used while parsing a mode string.
 * Entries come in pairs: element 2*i is an alias name and element 2*i+1 is
 * the filter list that is spliced into the mode string in its place.
 * The list is terminated by a NULL in an alias-name slot.
 */
static const char *replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
163
164
#if ARCH_X86
/* Thin wrappers that each emit one x86 prefetch instruction for the address
 * held in p. They produce no value and do not dereference p in C. */

/* Emit "prefetchnta (p)". */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht0 (p)". */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht1 (p)". */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht2 (p)". */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#endif
194
195/* The horizontal functions exist only in C because the MMX
196 * code is faster with vertical filters and transposing. */
197
198/**
199 * Check if the given 8x8 Block is mostly "flat"
200 */
201static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
202{
203    int numEq= 0;
204    int y;
205    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
206    const int dcThreshold= dcOffset*2 + 1;
207
208    for(y=0; y<BLOCK_SIZE; y++){
209        if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
210        if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
211        if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
212        if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
213        if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
214        if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
215        if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
216        src+= stride;
217    }
218    return numEq > c->ppMode.flatnessThreshold;
219}
220
221/**
222 * Check if the middle 8x8 Block in the given 8x16 block is flat
223 */
224static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
225{
226    int numEq= 0;
227    int y;
228    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
229    const int dcThreshold= dcOffset*2 + 1;
230
231    src+= stride*4; // src points to begin of the 8x8 Block
232    for(y=0; y<BLOCK_SIZE-1; y++){
233        if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
234        if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
235        if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
236        if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
237        if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
238        if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
239        if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
240        if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
241        src+= stride;
242    }
243    return numEq > c->ppMode.flatnessThreshold;
244}
245
/**
 * Sparse horizontal range check over 8 rows.
 *
 * One pixel pair is sampled per row: columns (0,5), (2,7), (4,1), (6,3) for
 * rows 0..3, and the same pattern again for rows 4..7.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a sampled pair may differ by at most 2*QP
 * @return 0 as soon as a sampled pair differs by more than 2*QP, 1 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
    int row;

    for (row = 0; row < 8; row++, src += stride) {
        const int colA = 2 * (row & 3);     /* 0, 2, 4, 6, 0, 2, 4, 6 */
        const int colB = (colA + 5) & 7;    /* 5, 7, 1, 3, 5, 7, 1, 3 */

        /* single unsigned compare == (diff < -2*QP || diff > 2*QP) */
        if ((unsigned)(src[colA] - src[colB] + 2*QP) > 4*QP)
            return 0;
    }

    return 1;
}
261
262static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
263{
264    int x;
265    src+= stride*4;
266    for(x=0; x<BLOCK_SIZE; x+=4){
267        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
268        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
269        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
270        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
271    }
272    return 1;
273}
274
275static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
276{
277    if( isHorizDC_C(src, stride, c) ){
278        if( isHorizMinMaxOk_C(src, stride, c->QP) )
279            return 1;
280        else
281            return 0;
282    }else{
283        return 2;
284    }
285}
286
287static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
288{
289    if( isVertDC_C(src, stride, c) ){
290        if( isVertMinMaxOk_C(src, stride, c->QP) )
291            return 1;
292        else
293            return 0;
294    }else{
295        return 2;
296    }
297}
298
299static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
300{
301    int y;
302    for(y=0; y<BLOCK_SIZE; y++){
303        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
304
305        if(FFABS(middleEnergy) < 8*c->QP){
306            const int q=(dst[3] - dst[4])/2;
307            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
308            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
309
310            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
311            d= FFMAX(d, 0);
312
313            d= (5*d + 32) >> 6;
314            d*= FFSIGN(-middleEnergy);
315
316            if(q>0)
317            {
318                d= d<0 ? 0 : d;
319                d= d>q ? q : d;
320            }
321            else
322            {
323                d= d>0 ? 0 : d;
324                d= d<q ? q : d;
325            }
326
327            dst[3]-= d;
328            dst[4]+= d;
329        }
330        dst+= stride;
331    }
332}
333
334/**
335 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
336 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
337 */
338static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
339{
340    int y;
341    for(y=0; y<BLOCK_SIZE; y++){
342        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
343        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
344
345        int sums[10];
346        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
347        sums[1] = sums[0] - first  + dst[3];
348        sums[2] = sums[1] - first  + dst[4];
349        sums[3] = sums[2] - first  + dst[5];
350        sums[4] = sums[3] - first  + dst[6];
351        sums[5] = sums[4] - dst[0] + dst[7];
352        sums[6] = sums[5] - dst[1] + last;
353        sums[7] = sums[6] - dst[2] + last;
354        sums[8] = sums[7] - dst[3] + last;
355        sums[9] = sums[8] - dst[4] + last;
356
357        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
358        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
359        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
360        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
361        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
362        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
363        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
364        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
365
366        dst+= stride;
367    }
368}
369
/**
 * Experimental Filter 1 (Horizontal)
 * will not damage linear gradients
 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 * The MMX2 version does correct clipping, this C version does not: results
 * are stored back into uint8_t without saturation.
 * not identical with the vertical one
 *
 * For each of 8 rows, the step across the block edge (src[3] - src[4]) is
 * compared with the average of the neighbouring steps (src[1] - src[2]) and
 * (src[5] - src[6]). If the excess is below QP, src[1..6] are adjusted by
 * 1/8, 1/4 and 3/8 of that excess on either side of the edge.
 *
 * An earlier revision lazily allocated and filled a static lookup table here
 * that was never read; it leaked, dereferenced an unchecked allocation and
 * was not thread-safe, so it has been removed.
 *
 * @param src    first of the 8 pixels of the top row; modified in place
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; edges whose excess step is >= QP are left alone
 */
static inline void horizX1Filter(uint8_t *src, int stride, int QP)
{
    int y;

    for (y = 0; y < 8; y++, src += stride) {
        const int a = src[1] - src[2];
        const int b = src[3] - src[4];
        const int c = src[5] - src[6];

        /* how much the middle step exceeds the average neighbouring step */
        int d = abs(b) - (abs(a) + abs(c)) / 2;
        if (d < 0)
            d = 0;

        if (d < QP) {
            /* correction has the opposite sign of b (b == 0 implies d == 0) */
            const int v = b < 0 ? d : -d;

            src[1] += v / 8;
            src[2] += v / 4;
            src[3] += 3 * v / 8;
            src[4] -= 3 * v / 8;
            src[5] -= v / 4;
            src[6] -= v / 8;
        }
    }
}
432
/**
 * accurate deblock filter
 *
 * Processes 8 lines. Within a line, neighbouring samples are `step` bytes
 * apart; consecutive lines are `stride` bytes apart. The filtered edge lies
 * in the middle of the 8 samples that start 4 steps after the incoming src.
 *
 * Per line:
 *  - count how many of the 9 neighbouring sample pairs from src[-1*step] to
 *    src[8*step] differ by at most dcOffset;
 *  - if that count exceeds c->ppMode.flatnessThreshold and the 8 samples
 *    span less than 2*QP, replace them with the same 9-tap low pass that
 *    doHorizLowPass_C() uses;
 *  - if the count does not exceed the threshold, apply the same correction
 *    as doHorizDefFilter_C() to the two samples next to the edge.
 *
 * @param src    start of the area; the 8 filtered samples begin 4 steps later
 * @param step   distance in bytes between samples along the filter direction
 * @param stride distance in bytes between two consecutive lines
 * @param c      context supplying QP, nonBQP and the ppMode thresholds
 */
static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
    int y;
    const int QP= c->QP;
    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    const int dcThreshold= dcOffset*2 + 1;
//START_TIMER
    src+= step*4; // src points to begin of the 8x8 Block
    for(y=0; y<8; y++){
        int numEq= 0;

        // each unsigned compare is true when -dcOffset <= difference <= dcOffset
        if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
        if(numEq > c->ppMode.flatnessThreshold){
            // flat line: find min and max of the 8 samples, two at a time
            int min, max, x;

            if(src[0] > src[step]){
                max= src[0];
                min= src[step];
            }else{
                max= src[step];
                min= src[0];
            }
            for(x=2; x<8; x+=2){
                if(src[x*step] > src[(x+1)*step]){
                        if(src[x    *step] > max) max= src[ x   *step];
                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
                }else{
                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
                        if(src[ x   *step] < min) min= src[ x   *step];
                }
            }
            if(max-min < 2*QP){
                // outer neighbours are used as padding only if they are
                // within QP of the adjacent sample
                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];

                // sums[k] is a running 7-sample window sum (+4 for rounding)
                int sums[10];
                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
                sums[1] = sums[0] - first       + src[3*step];
                sums[2] = sums[1] - first       + src[4*step];
                sums[3] = sums[2] - first       + src[5*step];
                sums[4] = sums[3] - first       + src[6*step];
                sums[5] = sums[4] - src[0*step] + src[7*step];
                sums[6] = sums[5] - src[1*step] + last;
                sums[7] = sums[6] - src[2*step] + last;
                sums[8] = sums[7] - src[3*step] + last;
                sums[9] = sums[8] - src[4*step] + last;

                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
            }
        }else{
            // non-flat line: only the two samples next to the edge may change
            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);

            if(FFABS(middleEnergy) < 8*QP){
                const int q=(src[3*step] - src[4*step])/2;
                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);

                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);

                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);

                // clamp d to the range between 0 and q
                if(q>0){
                    d= d<0 ? 0 : d;
                    d= d>q ? q : d;
                }else{
                    d= d>0 ? 0 : d;
                    d= d<q ? q : d;
                }

                src[3*step]-= d;
                src[4*step]+= d;
            }
        }

        src += stride;
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}*/
}
534
// Note: there are C, MMX, MMX2 and 3DNow versions (plus AltiVec);
// there is no combined 3DNow+MMX2 version.
//
// Step 1: decide which variants of postprocess_template.c to compile.
// With CONFIG_RUNTIME_CPUDETECT every variant for the architecture is built;
// otherwise only the one selected by the HAVE_* configuration.

//Plain C versions
#if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_C
#endif

#if HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC

#if ARCH_X86

#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_MMX
#endif

#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_MMX2
#endif

#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_3DNOW
#endif
#endif /* ARCH_X86 */

// Step 2: force all HAVE_* feature macros to 0, then re-enable exactly the
// ones each variant needs right before including the template. RENAME()
// appends the variant suffix to the names the template uses it on.
#undef HAVE_MMX
#define HAVE_MMX 0
#undef HAVE_MMX2
#define HAVE_MMX2 0
#undef HAVE_AMD3DNOW
#define HAVE_AMD3DNOW 0
#undef HAVE_ALTIVEC
#define HAVE_ALTIVEC 0

#ifdef COMPILE_C
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

#ifdef COMPILE_ALTIVEC
#undef RENAME
#undef HAVE_ALTIVEC
#define HAVE_ALTIVEC 1
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#undef HAVE_MMX
#define HAVE_MMX 1
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#define HAVE_MMX 1
#define HAVE_MMX2 1
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_AMD3DNOW
#define HAVE_MMX 1
#define HAVE_MMX2 0
#define HAVE_AMD3DNOW 1
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// Note: from here on the HAVE_xyz macros no longer reflect the build
// configuration (they hold whatever the last instantiation above set),
// so do not use them to test for configured CPU features.
617
/**
 * Copy the requested mode into the context and forward the call to one of
 * the postProcess_<variant>() functions.
 *
 * With CONFIG_RUNTIME_CPUDETECT the variant is chosen at run time from
 * c->cpuCaps (on x86: MMX2, then 3DNow, then MMX, then C); otherwise it is
 * fixed at compile time.
 * NOTE(review): the HAVE_* tests in this function are evaluated after those
 * macros were redefined for the template instantiations, so they see the
 * values left by the last instantiated variant rather than the configured
 * ones -- confirm this is intended.
 *
 * @param vm mode to use; cast to PPMode and copied into the context
 * @param vc context; cast to PPContext
 * All other arguments are passed on unchanged.
 */
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // Using ifs here as they are faster than function pointers, although the
    // difference would not be measurable here. It is still much better,
    // because someone might exchange the CPU without restarting MPlayer ;)
#if CONFIG_RUNTIME_CPUDETECT
#if ARCH_X86
    // ordered per speed fastest first
    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
#if HAVE_ALTIVEC
    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
    else
#endif
            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#else /* CONFIG_RUNTIME_CPUDETECT */
#if   HAVE_MMX2
            postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_AMD3DNOW
            postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_MMX
            postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#elif HAVE_ALTIVEC
            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#else
            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
#endif
#endif /* !CONFIG_RUNTIME_CPUDETECT */
}
661
662//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
663//        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
664
/**
 * Help text for the -pp command line option.
 *
 * The long names listed here must match the names the mode-string parser
 * accepts (the longName entries of filters[] and the alias names of
 * replaceTable[]); matching is case-sensitive.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
710
711pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
712{
713    char temp[GET_MODE_BUFFER_SIZE];
714    char *p= temp;
715    static const char filterDelimiters[] = ",/";
716    static const char optionDelimiters[] = ":";
717    struct PPMode *ppMode;
718    char *filterToken;
719
720    ppMode= av_malloc(sizeof(PPMode));
721
722    ppMode->lumMode= 0;
723    ppMode->chromMode= 0;
724    ppMode->maxTmpNoise[0]= 700;
725    ppMode->maxTmpNoise[1]= 1500;
726    ppMode->maxTmpNoise[2]= 3000;
727    ppMode->maxAllowedY= 234;
728    ppMode->minAllowedY= 16;
729    ppMode->baseDcDiff= 256/8;
730    ppMode->flatnessThreshold= 56-16-1;
731    ppMode->maxClippedThreshold= 0.01;
732    ppMode->error=0;
733
734    memset(temp, 0, GET_MODE_BUFFER_SIZE);
735    av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
736
737    av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
738
739    for(;;){
740        char *filterName;
741        int q= 1000000; //PP_QUALITY_MAX;
742        int chrom=-1;
743        int luma=-1;
744        char *option;
745        char *options[OPTIONS_ARRAY_SIZE];
746        int i;
747        int filterNameOk=0;
748        int numOfUnknownOptions=0;
749        int enable=1; //does the user want us to enabled or disabled the filter
750
751        filterToken= strtok(p, filterDelimiters);
752        if(filterToken == NULL) break;
753        p+= strlen(filterToken) + 1; // p points to next filterToken
754        filterName= strtok(filterToken, optionDelimiters);
755        av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
756
757        if(*filterName == '-'){
758            enable=0;
759            filterName++;
760        }
761
762        for(;;){ //for all options
763            option= strtok(NULL, optionDelimiters);
764            if(option == NULL) break;
765
766            av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
767            if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
768            else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
769            else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
770            else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
771            else{
772                options[numOfUnknownOptions] = option;
773                numOfUnknownOptions++;
774            }
775            if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
776        }
777        options[numOfUnknownOptions] = NULL;
778
779        /* replace stuff from the replace Table */
780        for(i=0; replaceTable[2*i]!=NULL; i++){
781            if(!strcmp(replaceTable[2*i], filterName)){
782                int newlen= strlen(replaceTable[2*i + 1]);
783                int plen;
784                int spaceLeft;
785
786                if(p==NULL) p= temp, *p=0;      //last filter
787                else p--, *p=',';               //not last filter
788
789                plen= strlen(p);
790                spaceLeft= p - temp + plen;
791                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
792                    ppMode->error++;
793                    break;
794                }
795                memmove(p + newlen, p, plen+1);
796                memcpy(p, replaceTable[2*i + 1], newlen);
797                filterNameOk=1;
798            }
799        }
800
801        for(i=0; filters[i].shortName!=NULL; i++){
802            if(   !strcmp(filters[i].longName, filterName)
803               || !strcmp(filters[i].shortName, filterName)){
804                ppMode->lumMode &= ~filters[i].mask;
805                ppMode->chromMode &= ~filters[i].mask;
806
807                filterNameOk=1;
808                if(!enable) break; // user wants to disable it
809
810                if(q >= filters[i].minLumQuality && luma)
811                    ppMode->lumMode|= filters[i].mask;
812                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
813                    if(q >= filters[i].minChromQuality)
814                            ppMode->chromMode|= filters[i].mask;
815
816                if(filters[i].mask == LEVEL_FIX){
817                    int o;
818                    ppMode->minAllowedY= 16;
819                    ppMode->maxAllowedY= 234;
820                    for(o=0; options[o]!=NULL; o++){
821                        if(  !strcmp(options[o],"fullyrange")
822                           ||!strcmp(options[o],"f")){
823                            ppMode->minAllowedY= 0;
824                            ppMode->maxAllowedY= 255;
825                            numOfUnknownOptions--;
826                        }
827                    }
828                }
829                else if(filters[i].mask == TEMP_NOISE_FILTER)
830                {
831                    int o;
832                    int numOfNoises=0;
833
834                    for(o=0; options[o]!=NULL; o++){
835                        char *tail;
836                        ppMode->maxTmpNoise[numOfNoises]=
837                            strtol(options[o], &tail, 0);
838                        if(tail!=options[o]){
839                            numOfNoises++;
840                            numOfUnknownOptions--;
841                            if(numOfNoises >= 3) break;
842                        }
843                    }
844                }
845                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
846                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
847                    int o;
848
849                    for(o=0; options[o]!=NULL && o<2; o++){
850                        char *tail;
851                        int val= strtol(options[o], &tail, 0);
852                        if(tail==options[o]) break;
853
854                        numOfUnknownOptions--;
855                        if(o==0) ppMode->baseDcDiff= val;
856                        else ppMode->flatnessThreshold= val;
857                    }
858                }
859                else if(filters[i].mask == FORCE_QUANT){
860                    int o;
861                    ppMode->forcedQuant= 15;
862
863                    for(o=0; options[o]!=NULL && o<1; o++){
864                        char *tail;
865                        int val= strtol(options[o], &tail, 0);
866                        if(tail==options[o]) break;
867
868                        numOfUnknownOptions--;
869                        ppMode->forcedQuant= val;
870                    }
871                }
872            }
873        }
874        if(!filterNameOk) ppMode->error++;
875        ppMode->error += numOfUnknownOptions;
876    }
877
878    av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
879    if(ppMode->error){
880        av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
881        av_free(ppMode);
882        return NULL;
883    }
884    return ppMode;
885}
886
887void pp_free_mode(pp_mode *mode){
888    av_free(mode);
889}
890
/**
 * Replace the buffer stored in *p with a fresh zero-filled one.
 *
 * The previous buffer is freed first and its contents are not preserved.
 *
 * @param p         address of the pointer to replace
 * @param alignment not used by this implementation
 * @param size      number of bytes requested from av_mallocz()
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    (void)alignment;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
895
896static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
897    int mbWidth = (width+15)>>4;
898    int mbHeight= (height+15)>>4;
899    int i;
900
901    c->stride= stride;
902    c->qpStride= qpStride;
903
904    reallocAlign((void **)&c->tempDst, 8, stride*24);
905    reallocAlign((void **)&c->tempSrc, 8, stride*24);
906    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
907    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
908    for(i=0; i<256; i++)
909            c->yHistogram[i]= width*height/64*15/256;
910
911    for(i=0; i<3; i++){
912        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
913        reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
914        reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
915    }
916
917    reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
918    reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
919    reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
920    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
921}
922
/**
 * Name callback referenced by av_codec_context_class.
 *
 * @param ptr ignored
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    (void)ptr;
    return "postproc";
}
926
927static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
928
929pp_context *pp_get_context(int width, int height, int cpuCaps){
930    PPContext *c= av_malloc(sizeof(PPContext));
931    int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
932    int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
933
934    memset(c, 0, sizeof(PPContext));
935    c->av_class = &av_codec_context_class;
936    c->cpuCaps= cpuCaps;
937    if(cpuCaps&PP_FORMAT){
938        c->hChromaSubSample= cpuCaps&0x3;
939        c->vChromaSubSample= (cpuCaps>>4)&0x3;
940    }else{
941        c->hChromaSubSample= 1;
942        c->vChromaSubSample= 1;
943    }
944
945    reallocBuffers(c, width, height, stride, qpStride);
946
947    c->frameNum=-1;
948
949    return c;
950}
951
952void pp_free_context(void *vc){
953    PPContext *c = (PPContext*)vc;
954    int i;
955
956    for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
957    for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
958
959    av_free(c->tempBlocks);
960    av_free(c->yHistogram);
961    av_free(c->tempDst);
962    av_free(c->tempSrc);
963    av_free(c->deintTemp);
964    av_free(c->stdQPTable);
965    av_free(c->nonBQPTable);
966    av_free(c->forcedQPTable);
967
968    memset(c, 0, sizeof(PPContext));
969
970    av_free(c);
971}
972
973void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
974                     uint8_t * dst[3], const int dstStride[3],
975                     int width, int height,
976                     const QP_STORE_T *QP_store,  int QPStride,
977                     pp_mode *vm,  void *vc, int pict_type)
978{
979    int mbWidth = (width+15)>>4;
980    int mbHeight= (height+15)>>4;
981    PPMode *mode = (PPMode*)vm;
982    PPContext *c = (PPContext*)vc;
983    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
984    int absQPStride = FFABS(QPStride);
985
986    // c->stride and c->QPStride are always positive
987    if(c->stride < minStride || c->qpStride < absQPStride)
988        reallocBuffers(c, width, height,
989                       FFMAX(minStride, c->stride),
990                       FFMAX(c->qpStride, absQPStride));
991
992    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
993        int i;
994        QP_store= c->forcedQPTable;
995        absQPStride = QPStride = 0;
996        if(mode->lumMode & FORCE_QUANT)
997            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
998        else
999            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1000    }
1001
1002    if(pict_type & PP_PICT_TYPE_QP2){
1003        int i;
1004        const int count= mbHeight * absQPStride;
1005        for(i=0; i<(count>>2); i++){
1006            ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1007        }
1008        for(i<<=2; i<count; i++){
1009            c->stdQPTable[i] = QP_store[i]>>1;
1010        }
1011        QP_store= c->stdQPTable;
1012        QPStride= absQPStride;
1013    }
1014
1015    if(0){
1016        int x,y;
1017        for(y=0; y<mbHeight; y++){
1018            for(x=0; x<mbWidth; x++){
1019                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1020            }
1021            av_log(c, AV_LOG_INFO, "\n");
1022        }
1023        av_log(c, AV_LOG_INFO, "\n");
1024    }
1025
1026    if((pict_type&7)!=3){
1027        if (QPStride >= 0){
1028            int i;
1029            const int count= mbHeight * QPStride;
1030            for(i=0; i<(count>>2); i++){
1031                ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1032            }
1033            for(i<<=2; i<count; i++){
1034                c->nonBQPTable[i] = QP_store[i] & 0x3F;
1035            }
1036        } else {
1037            int i,j;
1038            for(i=0; i<mbHeight; i++) {
1039                for(j=0; j<absQPStride; j++) {
1040                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1041                }
1042            }
1043        }
1044    }
1045
1046    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1047           mode->lumMode, mode->chromMode);
1048
1049    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1050                width, height, QP_store, QPStride, 0, mode, c);
1051
1052    width  = (width )>>c->hChromaSubSample;
1053    height = (height)>>c->vChromaSubSample;
1054
1055    if(mode->chromMode){
1056        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1057                    width, height, QP_store, QPStride, 1, mode, c);
1058        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1059                    width, height, QP_store, QPStride, 2, mode, c);
1060    }
1061    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1062        linecpy(dst[1], src[1], height, srcStride[1]);
1063        linecpy(dst[2], src[2], height, srcStride[2]);
1064    }else{
1065        int y;
1066        for(y=0; y<height; y++){
1067            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1068            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1069        }
1070    }
1071}
1072