1/*
2 * Copyright (c) 2002 Brian Foley
3 * Copyright (c) 2002 Dieter Shirley
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavcodec/dsputil.h"
24
25#include "dsputil_ppc.h"
26
27#include "dsputil_altivec.h"
28
29void fdct_altivec(int16_t *block);
30void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
31                  int x16, int y16, int rounder);
32void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
33void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
34
35void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
36
37void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
38void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
39void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
40void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
41void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
42
43int mm_flags = 0;
44
/**
 * Report the SIMD capabilities detected at run time.
 *
 * @return FF_MM_ALTIVEC when the build has AltiVec support enabled and
 *         has_altivec() reports it as available, 0 otherwise.
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return FF_MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
55
#if CONFIG_POWERPC_PERF
/* Counter statistics, indexed as [PMC][function][statistic]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
/* Plain (const) char pointers: the entries are string literals and are
 * printed with "%s", so "unsigned char *" was a pointer-signedness mismatch. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif
90
#if CONFIG_POWERPC_PERF
/**
 * Log the collected performance-counter statistics.
 *
 * For every instrumented function and every enabled PMC that recorded at
 * least one sample, prints the minimum, maximum, average and sample count
 * through av_log() at AV_LOG_INFO level. Entries with no samples are skipped,
 * which also keeps the average from dividing by zero.
 */
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            const unsigned long long *stats = perfdata[j][i];

            if (stats[powerpc_data_num] == 0)
                continue;

            /* The counters are unsigned long long, but the conversions are
             * PRIu64; cast so the argument types match on every ABI. */
            av_log(NULL, AV_LOG_INFO,
                   " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
                   perfname[i],
                   j+1,
                   (uint64_t)stats[powerpc_data_min],
                   (uint64_t)stats[powerpc_data_max],
                   (double)stats[powerpc_data_sum] /
                   (double)stats[powerpc_data_num],
                   (uint64_t)stats[powerpc_data_num]);
        }
    }
}
#endif /* CONFIG_POWERPC_PERF */
112
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately, all processors used by Apple up to at least the 7450 (aka second
generation G4) use 32-byte cache lines.
This is due to the use of the 'dcbz' instruction. It simply clears a single
cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...

Update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
size: 128 bytes. Oops.
The semantics of dcbz were changed: it now always clears 32 bytes. So the
function below will work, but will be slow. So I fixed check_dcbz_effect (now
check_dcbzl_effect) to use dcbzl, which is defined to clear a whole cache line
(as dcbz did before). So we can still distinguish, and use dcbz (32 bytes) or
dcbzl (one cache line) as required.

see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
133void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
134{
135POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
136    register int misal = ((unsigned long)blocks & 0x00000010);
137    register int i = 0;
138POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
139#if 1
140    if (misal) {
141        ((unsigned long*)blocks)[0] = 0L;
142        ((unsigned long*)blocks)[1] = 0L;
143        ((unsigned long*)blocks)[2] = 0L;
144        ((unsigned long*)blocks)[3] = 0L;
145        i += 16;
146    }
147    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
148        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
149    }
150    if (misal) {
151        ((unsigned long*)blocks)[188] = 0L;
152        ((unsigned long*)blocks)[189] = 0L;
153        ((unsigned long*)blocks)[190] = 0L;
154        ((unsigned long*)blocks)[191] = 0L;
155        i += 16;
156    }
157#else
158    memset(blocks, 0, sizeof(DCTELEM)*6*64);
159#endif
160POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
161}
162
163/* same as above, when dcbzl clear a whole 128B cache line
164   i.e. the PPC970 aka G5 */
165#if HAVE_DCBZL
166void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
167{
168POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
169    register int misal = ((unsigned long)blocks & 0x0000007f);
170    register int i = 0;
171POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
172#if 1
173    if (misal) {
174        // we could probably also optimize this case,
175        // but there's not much point as the machines
176        // aren't available yet (2003-06-26)
177        memset(blocks, 0, sizeof(DCTELEM)*6*64);
178    }
179    else
180        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
181            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
182        }
183#else
184    memset(blocks, 0, sizeof(DCTELEM)*6*64);
185#endif
186POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
187}
188#else
189void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
190{
191    memset(blocks, 0, sizeof(DCTELEM)*6*64);
192}
193#endif
194
/**
 * Measure how many bytes a single 'dcbzl' instruction sets to zero.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, dcbzl is issued on its
 * middle, and the zero bytes are then counted.
 *
 * update 24/06/2003: dcbz was replaced by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return the number of bytes zeroed, or 0 when dcbzl support is not
 *         compiled in or the scratch buffer could not be allocated.
 */
long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
       The "memory" clobber tells the compiler that the instruction writes
       to the buffer, so the counting loop below must re-read it instead of
       assuming it still holds the 0xFF fill. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);
#endif /* HAVE_DCBZL */
    return count;
}
236
/**
 * Issue a 'dcbt' (data cache block touch) for the start of each of h rows.
 *
 * The first row is always touched before the counter is tested, exactly like
 * a do/while loop.
 *
 * @param mem    address of the first row
 * @param stride byte distance between consecutive rows
 * @param h      number of rows to touch
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *row = mem;

    for (;;) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (row));
        row += stride;
        if (!--h)
            break;
    }
}
245
246void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
247{
248    // Common optimizations whether AltiVec is available or not
249    c->prefetch = prefetch_ppc;
250    switch (check_dcbzl_effect()) {
251        case 32:
252            c->clear_blocks = clear_blocks_dcbz32_ppc;
253            break;
254        case 128:
255            c->clear_blocks = clear_blocks_dcbz128_ppc;
256            break;
257        default:
258            break;
259    }
260
261#if HAVE_ALTIVEC
262    if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
263
264    if (has_altivec()) {
265        mm_flags |= FF_MM_ALTIVEC;
266
267        dsputil_init_altivec(c, avctx);
268        if(CONFIG_SNOW_DECODER) snow_init_altivec(c, avctx);
269        if(CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER)
270            vc1dsp_init_altivec(c, avctx);
271        float_init_altivec(c, avctx);
272        int_init_altivec(c, avctx);
273        c->gmc1 = gmc1_altivec;
274
275#if CONFIG_ENCODERS
276        if (avctx->dct_algo == FF_DCT_AUTO ||
277            avctx->dct_algo == FF_DCT_ALTIVEC) {
278            c->fdct = fdct_altivec;
279        }
280#endif //CONFIG_ENCODERS
281
282        if (avctx->lowres==0) {
283            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
284                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
285                c->idct_put = idct_put_altivec;
286                c->idct_add = idct_add_altivec;
287                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
288            }
289        }
290
291#if CONFIG_POWERPC_PERF
292        {
293            int i, j;
294            for (i = 0 ; i < powerpc_perf_total ; i++) {
295                for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
296                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
297                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
298                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
299                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
300                }
301            }
302        }
303#endif /* CONFIG_POWERPC_PERF */
304    }
305#endif /* HAVE_ALTIVEC */
306}
307