/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_alpha.h"
#include "asm.h"

void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes.  */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-).  */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw).  */
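        /* Clearing the per-lane sign bits before the plain 64-bit addition
           keeps any carry inside its 16-bit lane; XOR-ing the saved sign
           bits back in afterwards restores the wrap-around result of a
           packed signed 16-bit add.  */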
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4.  */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
#endif

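/* Zero the six 64-coefficient DCT blocks of a macroblock with eight
   64-bit stores per loop iteration.  */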
static void clear_blocks_axp(DCTELEM *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(DCTELEM) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}

static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

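/* The packed averages above rely on the byte-lane identity
       x + y == ((x & y) << 1) + (x ^ y) == ((x | y) << 1) - (x ^ y),
   so per byte avg2_no_rnd() computes (x + y) >> 1 and avg2() computes
   (x + y + 1) >> 1; masking the XOR term with 0xfe before the shift keeps
   each byte lane independent.  A scalar sketch of the per-byte reference
   behaviour (illustration only, not part of the optimized path):  */
#if 0
static uint8_t avg2_ref(uint8_t a, uint8_t b)        /* rounding */
{
    return (a + b + 1) >> 1;
}

static uint8_t avg2_no_rnd_ref(uint8_t a, uint8_t b) /* truncating */
{
    return (a + b) >> 1;
}
#endif
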
#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
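/* Per byte, the split above reproduces (l1 + l2 + l3 + l4 + 2) >> 2: the
   top six bits of each byte are pre-shifted so their four-way sum cannot
   carry into the neighbouring lane, while the four 2-bit remainders plus
   the rounder stay below 16.  Scalar sketch (illustration only):  */
#if 0
static uint8_t avg4_ref(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
{
    return (a + b + c + d + 2) >> 2;
}
#endif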

#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += line_size;                     \
    } while (--h)

#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += line_size;                                     \
    } while (--h)

#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1 = LOAD(pixels);                                       \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1 = LOAD(pixels);                                           \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h;                                           \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)
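
/* OP copies a row of eight pixels, OP_X2 averages horizontally adjacent
   pixels (half-pel x), OP_Y2 averages vertically adjacent rows (half-pel y)
   and OP_XY2 averages the 2x2 neighbourhood (half-pel x and y), reusing each
   row's partial sums in the next iteration.  A scalar sketch of the x2 case
   with rounding (illustration only, not part of the build):  */
#if 0
static void put_pixels_x2_ref(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    do {
        int i;
        for (i = 0; i < 8; i++)
            block[i] = (pixels[i] + pixels[i + 1] + 1) >> 1;
        pixels += line_size;
        block  += line_size;
    } while (--h);
}
#endif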

#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}

#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME, ,     OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
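
/* Each PIXOP(OPNAME, STORE) invocation below expands, via MAKE_OP, into
   eight static functions: the plain, _x2, _y2 and _xy2 variants of the
   8-pixel routine plus their 16-pixel-wide _pixels16 counterparts.  The
   8-pixel versions pick ldq or uldq at run time depending on whether the
   source is 8-byte aligned.  */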

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Not rounding primitives.  */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);

static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 int line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}

void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
{
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;

    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;

    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;

    c->clear_blocks = clear_blocks_axp;

    /* amask clears all bits that correspond to present features.  */
    if (amask(AMASK_MVI) == 0) {
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        c->get_pixels       = get_pixels_mvi;
        c->diff_pixels      = diff_pixels_mvi;
        c->sad[0]           = pix_abs16x16_mvi_asm;
        c->sad[1]           = pix_abs8x8_mvi;
        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
        c->pix_abs[1][0]    = pix_abs8x8_mvi;
        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
    }

    put_pixels_clamped_axp_p = c->put_pixels_clamped;
    add_pixels_clamped_axp_p = c->add_pixels_clamped;

    if (!avctx->lowres &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
        c->idct_put = ff_simple_idct_put_axp;
        c->idct_add = ff_simple_idct_add_axp;
        c->idct =     ff_simple_idct_axp;
    }
}