1/*
2 * Alpha optimized DSP utils
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavcodec/dsputil.h"
23#include "asm.h"
24
25void ff_simple_idct_axp(DCTELEM *block);
26void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
27void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
28
29void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
30                        int line_size, int h);
31void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
32                                int line_size);
33void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
34                                int line_size);
35void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
36                                 int line_size);
37void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
38                                 int line_size);
39
40void get_pixels_mvi(DCTELEM *restrict block,
41                    const uint8_t *restrict pixels, int line_size);
42void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
43                     int stride);
44int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
45int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
46int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
47int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
48int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h);
49
50#if 0
51/* These functions were the base for the optimized assembler routines,
52   and remain here for documentation purposes.  */
53static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
54                                   int line_size)
55{
56    int i = 8;
57    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
58
59    do {
60        uint64_t shorts0, shorts1;
61
62        shorts0 = ldq(block);
63        shorts0 = maxsw4(shorts0, 0);
64        shorts0 = minsw4(shorts0, clampmask);
65        stl(pkwb(shorts0), pixels);
66
67        shorts1 = ldq(block + 4);
68        shorts1 = maxsw4(shorts1, 0);
69        shorts1 = minsw4(shorts1, clampmask);
70        stl(pkwb(shorts1), pixels + 4);
71
72        pixels += line_size;
73        block += 8;
74    } while (--i);
75}
76
77void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
78                            int line_size)
79{
80    int h = 8;
81    /* Keep this function a leaf function by generating the constants
82       manually (mainly for the hack value ;-).  */
83    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
84    uint64_t signmask  = zap(-1, 0x33);
85    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
86
87    do {
88        uint64_t shorts0, pix0, signs0;
89        uint64_t shorts1, pix1, signs1;
90
91        shorts0 = ldq(block);
92        shorts1 = ldq(block + 4);
93
94        pix0    = unpkbw(ldl(pixels));
95        /* Signed subword add (MMX paddw).  */
96        signs0  = shorts0 & signmask;
97        shorts0 &= ~signmask;
98        shorts0 += pix0;
99        shorts0 ^= signs0;
100        /* Clamp. */
101        shorts0 = maxsw4(shorts0, 0);
102        shorts0 = minsw4(shorts0, clampmask);
103
104        /* Next 4.  */
105        pix1    = unpkbw(ldl(pixels + 4));
106        signs1  = shorts1 & signmask;
107        shorts1 &= ~signmask;
108        shorts1 += pix1;
109        shorts1 ^= signs1;
110        shorts1 = maxsw4(shorts1, 0);
111        shorts1 = minsw4(shorts1, clampmask);
112
113        stl(pkwb(shorts0), pixels);
114        stl(pkwb(shorts1), pixels + 4);
115
116        pixels += line_size;
117        block += 8;
118    } while (--h);
119}
120#endif
121
122static void clear_blocks_axp(DCTELEM *blocks) {
123    uint64_t *p = (uint64_t *) blocks;
124    int n = sizeof(DCTELEM) * 6 * 64;
125
126    do {
127        p[0] = 0;
128        p[1] = 0;
129        p[2] = 0;
130        p[3] = 0;
131        p[4] = 0;
132        p[5] = 0;
133        p[6] = 0;
134        p[7] = 0;
135        p += 8;
136        n -= 8 * 8;
137    } while (n);
138}
139
/* Byte-wise average of a and b with the result rounded down: the bits
   common to both plus half of the differing bits.  Bit 0 of every byte
   is masked off before the shift so that it cannot leak into the byte
   below.  */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    const uint64_t shared    = a & b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}
144
/* Byte-wise average of a and b with the result rounded up: the union of
   the bits minus half of the differing bits.  Bit 0 of every byte is
   masked off before the shift so that it cannot leak into the byte
   below.  */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    const uint64_t either    = a | b;
    const uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}
149
150#if 0
151/* The XY2 routines basically utilize this scheme, but reuse parts in
152   each iteration.  */
/* Byte-wise average of four words.  The upper six bits of every byte are
   pre-shifted and summed on their own; the lower two bits are summed
   together with the rounding constant 0x02 and only then shifted, so
   the carry out of the low bits is kept.  */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low2 = BYTE_VEC(0x03);
    uint64_t high_sum, low_sum;

    high_sum = ((l1 & ~low2) >> 2) + ((l2 & ~low2) >> 2)
             + ((l3 & ~low2) >> 2) + ((l4 & ~low2) >> 2);
    low_sum  = (l1 & low2) + (l2 & low2)
             + (l3 & low2) + (l4 & low2)
             + BYTE_VEC(0x02);

    return high_sum + ((low_sum >> 2) & low2);
}
166#endif
167
/* Row loop without interpolation: fetch one word from pixels with LOAD
   and pass it to STORE together with block, then advance both pointers
   by line_size.  Uses pixels, block, line_size and h from the expanding
   function; h is tested after the first row, so it must be at least 1.  */
#define OP(LOAD, STORE)                         \
    do {                                        \
        uint64_t word = LOAD(pixels);           \
                                                \
        STORE(word, block);                     \
        block  += line_size;                    \
        pixels += line_size;                    \
    } while (--h)
174
/* Row loop with horizontal interpolation: each loaded word is averaged
   (AVG2) with a copy of itself shifted down by one byte, whose top byte
   is filled from pixels[8].  Nine source bytes are therefore read per
   row.  The shift direction assumes the lowest-addressed pixel sits in
   the least significant byte.  Uses pixels, block, line_size and h from
   the expanding function; h must be at least 1.  */
#define OP_X2(LOAD, STORE)                                              \
    do {                                                                \
        uint64_t here  = LOAD(pixels);                                  \
        uint64_t right = ((uint64_t) pixels[8] << 56) | (here >> 8);    \
                                                                        \
        STORE(AVG2(here, right), block);                                \
        block  += line_size;                                            \
        pixels += line_size;                                            \
    } while (--h)
185
/* Row loop with vertical interpolation: every output row is the AVG2 of
   a source row and the row line_size bytes further on.  Each source row
   is loaded once and carried over to the next iteration, so h + 1 rows
   are read in total.  Uses pixels, block, line_size and h from the
   expanding function; h must be at least 1.  */
#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t upper = LOAD(pixels);          \
                                                \
        do {                                    \
            uint64_t lower;                     \
                                                \
            pixels += line_size;                \
            lower = LOAD(pixels);               \
            STORE(AVG2(upper, lower), block);   \
            upper = lower;                      \
            block += line_size;                 \
        } while (--h);                          \
    } while (0)
199
/* Row loop with horizontal and vertical interpolation (average of four
   neighbouring pixels).  For every source row the word and its copy
   shifted down by one byte (top byte taken from pixels[8]) are summed in
   two parts: the low two bits of each byte, and the upper six bits
   pre-shifted right by two.  An output row combines the partial sums of
   two consecutive source rows; AVG4_ROUNDER is added to the low part
   before it is shifted.  The partial sums of a row are reused for the
   next output row, so h + 1 rows of nine bytes are read.  Uses pixels,
   block, line_size and h from the expanding function; h must be at
   least 1.  */
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t top_a  = LOAD(pixels);                                     \
        uint64_t top_b  = ((uint64_t) pixels[8] << 56) | (top_a >> 8);      \
        uint64_t prev_l = (top_a & BYTE_VEC(0x03))                          \
                        + (top_b & BYTE_VEC(0x03));                         \
        uint64_t prev_h = ((top_a & ~BYTE_VEC(0x03)) >> 2)                  \
                        + ((top_b & ~BYTE_VEC(0x03)) >> 2);                 \
                                                                            \
        do {                                                                \
            uint64_t row_a, row_b;                                          \
            uint64_t row_l, row_h;                                          \
            uint64_t out;                                                   \
                                                                            \
            pixels += line_size;                                            \
            row_a = LOAD(pixels);                                           \
            row_b = ((uint64_t) pixels[8] << 56) | (row_a >> 8);            \
            row_l = (row_a & BYTE_VEC(0x03))                                \
                  + (row_b & BYTE_VEC(0x03));                               \
            row_h = ((row_a & ~BYTE_VEC(0x03)) >> 2)                        \
                  + ((row_b & ~BYTE_VEC(0x03)) >> 2);                       \
            out = (((prev_l + row_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + prev_h + row_h;                                           \
            STORE(out, block);                                              \
                                                                            \
            prev_l = row_l;                                                 \
            prev_h = row_h;                                                 \
            block += line_size;                                             \
        } while (--h);                                                      \
    } while (0)
230
/* Define two static functions for one operation / interpolation pair:

     OPNAME_pixelsSUFF_axp    expands OPKIND once, loading the source
                              with ldq when pixels is 8-byte aligned and
                              with uldq otherwise;
     OPNAME_pixels16SUFF_axp  calls the former for the two halves at
                              offsets 0 and 8.

   The expansion consists of function definitions only.  */
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    if (((size_t) pixels & 0x7) == 0) {                                     \
        OPKIND(ldq, STORE);                                                 \
    } else {                                                                \
        OPKIND(uldq, STORE);                                                \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         int line_size, int h)                                              \
{                                                                           \
    int half;                                                               \
                                                                            \
    for (half = 0; half < 16; half += 8) {                                  \
        OPNAME ## _pixels ## SUFF ## _axp(block + half, pixels + half,      \
                                          line_size, h);                    \
    }                                                                       \
}
250
251#define PIXOP(OPNAME, STORE)                    \
252    MAKE_OP(OPNAME, ,     OP,     STORE)        \
253    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
254    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
255    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
256
257/* Rounding primitives.  */
258#define AVG2 avg2
259#define AVG4 avg4
260#define AVG4_ROUNDER BYTE_VEC(0x02)
261#define STORE(l, b) stq(l, b)
262PIXOP(put, STORE);
263
264#undef STORE
265#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
266PIXOP(avg, STORE);
267
268/* Not rounding primitives.  */
269#undef AVG2
270#undef AVG4
271#undef AVG4_ROUNDER
272#undef STORE
273#define AVG2 avg2_no_rnd
274#define AVG4 avg4_no_rnd
275#define AVG4_ROUNDER BYTE_VEC(0x01)
276#define STORE(l, b) stq(l, b)
277PIXOP(put_no_rnd, STORE);
278
279#undef STORE
280#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
281PIXOP(avg_no_rnd, STORE);
282
/* 16-byte-wide copy built from put_pixels_axp_asm(): the left half at
   offset 0 is processed first, then the right half at offset 8, each
   over h rows with the same line_size.  */
void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h)
{
    int half;

    for (half = 0; half < 16; half += 8) {
        put_pixels_axp_asm(block + half, pixels + half, line_size, h);
    }
}
289
290void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
291{
292    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
293    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
294    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
295    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
296
297    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
298    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
299    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
300    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
301
302    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
303    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
304    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
305    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
306
307    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
308    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
309    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
310    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;
311
312    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
313    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
314    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
315    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
316
317    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
318    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
319    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
320    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
321
322    c->avg_pixels_tab[1][0] = avg_pixels_axp;
323    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
324    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
325    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
326
327    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
328    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
329    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
330    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;
331
332    c->clear_blocks = clear_blocks_axp;
333
334    /* amask clears all bits that correspond to present features.  */
335    if (amask(AMASK_MVI) == 0) {
336        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
337        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
338
339        c->get_pixels       = get_pixels_mvi;
340        c->diff_pixels      = diff_pixels_mvi;
341        c->sad[0]           = pix_abs16x16_mvi_asm;
342        c->sad[1]           = pix_abs8x8_mvi;
343        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
344        c->pix_abs[1][0]    = pix_abs8x8_mvi;
345        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
346        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
347        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
348    }
349
350    put_pixels_clamped_axp_p = c->put_pixels_clamped;
351    add_pixels_clamped_axp_p = c->add_pixels_clamped;
352
353    if (!avctx->lowres &&
354        (avctx->idct_algo == FF_IDCT_AUTO ||
355         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
356        c->idct_put = ff_simple_idct_put_axp;
357        c->idct_add = ff_simple_idct_add_axp;
358        c->idct =     ff_simple_idct_axp;
359    }
360}
361