1/*
2 * Simple IDCT (Alpha optimized)
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * based upon some outcommented C code from mpeg2dec (idct_mmx.c
7 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
8 *
 * Alpha optimizations by Måns Rullgård <mans@mansr.com>
10 *                     and Falk Hueffner <falk@debian.org>
11 *
12 * This file is part of FFmpeg.
13 *
14 * FFmpeg is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
18 *
19 * FFmpeg is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 * Lesser General Public License for more details.
23 *
24 * You should have received a copy of the GNU Lesser General Public
25 * License along with FFmpeg; if not, write to the Free Software
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 */
28
29#include "libavcodec/dsputil.h"
30#include "asm.h"
31
/* Function pointers defined outside this file.  This file calls them from
   ff_simple_idct_put_axp() and ff_simple_idct_add_axp() respectively, after
   the in-place IDCT has been run on the block. */
extern void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                        int line_size);
extern void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
                                        int line_size);
36
// Fixed-point IDCT coefficients, Wi for i = 1..7:
// cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
// W4 is actually exactly 16384, but using 16383 works around
// accumulating rounding errors for some encoders
#define W1 ((int_fast32_t) 22725)
#define W2 ((int_fast32_t) 21407)
#define W3 ((int_fast32_t) 19266)
#define W4 ((int_fast32_t) 16383)
#define W5 ((int_fast32_t) 12873)
#define W6 ((int_fast32_t)  8867)
#define W7 ((int_fast32_t)  4520)
// Right shifts applied to the accumulated sums at the end of the row pass
// (ROW_SHIFT) and of the column pass (COL_SHIFT).
#define ROW_SHIFT 11
#define COL_SHIFT 20
49
50/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
51static inline int idct_row(DCTELEM *row)
52{
53    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t;
54    uint64_t l, r, t2;
55    l = ldq(row);
56    r = ldq(row + 4);
57
58    if (l == 0 && r == 0)
59        return 0;
60
61    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));
62
63    if (((l & ~0xffffUL) | r) == 0) {
64        a0 >>= ROW_SHIFT;
65        t2 = (uint16_t) a0;
66        t2 |= t2 << 16;
67        t2 |= t2 << 32;
68
69        stq(t2, row);
70        stq(t2, row + 4);
71        return 1;
72    }
73
74    a1 = a0;
75    a2 = a0;
76    a3 = a0;
77
78    t = extwl(l, 4);            /* row[2] */
79    if (t != 0) {
80        t = sextw(t);
81        a0 += W2 * t;
82        a1 += W6 * t;
83        a2 -= W6 * t;
84        a3 -= W2 * t;
85    }
86
87    t = extwl(r, 0);            /* row[4] */
88    if (t != 0) {
89        t = sextw(t);
90        a0 += W4 * t;
91        a1 -= W4 * t;
92        a2 -= W4 * t;
93        a3 += W4 * t;
94    }
95
96    t = extwl(r, 4);            /* row[6] */
97    if (t != 0) {
98        t = sextw(t);
99        a0 += W6 * t;
100        a1 -= W2 * t;
101        a2 += W2 * t;
102        a3 -= W6 * t;
103    }
104
105    t = extwl(l, 2);            /* row[1] */
106    if (t != 0) {
107        t = sextw(t);
108        b0 = W1 * t;
109        b1 = W3 * t;
110        b2 = W5 * t;
111        b3 = W7 * t;
112    } else {
113        b0 = 0;
114        b1 = 0;
115        b2 = 0;
116        b3 = 0;
117    }
118
119    t = extwl(l, 6);            /* row[3] */
120    if (t) {
121        t = sextw(t);
122        b0 += W3 * t;
123        b1 -= W7 * t;
124        b2 -= W1 * t;
125        b3 -= W5 * t;
126    }
127
128
129    t = extwl(r, 2);            /* row[5] */
130    if (t) {
131        t = sextw(t);
132        b0 += W5 * t;
133        b1 -= W1 * t;
134        b2 += W7 * t;
135        b3 += W3 * t;
136    }
137
138    t = extwl(r, 6);            /* row[7] */
139    if (t) {
140        t = sextw(t);
141        b0 += W7 * t;
142        b1 -= W5 * t;
143        b2 += W3 * t;
144        b3 -= W1 * t;
145    }
146
147    row[0] = (a0 + b0) >> ROW_SHIFT;
148    row[1] = (a1 + b1) >> ROW_SHIFT;
149    row[2] = (a2 + b2) >> ROW_SHIFT;
150    row[3] = (a3 + b3) >> ROW_SHIFT;
151    row[4] = (a3 - b3) >> ROW_SHIFT;
152    row[5] = (a2 - b2) >> ROW_SHIFT;
153    row[6] = (a1 - b1) >> ROW_SHIFT;
154    row[7] = (a0 - b0) >> ROW_SHIFT;
155
156    return 2;
157}
158
159static inline void idct_col(DCTELEM *col)
160{
161    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
162
163    col[0] += (1 << (COL_SHIFT - 1)) / W4;
164
165    a0 = W4 * col[8 * 0];
166    a1 = W4 * col[8 * 0];
167    a2 = W4 * col[8 * 0];
168    a3 = W4 * col[8 * 0];
169
170    if (col[8 * 2]) {
171        a0 += W2 * col[8 * 2];
172        a1 += W6 * col[8 * 2];
173        a2 -= W6 * col[8 * 2];
174        a3 -= W2 * col[8 * 2];
175    }
176
177    if (col[8 * 4]) {
178        a0 += W4 * col[8 * 4];
179        a1 -= W4 * col[8 * 4];
180        a2 -= W4 * col[8 * 4];
181        a3 += W4 * col[8 * 4];
182    }
183
184    if (col[8 * 6]) {
185        a0 += W6 * col[8 * 6];
186        a1 -= W2 * col[8 * 6];
187        a2 += W2 * col[8 * 6];
188        a3 -= W6 * col[8 * 6];
189    }
190
191    if (col[8 * 1]) {
192        b0 = W1 * col[8 * 1];
193        b1 = W3 * col[8 * 1];
194        b2 = W5 * col[8 * 1];
195        b3 = W7 * col[8 * 1];
196    } else {
197        b0 = 0;
198        b1 = 0;
199        b2 = 0;
200        b3 = 0;
201    }
202
203    if (col[8 * 3]) {
204        b0 += W3 * col[8 * 3];
205        b1 -= W7 * col[8 * 3];
206        b2 -= W1 * col[8 * 3];
207        b3 -= W5 * col[8 * 3];
208    }
209
210    if (col[8 * 5]) {
211        b0 += W5 * col[8 * 5];
212        b1 -= W1 * col[8 * 5];
213        b2 += W7 * col[8 * 5];
214        b3 += W3 * col[8 * 5];
215    }
216
217    if (col[8 * 7]) {
218        b0 += W7 * col[8 * 7];
219        b1 -= W5 * col[8 * 7];
220        b2 += W3 * col[8 * 7];
221        b3 -= W1 * col[8 * 7];
222    }
223
224    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
225    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
226    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
227    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
228    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
229    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
230    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
231    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
232}
233
234/* If all rows but the first one are zero after row transformation,
235   all rows will be identical after column transformation.  */
236static inline void idct_col2(DCTELEM *col)
237{
238    int i;
239    uint64_t l, r;
240
241    for (i = 0; i < 8; ++i) {
242        int_fast32_t a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
243
244        a0 *= W4;
245        col[i] = a0 >> COL_SHIFT;
246    }
247
248    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
249    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
250    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
251    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
252    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
253    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
254    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
255    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
256}
257
258void ff_simple_idct_axp(DCTELEM *block)
259{
260
261    int i;
262    int rowsZero = 1;           /* all rows except row 0 zero */
263    int rowsConstant = 1;       /* all rows consist of a constant value */
264
265    for (i = 0; i < 8; i++) {
266        int sparseness = idct_row(block + 8 * i);
267
268        if (i > 0 && sparseness > 0)
269            rowsZero = 0;
270        if (sparseness == 2)
271            rowsConstant = 0;
272    }
273
274    if (rowsZero) {
275        idct_col2(block);
276    } else if (rowsConstant) {
277        idct_col(block);
278        for (i = 0; i < 8; i += 2) {
279            uint64_t v = (uint16_t) block[0];
280            uint64_t w = (uint16_t) block[8];
281
282            v |= v << 16;
283            w |= w << 16;
284            v |= v << 32;
285            w |= w << 32;
286            stq(v, block + 0 * 4);
287            stq(v, block + 1 * 4);
288            stq(w, block + 2 * 4);
289            stq(w, block + 3 * 4);
290            block += 4 * 4;
291        }
292    } else {
293        for (i = 0; i < 8; i++)
294            idct_col(block + i);
295    }
296}
297
298void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block)
299{
300    ff_simple_idct_axp(block);
301    put_pixels_clamped_axp_p(block, dest, line_size);
302}
303
304void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block)
305{
306    ff_simple_idct_axp(block);
307    add_pixels_clamped_axp_p(block, dest, line_size);
308}
309