1/*
2 * Simple IDCT (Alpha optimized)
3 *
4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 *
 * based upon some commented-out C code from mpeg2dec (idct_mmx.c
7 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
8 *
 * Alpha optimizations by Måns Rullgård <mans@mansr.com>
10 *                     and Falk Hueffner <falk@debian.org>
11 *
12 * This file is part of FFmpeg.
13 *
14 * FFmpeg is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2.1 of the License, or (at your option) any later version.
18 *
19 * FFmpeg is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 * Lesser General Public License for more details.
23 *
24 * You should have received a copy of the GNU Lesser General Public
25 * License along with FFmpeg; if not, write to the Free Software
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 */
28
29#include "idctdsp_alpha.h"
30#include "asm.h"
31
// Fixed-point transform coefficients:
// Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14), rounded to an integer.
// W4 is actually exactly 16384, but using 16383 works around
// accumulating rounding errors for some encoders
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6  8867
#define W7  4520
// Number of low bits shifted out of each result of the row pass (idct_row).
#define ROW_SHIFT 11
// Number of low bits shifted out of each result of the column pass
// (idct_col / idct_col2).
#define COL_SHIFT 20
44
/*
 * Row pass of the IDCT, performed in place on the eight coefficients
 * starting at row.
 *
 * The return value classifies the row as it was on entry, which lets the
 * caller choose a cheaper column pass:
 *   0: all entries 0 (the row is left untouched),
 *   1: only first entry nonzero (all eight outputs receive the same value),
 *   2: otherwise (full transform).
 */
static inline int idct_row(int16_t *row)
{
    int a0, a1, a2, a3, b0, b1, b2, b3, t;
    uint64_t l, r, t2;
    /* l is read from row[0..3] and r from row[4..7].  ldq() is defined
       outside this file; presumably it is a single 64-bit load of four
       packed coefficients -- confirm against its definition. */
    l = ldq(row);
    r = ldq(row + 4);

    /* An all-zero row transforms to all zeros: nothing to write. */
    if (l == 0 && r == 0)
        return 0;

    /* row[0] term plus the rounding bias for the final ROW_SHIFT.
       sextw() is assumed to yield the sign-extended low 16 bits of its
       argument, i.e. row[0] -- confirm against its definition. */
    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));

    /* True when every bit outside the low 16 bits of l is clear and r is
       zero, i.e. only row[0] is nonzero.
       NOTE(review): the mask is an unsigned long constant, so this test
       relies on unsigned long being 64 bits wide -- confirm for the ABI. */
    if (((l & ~0xffffUL) | r) == 0) {
        a0 >>= ROW_SHIFT;
        /* Replicate the 16-bit result into all four 16-bit lanes of t2
           and write that pattern over both halves of the row. */
        t2 = (uint16_t) a0;
        t2 |= t2 << 16;
        t2 |= t2 << 32;

        stq(t2, row);
        stq(t2, row + 4);
        return 1;
    }

    /* a0..a3 collect the even-indexed inputs; all start from the row[0]
       term (including the rounding bias). */
    a1 = a0;
    a2 = a0;
    a3 = a0;

    /* Each input below is first tested for zero so that its four
       multiplications can be skipped, then sign-extended with sextw(). */
    t = extwl(l, 4);            /* row[2] */
    if (t != 0) {
        t = sextw(t);
        a0 += W2 * t;
        a1 += W6 * t;
        a2 -= W6 * t;
        a3 -= W2 * t;
    }

    t = extwl(r, 0);            /* row[4] */
    if (t != 0) {
        t = sextw(t);
        a0 += W4 * t;
        a1 -= W4 * t;
        a2 -= W4 * t;
        a3 += W4 * t;
    }

    t = extwl(r, 4);            /* row[6] */
    if (t != 0) {
        t = sextw(t);
        a0 += W6 * t;
        a1 -= W2 * t;
        a2 += W2 * t;
        a3 -= W6 * t;
    }

    /* b0..b3 collect the odd-indexed inputs; row[1] initialises them
       (to zero when row[1] is zero). */
    t = extwl(l, 2);            /* row[1] */
    if (t != 0) {
        t = sextw(t);
        b0 = W1 * t;
        b1 = W3 * t;
        b2 = W5 * t;
        b3 = W7 * t;
    } else {
        b0 = 0;
        b1 = 0;
        b2 = 0;
        b3 = 0;
    }

    t = extwl(l, 6);            /* row[3] */
    if (t) {
        t = sextw(t);
        b0 += W3 * t;
        b1 -= W7 * t;
        b2 -= W1 * t;
        b3 -= W5 * t;
    }


    t = extwl(r, 2);            /* row[5] */
    if (t) {
        t = sextw(t);
        b0 += W5 * t;
        b1 -= W1 * t;
        b2 += W7 * t;
        b3 += W3 * t;
    }

    t = extwl(r, 6);            /* row[7] */
    if (t) {
        t = sextw(t);
        b0 += W7 * t;
        b1 -= W5 * t;
        b2 += W3 * t;
        b3 -= W1 * t;
    }

    /* Outputs k and 7 - k share one (a, b) pair: the sum goes to k, the
       difference to 7 - k.  The shift drops the ROW_SHIFT low bits. */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;

    return 2;
}
153
154static inline void idct_col(int16_t *col)
155{
156    int a0, a1, a2, a3, b0, b1, b2, b3;
157
158    col[0] += (1 << (COL_SHIFT - 1)) / W4;
159
160    a0 = W4 * col[8 * 0];
161    a1 = W4 * col[8 * 0];
162    a2 = W4 * col[8 * 0];
163    a3 = W4 * col[8 * 0];
164
165    if (col[8 * 2]) {
166        a0 += W2 * col[8 * 2];
167        a1 += W6 * col[8 * 2];
168        a2 -= W6 * col[8 * 2];
169        a3 -= W2 * col[8 * 2];
170    }
171
172    if (col[8 * 4]) {
173        a0 += W4 * col[8 * 4];
174        a1 -= W4 * col[8 * 4];
175        a2 -= W4 * col[8 * 4];
176        a3 += W4 * col[8 * 4];
177    }
178
179    if (col[8 * 6]) {
180        a0 += W6 * col[8 * 6];
181        a1 -= W2 * col[8 * 6];
182        a2 += W2 * col[8 * 6];
183        a3 -= W6 * col[8 * 6];
184    }
185
186    if (col[8 * 1]) {
187        b0 = W1 * col[8 * 1];
188        b1 = W3 * col[8 * 1];
189        b2 = W5 * col[8 * 1];
190        b3 = W7 * col[8 * 1];
191    } else {
192        b0 = 0;
193        b1 = 0;
194        b2 = 0;
195        b3 = 0;
196    }
197
198    if (col[8 * 3]) {
199        b0 += W3 * col[8 * 3];
200        b1 -= W7 * col[8 * 3];
201        b2 -= W1 * col[8 * 3];
202        b3 -= W5 * col[8 * 3];
203    }
204
205    if (col[8 * 5]) {
206        b0 += W5 * col[8 * 5];
207        b1 -= W1 * col[8 * 5];
208        b2 += W7 * col[8 * 5];
209        b3 += W3 * col[8 * 5];
210    }
211
212    if (col[8 * 7]) {
213        b0 += W7 * col[8 * 7];
214        b1 -= W5 * col[8 * 7];
215        b2 += W3 * col[8 * 7];
216        b3 -= W1 * col[8 * 7];
217    }
218
219    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
220    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
221    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
222    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
223    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
224    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
225    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
226    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
227}
228
229/* If all rows but the first one are zero after row transformation,
230   all rows will be identical after column transformation.  */
231static inline void idct_col2(int16_t *col)
232{
233    int i;
234    uint64_t l, r;
235
236    for (i = 0; i < 8; ++i) {
237        int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4;
238
239        a0 *= W4;
240        col[i] = a0 >> COL_SHIFT;
241    }
242
243    l = ldq(col + 0 * 4); r = ldq(col + 1 * 4);
244    stq(l, col +  2 * 4); stq(r, col +  3 * 4);
245    stq(l, col +  4 * 4); stq(r, col +  5 * 4);
246    stq(l, col +  6 * 4); stq(r, col +  7 * 4);
247    stq(l, col +  8 * 4); stq(r, col +  9 * 4);
248    stq(l, col + 10 * 4); stq(r, col + 11 * 4);
249    stq(l, col + 12 * 4); stq(r, col + 13 * 4);
250    stq(l, col + 14 * 4); stq(r, col + 15 * 4);
251}
252
/*
 * In-place IDCT of an 8x8 block of coefficients.
 *
 * All eight rows are transformed first.  The classification returned by
 * idct_row() for each row then selects one of three column passes:
 *   - only row 0 had nonzero input: idct_col2();
 *   - every row came out as a single repeated value: transform column 0
 *     alone and spread each result across its row;
 *   - otherwise: idct_col() on each of the eight columns.
 */
void ff_simple_idct_axp(int16_t *block)
{
    int only_top_row = 1;       /* no row below row 0 had nonzero input */
    int flat_rows    = 1;       /* no row needed the full row transform */
    int x, y;

    for (y = 0; y < 8; y++) {
        const int kind = idct_row(&block[8 * y]);

        if (kind == 2)
            flat_rows = 0;
        if (y >= 1 && kind >= 1)
            only_top_row = 0;
    }

    if (only_top_row) {
        idct_col2(block);
        return;
    }

    if (!flat_rows) {
        for (x = 0; x < 8; x++)
            idct_col(&block[x]);
        return;
    }

    /* Each row holds one repeated value, so all columns are equal:
       transform column 0 and copy each row's result into all eight
       entries of that row. */
    idct_col(block);
    for (y = 0; y < 8; y++) {
        int16_t *line = block + 8 * y;
        uint64_t fill = (uint16_t) line[0];

        fill |= fill << 16;
        fill |= fill << 32;
        stq(fill, line);
        stq(fill, line + 4);
    }
}
292
/*
 * Run ff_simple_idct_axp() on block in place, then pass the transformed
 * block together with dest and line_size to put_pixels_clamped_axp_p.
 *
 * NOTE(review): put_pixels_clamped_axp_p is declared outside this file;
 * going by its name it presumably stores the clamped results to dest as
 * pixels, with line_size as the distance between rows -- confirm.
 */
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    put_pixels_clamped_axp_p(block, dest, line_size);
}
298
/*
 * Run ff_simple_idct_axp() on block in place, then pass the transformed
 * block together with dest and line_size to add_pixels_clamped_axp_p.
 *
 * NOTE(review): add_pixels_clamped_axp_p is declared outside this file;
 * going by its name it presumably adds the results to the pixels already
 * in dest and clamps the sums, with line_size as the distance between
 * rows -- confirm.
 */
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    add_pixels_clamped_axp_p(block, dest, line_size);
}
304