1/*
2 * idct for sh4
3 *
4 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavcodec/dsputil.h"
24#include "dsputil_sh4.h"
25#include "sh4.h"
26
/*
 * Scaled cosine factors sqrt(2)*cos(k*pi/16) for k = 1..7.  They are only
 * needed to spell out the two tables below and are removed again afterwards.
 */
#define IDCT_C1 1.38703984532214752434  /* k = 1 */
#define IDCT_C2 1.30656296487637657577  /* k = 2 */
#define IDCT_C3 1.17587560241935884520  /* k = 3 */
#define IDCT_C4 1.00000000000000000000  /* k = 4 */
#define IDCT_C5 0.78569495838710234903  /* k = 5 */
#define IDCT_C6 0.54119610014619712324  /* k = 6 */
#define IDCT_C7 0.27589937928294311353  /* k = 7 */

/*
 * Matrix for the even-indexed inputs (0, 2, 4, 6) of an 8-point transform.
 * Entries 4*j .. 4*j+3 are the weights that input j contributes to outputs
 * 0..3, which is the order the generic ftrv_() helper reads them in.
 */
static const float even_table[16] __attribute__ ((aligned(8))) = {
         IDCT_C4,  IDCT_C4,  IDCT_C4,  IDCT_C4,
         IDCT_C2,  IDCT_C6, -IDCT_C6, -IDCT_C2,
         IDCT_C4, -IDCT_C4, -IDCT_C4,  IDCT_C4,
         IDCT_C6, -IDCT_C2,  IDCT_C2, -IDCT_C6
};

/*
 * Matrix for the odd-indexed inputs (1, 3, 5, 7); same storage order as
 * even_table.
 */
static const float odd_table[16] __attribute__ ((aligned(8))) = {
         IDCT_C1,  IDCT_C3,  IDCT_C5,  IDCT_C7,
         IDCT_C3, -IDCT_C7, -IDCT_C1, -IDCT_C5,
         IDCT_C5, -IDCT_C1,  IDCT_C7,  IDCT_C3,
         IDCT_C7, -IDCT_C5,  IDCT_C3, -IDCT_C1
};

#undef  IDCT_C1
#undef  IDCT_C2
#undef  IDCT_C3
#undef  IDCT_C4
#undef  IDCT_C5
#undef  IDCT_C6
#undef  IDCT_C7
56
57#if 1
58
/*
 * load_matrix(table): load the 16 floats at `table` into the SH4 register
 * bank xd0..xd14, which the ftrv instruction uses as its 4x4 matrix.
 *
 * The first fschg toggles the FPU transfer size so that each fmov moves a
 * pair of floats; the second fschg toggles it back.  Eight post-increment
 * moves therefore consume 16 consecutive floats.
 * NOTE(review): the tables in this file are declared aligned(8), presumably
 * because the paired loads require it -- keep any new table aligned too.
 *
 * The macro-local pointer has a deliberately unusual name: with a plain
 * name such as `t`, a caller writing load_matrix(t) would expand to a
 * self-initialised pointer.  The argument is parenthesised for the same
 * hygiene reason.
 */
#define         load_matrix(table) \
    do { \
        const float *load_matrix_src_ = (table); \
        __asm__ volatile( \
        "       fschg\n" \
        "       fmov   @%0+,xd0\n" \
        "       fmov   @%0+,xd2\n" \
        "       fmov   @%0+,xd4\n" \
        "       fmov   @%0+,xd6\n" \
        "       fmov   @%0+,xd8\n" \
        "       fmov   @%0+,xd10\n" \
        "       fmov   @%0+,xd12\n" \
        "       fmov   @%0+,xd14\n" \
        "       fschg\n" \
        : "+r"(load_matrix_src_) \
        ); \
    } while (0)
76
/*
 * ftrv(): multiply the vector held in fr0..fr3 by the matrix previously
 * loaded with load_matrix(), in place, using the SH4 ftrv instruction.
 * The variables fr0..fr3 must have been declared with DEFREG.
 *
 * The expansion intentionally has no trailing semicolon: callers write
 * `ftrv();`, which then forms exactly one statement and stays safe inside
 * an unbraced if/else.
 */
#define         ftrv() \
                __asm__ volatile("ftrv xmtrx,fv0" \
                : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3))
80
/*
 * DEFREG: declare the four working floats fr0..fr3 and pin each one to the
 * hardware register of the same name, so that ftrv() operates directly on
 * them.  Use as `DEFREG;` at the top of a function.
 */
#define         DEFREG                          \
        register float fr0 __asm__("fr0"),      \
                       fr1 __asm__("fr1"),      \
                       fr2 __asm__("fr2"),      \
                       fr3 __asm__("fr3")
86
87#else
88
/* Generic C implementation of the primitives, for checking the assembly version. */
90
/*
 * Reference version of the ftrv primitive: replace fv[0..3] with the product
 * of the 4x4 matrix xf and the vector fv.
 *
 * xf holds 16 floats; xf[4*j + r] is the weight of input j in output r.
 * All four inputs are read before any output is written, so the update is
 * safe in place.
 */
static void ftrv_(const float xf[],float fv[])
{
        const float in0 = fv[0];
        const float in1 = fv[1];
        const float in2 = fv[2];
        const float in3 = fv[3];
        int r;

        for (r = 0; r < 4; r++)
                fv[r] = xf[r]*in0 + xf[r + 4]*in1 + xf[r + 8]*in2 + xf[r + 12]*in3;
}
103
/*
 * Reference version of load_matrix: copy the 16 floats of table into xf.
 */
static void load_matrix_(float xf[],const float table[])
{
        const float *src = table;
        const float *const stop = table + 16;

        while (src != stop)
                *xf++ = *src++;
}
109
/*
 * Generic-C bindings for the names idct_sh4() is written against.
 *
 * DEFREG declares the working storage: fv[] is the 4-element vector and
 * xf[] the 16-entry matrix.  fr0..fr3 alias the vector elements, and
 * load_matrix()/ftrv() forward to the C helpers using that storage.
 */
#define         DEFREG \
        float fv[4],xf[16]

#define         fr0     fv[0]
#define         fr1     fv[1]
#define         fr2     fv[2]
#define         fr3     fv[3]

#define         load_matrix(table)      load_matrix_(xf,table)
#define         ftrv()                  ftrv_(xf,fv)
120
121#endif
122
/*
 * DESCALE(x, n): scale x down by 2^n.
 *
 * The active variant multiplies by the float reciprocal of (1 << n) and
 * yields a float; it does no rounding itself.  The disabled variant converts
 * x to int, adds half of the divisor and shifts right by n.
 *
 * Both expansions are fully parenthesised so the macro behaves as a single
 * operand inside larger expressions (e.g. a / DESCALE(x, n)), and so that an
 * expression passed as n cannot re-associate with the "- 1".
 */
#if 1
#define         DESCALE(x,n)    ((x)*(1.0f/(1<<(n))))
#else
#define         DESCALE(x,n)    (((int)(x)+(1<<((n)-1)))>>(n))
#endif
128
/* This code works worse when built with gcc from CVS; gcc 3.2.3 works fine. */
130
131
132#if 1
133//optimized
134
135void idct_sh4(DCTELEM *block)
136{
137        DEFREG;
138
139        int i;
140        float        tblock[8*8],*fblock;
141        int ofs1,ofs2,ofs3;
142        int fpscr;
143
144        fp_single_enter(fpscr);
145
146        /* row */
147
148        /* even part */
149        load_matrix(even_table);
150
151        fblock = tblock+4;
152        i = 8;
153        do {
154                fr0 = block[0];
155                fr1 = block[2];
156                fr2 = block[4];
157                fr3 = block[6];
158                block+=8;
159                ftrv();
160                *--fblock = fr3;
161                *--fblock = fr2;
162                *--fblock = fr1;
163                *--fblock = fr0;
164                fblock+=8+4;
165        } while(--i);
166        block-=8*8;
167        fblock-=8*8+4;
168
169        load_matrix(odd_table);
170
171        i = 8;
172
173        do {
174                float t0,t1,t2,t3;
175                fr0 = block[1];
176                fr1 = block[3];
177                fr2 = block[5];
178                fr3 = block[7];
179                block+=8;
180                ftrv();
181                t0 = *fblock++;
182                t1 = *fblock++;
183                t2 = *fblock++;
184                t3 = *fblock++;
185                fblock+=4;
186                *--fblock = t0 - fr0;
187                *--fblock = t1 - fr1;
188                *--fblock = t2 - fr2;
189                *--fblock = t3 - fr3;
190                *--fblock = t3 + fr3;
191                *--fblock = t2 + fr2;
192                *--fblock = t1 + fr1;
193                *--fblock = t0 + fr0;
194                fblock+=8;
195        } while(--i);
196        block-=8*8;
197        fblock-=8*8;
198
199        /* col */
200
201        /* even part */
202        load_matrix(even_table);
203
204        ofs1 = sizeof(float)*2*8;
205        ofs2 = sizeof(float)*4*8;
206        ofs3 = sizeof(float)*6*8;
207
208        i = 8;
209
210#define        OA(fblock,ofs)   *(float*)((char*)fblock + ofs)
211
212        do {
213                fr0 = OA(fblock,   0);
214                fr1 = OA(fblock,ofs1);
215                fr2 = OA(fblock,ofs2);
216                fr3 = OA(fblock,ofs3);
217                ftrv();
218                OA(fblock,0   ) = fr0;
219                OA(fblock,ofs1) = fr1;
220                OA(fblock,ofs2) = fr2;
221                OA(fblock,ofs3) = fr3;
222                fblock++;
223        } while(--i);
224        fblock-=8;
225
226        load_matrix(odd_table);
227
228        i=8;
229        do {
230                float t0,t1,t2,t3;
231                t0 = OA(fblock,   0); /* [8*0] */
232                t1 = OA(fblock,ofs1); /* [8*2] */
233                t2 = OA(fblock,ofs2); /* [8*4] */
234                t3 = OA(fblock,ofs3); /* [8*6] */
235                fblock+=8;
236                fr0 = OA(fblock,   0); /* [8*1] */
237                fr1 = OA(fblock,ofs1); /* [8*3] */
238                fr2 = OA(fblock,ofs2); /* [8*5] */
239                fr3 = OA(fblock,ofs3); /* [8*7] */
240                fblock+=-8+1;
241                ftrv();
242                block[8*0] = DESCALE(t0 + fr0,3);
243                block[8*7] = DESCALE(t0 - fr0,3);
244                block[8*1] = DESCALE(t1 + fr1,3);
245                block[8*6] = DESCALE(t1 - fr1,3);
246                block[8*2] = DESCALE(t2 + fr2,3);
247                block[8*5] = DESCALE(t2 - fr2,3);
248                block[8*3] = DESCALE(t3 + fr3,3);
249                block[8*4] = DESCALE(t3 - fr3,3);
250                block++;
251        } while(--i);
252
253        fp_single_leave(fpscr);
254}
255#else
256void idct_sh4(DCTELEM *block)
257{
258        DEFREG;
259
260        int i;
261        float   tblock[8*8],*fblock;
262
263        /* row */
264
265        /* even part */
266        load_matrix(even_table);
267
268        fblock = tblock;
269        i = 8;
270        do {
271                fr0 = block[0];
272                fr1 = block[2];
273                fr2 = block[4];
274                fr3 = block[6];
275                block+=8;
276                ftrv();
277                fblock[0] = fr0;
278                fblock[2] = fr1;
279                fblock[4] = fr2;
280                fblock[6] = fr3;
281                fblock+=8;
282        } while(--i);
283        block-=8*8;
284        fblock-=8*8;
285
286        load_matrix(odd_table);
287
288        i = 8;
289
290        do {
291                float t0,t1,t2,t3;
292                fr0 = block[1];
293                fr1 = block[3];
294                fr2 = block[5];
295                fr3 = block[7];
296                block+=8;
297                ftrv();
298                t0 = fblock[0];
299                t1 = fblock[2];
300                t2 = fblock[4];
301                t3 = fblock[6];
302                fblock[0] = t0 + fr0;
303                fblock[7] = t0 - fr0;
304                fblock[1] = t1 + fr1;
305                fblock[6] = t1 - fr1;
306                fblock[2] = t2 + fr2;
307                fblock[5] = t2 - fr2;
308                fblock[3] = t3 + fr3;
309                fblock[4] = t3 - fr3;
310                fblock+=8;
311        } while(--i);
312        block-=8*8;
313        fblock-=8*8;
314
315        /* col */
316
317        /* even part */
318        load_matrix(even_table);
319
320        i = 8;
321
322        do {
323                fr0 = fblock[8*0];
324                fr1 = fblock[8*2];
325                fr2 = fblock[8*4];
326                fr3 = fblock[8*6];
327                ftrv();
328                fblock[8*0] = fr0;
329                fblock[8*2] = fr1;
330                fblock[8*4] = fr2;
331                fblock[8*6] = fr3;
332                fblock++;
333        } while(--i);
334        fblock-=8;
335
336        load_matrix(odd_table);
337
338        i=8;
339        do {
340                float t0,t1,t2,t3;
341                fr0 = fblock[8*1];
342                fr1 = fblock[8*3];
343                fr2 = fblock[8*5];
344                fr3 = fblock[8*7];
345                ftrv();
346                t0 = fblock[8*0];
347                t1 = fblock[8*2];
348                t2 = fblock[8*4];
349                t3 = fblock[8*6];
350                fblock++;
351                block[8*0] = DESCALE(t0 + fr0,3);
352                block[8*7] = DESCALE(t0 - fr0,3);
353                block[8*1] = DESCALE(t1 + fr1,3);
354                block[8*6] = DESCALE(t1 - fr1,3);
355                block[8*2] = DESCALE(t2 + fr2,3);
356                block[8*5] = DESCALE(t2 - fr2,3);
357                block[8*3] = DESCALE(t3 + fr3,3);
358                block[8*4] = DESCALE(t3 - fr3,3);
359                block++;
360        } while(--i);
361}
362#endif
363