1/* 2 * idct for sh4 3 * 4 * Copyright (c) 2001-2003 BERO <bero@geocities.co.jp> 5 * 6 * This file is part of FFmpeg. 7 * 8 * FFmpeg is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Lesser General Public 10 * License as published by the Free Software Foundation; either 11 * version 2.1 of the License, or (at your option) any later version. 12 * 13 * FFmpeg is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public 19 * License along with FFmpeg; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21 */ 22 23#include "libavcodec/dsputil.h" 24#include "dsputil_sh4.h" 25#include "sh4.h" 26 27#define c1 1.38703984532214752434 /* sqrt(2)*cos(1*pi/16) */ 28#define c2 1.30656296487637657577 /* sqrt(2)*cos(2*pi/16) */ 29#define c3 1.17587560241935884520 /* sqrt(2)*cos(3*pi/16) */ 30#define c4 1.00000000000000000000 /* sqrt(2)*cos(4*pi/16) */ 31#define c5 0.78569495838710234903 /* sqrt(2)*cos(5*pi/16) */ 32#define c6 0.54119610014619712324 /* sqrt(2)*cos(6*pi/16) */ 33#define c7 0.27589937928294311353 /* sqrt(2)*cos(7*pi/16) */ 34 35static const float even_table[] __attribute__ ((aligned(8))) = { 36 c4, c4, c4, c4, 37 c2, c6,-c6,-c2, 38 c4,-c4,-c4, c4, 39 c6,-c2, c2,-c6 40}; 41 42static const float odd_table[] __attribute__ ((aligned(8))) = { 43 c1, c3, c5, c7, 44 c3,-c7,-c1,-c5, 45 c5,-c1, c7, c3, 46 c7,-c5, c3,-c1 47}; 48 49#undef c1 50#undef c2 51#undef c3 52#undef c4 53#undef c5 54#undef c6 55#undef c7 56 57#if 1 58 59#define load_matrix(table) \ 60 do { \ 61 const float *t = table; \ 62 __asm__ volatile( \ 63 " fschg\n" \ 64 " fmov @%0+,xd0\n" \ 65 " fmov @%0+,xd2\n" \ 66 " fmov @%0+,xd4\n" \ 67 " fmov @%0+,xd6\n" \ 68 " fmov @%0+,xd8\n" \ 69 " fmov @%0+,xd10\n" \ 70 " fmov @%0+,xd12\n" \ 71 " fmov @%0+,xd14\n" \ 72 " fschg\n" \ 73 : "+r"(t) \ 74 ); \ 75 } while (0) 76 77#define ftrv() \ 78 __asm__ volatile("ftrv xmtrx,fv0" \ 79 : "+f"(fr0),"+f"(fr1),"+f"(fr2),"+f"(fr3)); 80 81#define DEFREG \ 82 register float fr0 __asm__("fr0"); \ 83 register float fr1 __asm__("fr1"); \ 84 register float fr2 __asm__("fr2"); \ 85 register float fr3 __asm__("fr3") 86 87#else 88 89/* generic C code for check */ 90 91static void ftrv_(const float xf[],float fv[]) 92{ 93 float f0,f1,f2,f3; 94 f0 = fv[0]; 95 f1 = fv[1]; 96 f2 = fv[2]; 97 f3 = fv[3]; 98 fv[0] = xf[0]*f0 + xf[4]*f1 + xf[ 8]*f2 + xf[12]*f3; 99 fv[1] = xf[1]*f0 + xf[5]*f1 + xf[ 9]*f2 + xf[13]*f3; 100 fv[2] = xf[2]*f0 + xf[6]*f1 + xf[10]*f2 + xf[14]*f3; 101 fv[3] = xf[3]*f0 + xf[7]*f1 + xf[11]*f2 + xf[15]*f3; 102} 103 104static void load_matrix_(float xf[],const float table[]) 105{ 106 int i; 107 for(i=0;i<16;i++) xf[i]=table[i]; 108} 109 110#define ftrv() ftrv_(xf,fv) 111#define load_matrix(table) load_matrix_(xf,table) 112 113#define DEFREG \ 114 float fv[4],xf[16] 115 116#define fr0 fv[0] 117#define fr1 fv[1] 118#define fr2 fv[2] 119#define fr3 fv[3] 120 121#endif 122 123#if 1 124#define DESCALE(x,n) (x)*(1.0f/(1<<(n))) 125#else 126#define DESCALE(x,n) (((int)(x)+(1<<(n-1)))>>(n)) 127#endif 128 129/* this code work worse on gcc cvs. 3.2.3 work fine */ 130 131 132#if 1 133//optimized 134 135void idct_sh4(DCTELEM *block) 136{ 137 DEFREG; 138 139 int i; 140 float tblock[8*8],*fblock; 141 int ofs1,ofs2,ofs3; 142 int fpscr; 143 144 fp_single_enter(fpscr); 145 146 /* row */ 147 148 /* even part */ 149 load_matrix(even_table); 150 151 fblock = tblock+4; 152 i = 8; 153 do { 154 fr0 = block[0]; 155 fr1 = block[2]; 156 fr2 = block[4]; 157 fr3 = block[6]; 158 block+=8; 159 ftrv(); 160 *--fblock = fr3; 161 *--fblock = fr2; 162 *--fblock = fr1; 163 *--fblock = fr0; 164 fblock+=8+4; 165 } while(--i); 166 block-=8*8; 167 fblock-=8*8+4; 168 169 load_matrix(odd_table); 170 171 i = 8; 172 173 do { 174 float t0,t1,t2,t3; 175 fr0 = block[1]; 176 fr1 = block[3]; 177 fr2 = block[5]; 178 fr3 = block[7]; 179 block+=8; 180 ftrv(); 181 t0 = *fblock++; 182 t1 = *fblock++; 183 t2 = *fblock++; 184 t3 = *fblock++; 185 fblock+=4; 186 *--fblock = t0 - fr0; 187 *--fblock = t1 - fr1; 188 *--fblock = t2 - fr2; 189 *--fblock = t3 - fr3; 190 *--fblock = t3 + fr3; 191 *--fblock = t2 + fr2; 192 *--fblock = t1 + fr1; 193 *--fblock = t0 + fr0; 194 fblock+=8; 195 } while(--i); 196 block-=8*8; 197 fblock-=8*8; 198 199 /* col */ 200 201 /* even part */ 202 load_matrix(even_table); 203 204 ofs1 = sizeof(float)*2*8; 205 ofs2 = sizeof(float)*4*8; 206 ofs3 = sizeof(float)*6*8; 207 208 i = 8; 209 210#define OA(fblock,ofs) *(float*)((char*)fblock + ofs) 211 212 do { 213 fr0 = OA(fblock, 0); 214 fr1 = OA(fblock,ofs1); 215 fr2 = OA(fblock,ofs2); 216 fr3 = OA(fblock,ofs3); 217 ftrv(); 218 OA(fblock,0 ) = fr0; 219 OA(fblock,ofs1) = fr1; 220 OA(fblock,ofs2) = fr2; 221 OA(fblock,ofs3) = fr3; 222 fblock++; 223 } while(--i); 224 fblock-=8; 225 226 load_matrix(odd_table); 227 228 i=8; 229 do { 230 float t0,t1,t2,t3; 231 t0 = OA(fblock, 0); /* [8*0] */ 232 t1 = OA(fblock,ofs1); /* [8*2] */ 233 t2 = OA(fblock,ofs2); /* [8*4] */ 234 t3 = OA(fblock,ofs3); /* [8*6] */ 235 fblock+=8; 236 fr0 = OA(fblock, 0); /* [8*1] */ 237 fr1 = OA(fblock,ofs1); /* [8*3] */ 238 fr2 = OA(fblock,ofs2); /* [8*5] */ 239 fr3 = OA(fblock,ofs3); /* [8*7] */ 240 fblock+=-8+1; 241 ftrv(); 242 block[8*0] = DESCALE(t0 + fr0,3); 243 block[8*7] = DESCALE(t0 - fr0,3); 244 block[8*1] = DESCALE(t1 + fr1,3); 245 block[8*6] = DESCALE(t1 - fr1,3); 246 block[8*2] = DESCALE(t2 + fr2,3); 247 block[8*5] = DESCALE(t2 - fr2,3); 248 block[8*3] = DESCALE(t3 + fr3,3); 249 block[8*4] = DESCALE(t3 - fr3,3); 250 block++; 251 } while(--i); 252 253 fp_single_leave(fpscr); 254} 255#else 256void idct_sh4(DCTELEM *block) 257{ 258 DEFREG; 259 260 int i; 261 float tblock[8*8],*fblock; 262 263 /* row */ 264 265 /* even part */ 266 load_matrix(even_table); 267 268 fblock = tblock; 269 i = 8; 270 do { 271 fr0 = block[0]; 272 fr1 = block[2]; 273 fr2 = block[4]; 274 fr3 = block[6]; 275 block+=8; 276 ftrv(); 277 fblock[0] = fr0; 278 fblock[2] = fr1; 279 fblock[4] = fr2; 280 fblock[6] = fr3; 281 fblock+=8; 282 } while(--i); 283 block-=8*8; 284 fblock-=8*8; 285 286 load_matrix(odd_table); 287 288 i = 8; 289 290 do { 291 float t0,t1,t2,t3; 292 fr0 = block[1]; 293 fr1 = block[3]; 294 fr2 = block[5]; 295 fr3 = block[7]; 296 block+=8; 297 ftrv(); 298 t0 = fblock[0]; 299 t1 = fblock[2]; 300 t2 = fblock[4]; 301 t3 = fblock[6]; 302 fblock[0] = t0 + fr0; 303 fblock[7] = t0 - fr0; 304 fblock[1] = t1 + fr1; 305 fblock[6] = t1 - fr1; 306 fblock[2] = t2 + fr2; 307 fblock[5] = t2 - fr2; 308 fblock[3] = t3 + fr3; 309 fblock[4] = t3 - fr3; 310 fblock+=8; 311 } while(--i); 312 block-=8*8; 313 fblock-=8*8; 314 315 /* col */ 316 317 /* even part */ 318 load_matrix(even_table); 319 320 i = 8; 321 322 do { 323 fr0 = fblock[8*0]; 324 fr1 = fblock[8*2]; 325 fr2 = fblock[8*4]; 326 fr3 = fblock[8*6]; 327 ftrv(); 328 fblock[8*0] = fr0; 329 fblock[8*2] = fr1; 330 fblock[8*4] = fr2; 331 fblock[8*6] = fr3; 332 fblock++; 333 } while(--i); 334 fblock-=8; 335 336 load_matrix(odd_table); 337 338 i=8; 339 do { 340 float t0,t1,t2,t3; 341 fr0 = fblock[8*1]; 342 fr1 = fblock[8*3]; 343 fr2 = fblock[8*5]; 344 fr3 = fblock[8*7]; 345 ftrv(); 346 t0 = fblock[8*0]; 347 t1 = fblock[8*2]; 348 t2 = fblock[8*4]; 349 t3 = fblock[8*6]; 350 fblock++; 351 block[8*0] = DESCALE(t0 + fr0,3); 352 block[8*7] = DESCALE(t0 - fr0,3); 353 block[8*1] = DESCALE(t1 + fr1,3); 354 block[8*6] = DESCALE(t1 - fr1,3); 355 block[8*2] = DESCALE(t2 + fr2,3); 356 block[8*5] = DESCALE(t2 - fr2,3); 357 block[8*3] = DESCALE(t3 + fr3,3); 358 block[8*4] = DESCALE(t3 - fr3,3); 359 block++; 360 } while(--i); 361} 362#endif 363