1/* 2 * Simple IDCT (Alpha optimized) 3 * 4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * based upon some outcommented C code from mpeg2dec (idct_mmx.c 7 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) 8 * 9 * Alpha optimizations by M��ns Rullg��rd <mans@mansr.com> 10 * and Falk Hueffner <falk@debian.org> 11 * 12 * This file is part of FFmpeg. 13 * 14 * FFmpeg is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU Lesser General Public 16 * License as published by the Free Software Foundation; either 17 * version 2.1 of the License, or (at your option) any later version. 18 * 19 * FFmpeg is distributed in the hope that it will be useful, 20 * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 * Lesser General Public License for more details. 23 * 24 * You should have received a copy of the GNU Lesser General Public 25 * License along with FFmpeg; if not, write to the Free Software 26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 27 */ 28 29#include "libavcodec/dsputil.h" 30#include "dsputil_alpha.h" 31#include "asm.h" 32 33// cos(i * M_PI / 16) * sqrt(2) * (1 << 14) 34// W4 is actually exactly 16384, but using 16383 works around 35// accumulating rounding errors for some encoders 36#define W1 ((int_fast32_t) 22725) 37#define W2 ((int_fast32_t) 21407) 38#define W3 ((int_fast32_t) 19266) 39#define W4 ((int_fast32_t) 16383) 40#define W5 ((int_fast32_t) 12873) 41#define W6 ((int_fast32_t) 8867) 42#define W7 ((int_fast32_t) 4520) 43#define ROW_SHIFT 11 44#define COL_SHIFT 20 45 46/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ 47static inline int idct_row(DCTELEM *row) 48{ 49 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t; 50 uint64_t l, r, t2; 51 l = ldq(row); 52 r = ldq(row + 4); 53 54 if (l == 0 && r == 0) 55 return 0; 56 57 a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); 58 59 if (((l & ~0xffffUL) | r) == 0) { 60 a0 >>= ROW_SHIFT; 61 t2 = (uint16_t) a0; 62 t2 |= t2 << 16; 63 t2 |= t2 << 32; 64 65 stq(t2, row); 66 stq(t2, row + 4); 67 return 1; 68 } 69 70 a1 = a0; 71 a2 = a0; 72 a3 = a0; 73 74 t = extwl(l, 4); /* row[2] */ 75 if (t != 0) { 76 t = sextw(t); 77 a0 += W2 * t; 78 a1 += W6 * t; 79 a2 -= W6 * t; 80 a3 -= W2 * t; 81 } 82 83 t = extwl(r, 0); /* row[4] */ 84 if (t != 0) { 85 t = sextw(t); 86 a0 += W4 * t; 87 a1 -= W4 * t; 88 a2 -= W4 * t; 89 a3 += W4 * t; 90 } 91 92 t = extwl(r, 4); /* row[6] */ 93 if (t != 0) { 94 t = sextw(t); 95 a0 += W6 * t; 96 a1 -= W2 * t; 97 a2 += W2 * t; 98 a3 -= W6 * t; 99 } 100 101 t = extwl(l, 2); /* row[1] */ 102 if (t != 0) { 103 t = sextw(t); 104 b0 = W1 * t; 105 b1 = W3 * t; 106 b2 = W5 * t; 107 b3 = W7 * t; 108 } else { 109 b0 = 0; 110 b1 = 0; 111 b2 = 0; 112 b3 = 0; 113 } 114 115 t = extwl(l, 6); /* row[3] */ 116 if (t) { 117 t = sextw(t); 118 b0 += W3 * t; 119 b1 -= W7 * t; 120 b2 -= W1 * t; 121 b3 -= W5 * t; 122 } 123 124 125 t = extwl(r, 2); /* row[5] */ 126 if (t) { 127 t = sextw(t); 128 b0 += W5 * t; 129 b1 -= W1 * t; 130 b2 += W7 * t; 131 b3 += W3 * t; 132 } 133 134 t = extwl(r, 6); /* row[7] */ 135 if (t) { 136 t = sextw(t); 137 b0 += W7 * t; 138 b1 -= W5 * t; 139 b2 += W3 * t; 140 b3 -= W1 * t; 141 } 142 143 row[0] = (a0 + b0) >> ROW_SHIFT; 144 row[1] = (a1 + b1) >> ROW_SHIFT; 145 row[2] = (a2 + b2) >> ROW_SHIFT; 146 row[3] = (a3 + b3) >> ROW_SHIFT; 147 row[4] = (a3 - b3) >> ROW_SHIFT; 148 row[5] = (a2 - b2) >> ROW_SHIFT; 149 row[6] = (a1 - b1) >> ROW_SHIFT; 150 row[7] = (a0 - b0) >> ROW_SHIFT; 151 152 return 2; 153} 154 155static inline void idct_col(DCTELEM *col) 156{ 157 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; 158 159 col[0] += (1 << (COL_SHIFT - 1)) / W4; 160 161 a0 = W4 * col[8 * 0]; 162 a1 = W4 * col[8 * 0]; 163 a2 = W4 * col[8 * 0]; 164 a3 = W4 * col[8 * 0]; 165 166 if (col[8 * 2]) { 167 a0 += W2 * col[8 * 2]; 168 a1 += W6 * col[8 * 2]; 169 a2 -= W6 * col[8 * 2]; 170 a3 -= W2 * col[8 * 2]; 171 } 172 173 if (col[8 * 4]) { 174 a0 += W4 * col[8 * 4]; 175 a1 -= W4 * col[8 * 4]; 176 a2 -= W4 * col[8 * 4]; 177 a3 += W4 * col[8 * 4]; 178 } 179 180 if (col[8 * 6]) { 181 a0 += W6 * col[8 * 6]; 182 a1 -= W2 * col[8 * 6]; 183 a2 += W2 * col[8 * 6]; 184 a3 -= W6 * col[8 * 6]; 185 } 186 187 if (col[8 * 1]) { 188 b0 = W1 * col[8 * 1]; 189 b1 = W3 * col[8 * 1]; 190 b2 = W5 * col[8 * 1]; 191 b3 = W7 * col[8 * 1]; 192 } else { 193 b0 = 0; 194 b1 = 0; 195 b2 = 0; 196 b3 = 0; 197 } 198 199 if (col[8 * 3]) { 200 b0 += W3 * col[8 * 3]; 201 b1 -= W7 * col[8 * 3]; 202 b2 -= W1 * col[8 * 3]; 203 b3 -= W5 * col[8 * 3]; 204 } 205 206 if (col[8 * 5]) { 207 b0 += W5 * col[8 * 5]; 208 b1 -= W1 * col[8 * 5]; 209 b2 += W7 * col[8 * 5]; 210 b3 += W3 * col[8 * 5]; 211 } 212 213 if (col[8 * 7]) { 214 b0 += W7 * col[8 * 7]; 215 b1 -= W5 * col[8 * 7]; 216 b2 += W3 * col[8 * 7]; 217 b3 -= W1 * col[8 * 7]; 218 } 219 220 col[8 * 0] = (a0 + b0) >> COL_SHIFT; 221 col[8 * 7] = (a0 - b0) >> COL_SHIFT; 222 col[8 * 1] = (a1 + b1) >> COL_SHIFT; 223 col[8 * 6] = (a1 - b1) >> COL_SHIFT; 224 col[8 * 2] = (a2 + b2) >> COL_SHIFT; 225 col[8 * 5] = (a2 - b2) >> COL_SHIFT; 226 col[8 * 3] = (a3 + b3) >> COL_SHIFT; 227 col[8 * 4] = (a3 - b3) >> COL_SHIFT; 228} 229 230/* If all rows but the first one are zero after row transformation, 231 all rows will be identical after column transformation. */ 232static inline void idct_col2(DCTELEM *col) 233{ 234 int i; 235 uint64_t l, r; 236 237 for (i = 0; i < 8; ++i) { 238 int_fast32_t a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; 239 240 a0 *= W4; 241 col[i] = a0 >> COL_SHIFT; 242 } 243 244 l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); 245 stq(l, col + 2 * 4); stq(r, col + 3 * 4); 246 stq(l, col + 4 * 4); stq(r, col + 5 * 4); 247 stq(l, col + 6 * 4); stq(r, col + 7 * 4); 248 stq(l, col + 8 * 4); stq(r, col + 9 * 4); 249 stq(l, col + 10 * 4); stq(r, col + 11 * 4); 250 stq(l, col + 12 * 4); stq(r, col + 13 * 4); 251 stq(l, col + 14 * 4); stq(r, col + 15 * 4); 252} 253 254void ff_simple_idct_axp(DCTELEM *block) 255{ 256 257 int i; 258 int rowsZero = 1; /* all rows except row 0 zero */ 259 int rowsConstant = 1; /* all rows consist of a constant value */ 260 261 for (i = 0; i < 8; i++) { 262 int sparseness = idct_row(block + 8 * i); 263 264 if (i > 0 && sparseness > 0) 265 rowsZero = 0; 266 if (sparseness == 2) 267 rowsConstant = 0; 268 } 269 270 if (rowsZero) { 271 idct_col2(block); 272 } else if (rowsConstant) { 273 idct_col(block); 274 for (i = 0; i < 8; i += 2) { 275 uint64_t v = (uint16_t) block[0]; 276 uint64_t w = (uint16_t) block[8]; 277 278 v |= v << 16; 279 w |= w << 16; 280 v |= v << 32; 281 w |= w << 32; 282 stq(v, block + 0 * 4); 283 stq(v, block + 1 * 4); 284 stq(w, block + 2 * 4); 285 stq(w, block + 3 * 4); 286 block += 4 * 4; 287 } 288 } else { 289 for (i = 0; i < 8; i++) 290 idct_col(block + i); 291 } 292} 293 294void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block) 295{ 296 ff_simple_idct_axp(block); 297 put_pixels_clamped_axp_p(block, dest, line_size); 298} 299 300void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block) 301{ 302 ff_simple_idct_axp(block); 303 add_pixels_clamped_axp_p(block, dest, line_size); 304} 305