1/* 2 * Simple IDCT (Alpha optimized) 3 * 4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> 5 * 6 * based upon some outcommented C code from mpeg2dec (idct_mmx.c 7 * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>) 8 * 9 * Alpha optimizations by M��ns Rullg��rd <mans@mansr.com> 10 * and Falk Hueffner <falk@debian.org> 11 * 12 * This file is part of FFmpeg. 13 * 14 * FFmpeg is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU Lesser General Public 16 * License as published by the Free Software Foundation; either 17 * version 2.1 of the License, or (at your option) any later version. 18 * 19 * FFmpeg is distributed in the hope that it will be useful, 20 * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 * Lesser General Public License for more details. 23 * 24 * You should have received a copy of the GNU Lesser General Public 25 * License along with FFmpeg; if not, write to the Free Software 26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 27 */ 28 29#include "libavcodec/dsputil.h" 30#include "asm.h" 31 32extern void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 33 int line_size); 34extern void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 35 int line_size); 36 37// cos(i * M_PI / 16) * sqrt(2) * (1 << 14) 38// W4 is actually exactly 16384, but using 16383 works around 39// accumulating rounding errors for some encoders 40#define W1 ((int_fast32_t) 22725) 41#define W2 ((int_fast32_t) 21407) 42#define W3 ((int_fast32_t) 19266) 43#define W4 ((int_fast32_t) 16383) 44#define W5 ((int_fast32_t) 12873) 45#define W6 ((int_fast32_t) 8867) 46#define W7 ((int_fast32_t) 4520) 47#define ROW_SHIFT 11 48#define COL_SHIFT 20 49 50/* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ 51static inline int idct_row(DCTELEM *row) 52{ 53 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t; 54 uint64_t l, r, t2; 55 l = ldq(row); 56 r = ldq(row + 4); 57 58 if (l == 0 && r == 0) 59 return 0; 60 61 a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); 62 63 if (((l & ~0xffffUL) | r) == 0) { 64 a0 >>= ROW_SHIFT; 65 t2 = (uint16_t) a0; 66 t2 |= t2 << 16; 67 t2 |= t2 << 32; 68 69 stq(t2, row); 70 stq(t2, row + 4); 71 return 1; 72 } 73 74 a1 = a0; 75 a2 = a0; 76 a3 = a0; 77 78 t = extwl(l, 4); /* row[2] */ 79 if (t != 0) { 80 t = sextw(t); 81 a0 += W2 * t; 82 a1 += W6 * t; 83 a2 -= W6 * t; 84 a3 -= W2 * t; 85 } 86 87 t = extwl(r, 0); /* row[4] */ 88 if (t != 0) { 89 t = sextw(t); 90 a0 += W4 * t; 91 a1 -= W4 * t; 92 a2 -= W4 * t; 93 a3 += W4 * t; 94 } 95 96 t = extwl(r, 4); /* row[6] */ 97 if (t != 0) { 98 t = sextw(t); 99 a0 += W6 * t; 100 a1 -= W2 * t; 101 a2 += W2 * t; 102 a3 -= W6 * t; 103 } 104 105 t = extwl(l, 2); /* row[1] */ 106 if (t != 0) { 107 t = sextw(t); 108 b0 = W1 * t; 109 b1 = W3 * t; 110 b2 = W5 * t; 111 b3 = W7 * t; 112 } else { 113 b0 = 0; 114 b1 = 0; 115 b2 = 0; 116 b3 = 0; 117 } 118 119 t = extwl(l, 6); /* row[3] */ 120 if (t) { 121 t = sextw(t); 122 b0 += W3 * t; 123 b1 -= W7 * t; 124 b2 -= W1 * t; 125 b3 -= W5 * t; 126 } 127 128 129 t = extwl(r, 2); /* row[5] */ 130 if (t) { 131 t = sextw(t); 132 b0 += W5 * t; 133 b1 -= W1 * t; 134 b2 += W7 * t; 135 b3 += W3 * t; 136 } 137 138 t = extwl(r, 6); /* row[7] */ 139 if (t) { 140 t = sextw(t); 141 b0 += W7 * t; 142 b1 -= W5 * t; 143 b2 += W3 * t; 144 b3 -= W1 * t; 145 } 146 147 row[0] = (a0 + b0) >> ROW_SHIFT; 148 row[1] = (a1 + b1) >> ROW_SHIFT; 149 row[2] = (a2 + b2) >> ROW_SHIFT; 150 row[3] = (a3 + b3) >> ROW_SHIFT; 151 row[4] = (a3 - b3) >> ROW_SHIFT; 152 row[5] = (a2 - b2) >> ROW_SHIFT; 153 row[6] = (a1 - b1) >> ROW_SHIFT; 154 row[7] = (a0 - b0) >> ROW_SHIFT; 155 156 return 2; 157} 158 159static inline void idct_col(DCTELEM *col) 160{ 161 int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3; 162 163 col[0] += (1 << (COL_SHIFT - 1)) / W4; 164 165 a0 = W4 * col[8 * 0]; 166 a1 = W4 * col[8 * 0]; 167 a2 = W4 * col[8 * 0]; 168 a3 = W4 * col[8 * 0]; 169 170 if (col[8 * 2]) { 171 a0 += W2 * col[8 * 2]; 172 a1 += W6 * col[8 * 2]; 173 a2 -= W6 * col[8 * 2]; 174 a3 -= W2 * col[8 * 2]; 175 } 176 177 if (col[8 * 4]) { 178 a0 += W4 * col[8 * 4]; 179 a1 -= W4 * col[8 * 4]; 180 a2 -= W4 * col[8 * 4]; 181 a3 += W4 * col[8 * 4]; 182 } 183 184 if (col[8 * 6]) { 185 a0 += W6 * col[8 * 6]; 186 a1 -= W2 * col[8 * 6]; 187 a2 += W2 * col[8 * 6]; 188 a3 -= W6 * col[8 * 6]; 189 } 190 191 if (col[8 * 1]) { 192 b0 = W1 * col[8 * 1]; 193 b1 = W3 * col[8 * 1]; 194 b2 = W5 * col[8 * 1]; 195 b3 = W7 * col[8 * 1]; 196 } else { 197 b0 = 0; 198 b1 = 0; 199 b2 = 0; 200 b3 = 0; 201 } 202 203 if (col[8 * 3]) { 204 b0 += W3 * col[8 * 3]; 205 b1 -= W7 * col[8 * 3]; 206 b2 -= W1 * col[8 * 3]; 207 b3 -= W5 * col[8 * 3]; 208 } 209 210 if (col[8 * 5]) { 211 b0 += W5 * col[8 * 5]; 212 b1 -= W1 * col[8 * 5]; 213 b2 += W7 * col[8 * 5]; 214 b3 += W3 * col[8 * 5]; 215 } 216 217 if (col[8 * 7]) { 218 b0 += W7 * col[8 * 7]; 219 b1 -= W5 * col[8 * 7]; 220 b2 += W3 * col[8 * 7]; 221 b3 -= W1 * col[8 * 7]; 222 } 223 224 col[8 * 0] = (a0 + b0) >> COL_SHIFT; 225 col[8 * 7] = (a0 - b0) >> COL_SHIFT; 226 col[8 * 1] = (a1 + b1) >> COL_SHIFT; 227 col[8 * 6] = (a1 - b1) >> COL_SHIFT; 228 col[8 * 2] = (a2 + b2) >> COL_SHIFT; 229 col[8 * 5] = (a2 - b2) >> COL_SHIFT; 230 col[8 * 3] = (a3 + b3) >> COL_SHIFT; 231 col[8 * 4] = (a3 - b3) >> COL_SHIFT; 232} 233 234/* If all rows but the first one are zero after row transformation, 235 all rows will be identical after column transformation. */ 236static inline void idct_col2(DCTELEM *col) 237{ 238 int i; 239 uint64_t l, r; 240 241 for (i = 0; i < 8; ++i) { 242 int_fast32_t a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; 243 244 a0 *= W4; 245 col[i] = a0 >> COL_SHIFT; 246 } 247 248 l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); 249 stq(l, col + 2 * 4); stq(r, col + 3 * 4); 250 stq(l, col + 4 * 4); stq(r, col + 5 * 4); 251 stq(l, col + 6 * 4); stq(r, col + 7 * 4); 252 stq(l, col + 8 * 4); stq(r, col + 9 * 4); 253 stq(l, col + 10 * 4); stq(r, col + 11 * 4); 254 stq(l, col + 12 * 4); stq(r, col + 13 * 4); 255 stq(l, col + 14 * 4); stq(r, col + 15 * 4); 256} 257 258void ff_simple_idct_axp(DCTELEM *block) 259{ 260 261 int i; 262 int rowsZero = 1; /* all rows except row 0 zero */ 263 int rowsConstant = 1; /* all rows consist of a constant value */ 264 265 for (i = 0; i < 8; i++) { 266 int sparseness = idct_row(block + 8 * i); 267 268 if (i > 0 && sparseness > 0) 269 rowsZero = 0; 270 if (sparseness == 2) 271 rowsConstant = 0; 272 } 273 274 if (rowsZero) { 275 idct_col2(block); 276 } else if (rowsConstant) { 277 idct_col(block); 278 for (i = 0; i < 8; i += 2) { 279 uint64_t v = (uint16_t) block[0]; 280 uint64_t w = (uint16_t) block[8]; 281 282 v |= v << 16; 283 w |= w << 16; 284 v |= v << 32; 285 w |= w << 32; 286 stq(v, block + 0 * 4); 287 stq(v, block + 1 * 4); 288 stq(w, block + 2 * 4); 289 stq(w, block + 3 * 4); 290 block += 4 * 4; 291 } 292 } else { 293 for (i = 0; i < 8; i++) 294 idct_col(block + i); 295 } 296} 297 298void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block) 299{ 300 ff_simple_idct_axp(block); 301 put_pixels_clamped_axp_p(block, dest, line_size); 302} 303 304void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block) 305{ 306 ff_simple_idct_axp(block); 307 add_pixels_clamped_axp_p(block, dest, line_size); 308} 309