1/* 2 * Alpha optimized DSP utils 3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavcodec/dsputil.h" 23#include "dsputil_alpha.h" 24#include "asm.h" 25 26void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 27 int line_size); 28void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 29 int line_size); 30 31#if 0 32/* These functions were the base for the optimized assembler routines, 33 and remain here for documentation purposes. */ 34static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 35 int line_size) 36{ 37 int i = 8; 38 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 39 40 do { 41 uint64_t shorts0, shorts1; 42 43 shorts0 = ldq(block); 44 shorts0 = maxsw4(shorts0, 0); 45 shorts0 = minsw4(shorts0, clampmask); 46 stl(pkwb(shorts0), pixels); 47 48 shorts1 = ldq(block + 4); 49 shorts1 = maxsw4(shorts1, 0); 50 shorts1 = minsw4(shorts1, clampmask); 51 stl(pkwb(shorts1), pixels + 4); 52 53 pixels += line_size; 54 block += 8; 55 } while (--i); 56} 57 58void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 59 int line_size) 60{ 61 int h = 8; 62 /* Keep this function a leaf function by generating the constants 63 manually (mainly for the hack value ;-). */ 64 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 65 uint64_t signmask = zap(-1, 0x33); 66 signmask ^= signmask >> 1; /* 0x8000800080008000 */ 67 68 do { 69 uint64_t shorts0, pix0, signs0; 70 uint64_t shorts1, pix1, signs1; 71 72 shorts0 = ldq(block); 73 shorts1 = ldq(block + 4); 74 75 pix0 = unpkbw(ldl(pixels)); 76 /* Signed subword add (MMX paddw). */ 77 signs0 = shorts0 & signmask; 78 shorts0 &= ~signmask; 79 shorts0 += pix0; 80 shorts0 ^= signs0; 81 /* Clamp. */ 82 shorts0 = maxsw4(shorts0, 0); 83 shorts0 = minsw4(shorts0, clampmask); 84 85 /* Next 4. */ 86 pix1 = unpkbw(ldl(pixels + 4)); 87 signs1 = shorts1 & signmask; 88 shorts1 &= ~signmask; 89 shorts1 += pix1; 90 shorts1 ^= signs1; 91 shorts1 = maxsw4(shorts1, 0); 92 shorts1 = minsw4(shorts1, clampmask); 93 94 stl(pkwb(shorts0), pixels); 95 stl(pkwb(shorts1), pixels + 4); 96 97 pixels += line_size; 98 block += 8; 99 } while (--h); 100} 101#endif 102 103static void clear_blocks_axp(DCTELEM *blocks) { 104 uint64_t *p = (uint64_t *) blocks; 105 int n = sizeof(DCTELEM) * 6 * 64; 106 107 do { 108 p[0] = 0; 109 p[1] = 0; 110 p[2] = 0; 111 p[3] = 0; 112 p[4] = 0; 113 p[5] = 0; 114 p[6] = 0; 115 p[7] = 0; 116 p += 8; 117 n -= 8 * 8; 118 } while (n); 119} 120 121static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) 122{ 123 return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 124} 125 126static inline uint64_t avg2(uint64_t a, uint64_t b) 127{ 128 return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 129} 130 131#if 0 132/* The XY2 routines basically utilize this scheme, but reuse parts in 133 each iteration. */ 134static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 135{ 136 uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 137 + ((l2 & ~BYTE_VEC(0x03)) >> 2) 138 + ((l3 & ~BYTE_VEC(0x03)) >> 2) 139 + ((l4 & ~BYTE_VEC(0x03)) >> 2); 140 uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 141 + (l2 & BYTE_VEC(0x03)) 142 + (l3 & BYTE_VEC(0x03)) 143 + (l4 & BYTE_VEC(0x03)) 144 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 145 return r1 + r2; 146} 147#endif 148 149#define OP(LOAD, STORE) \ 150 do { \ 151 STORE(LOAD(pixels), block); \ 152 pixels += line_size; \ 153 block += line_size; \ 154 } while (--h) 155 156#define OP_X2(LOAD, STORE) \ 157 do { \ 158 uint64_t pix1, pix2; \ 159 \ 160 pix1 = LOAD(pixels); \ 161 pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 162 STORE(AVG2(pix1, pix2), block); \ 163 pixels += line_size; \ 164 block += line_size; \ 165 } while (--h) 166 167#define OP_Y2(LOAD, STORE) \ 168 do { \ 169 uint64_t pix = LOAD(pixels); \ 170 do { \ 171 uint64_t next_pix; \ 172 \ 173 pixels += line_size; \ 174 next_pix = LOAD(pixels); \ 175 STORE(AVG2(pix, next_pix), block); \ 176 block += line_size; \ 177 pix = next_pix; \ 178 } while (--h); \ 179 } while (0) 180 181#define OP_XY2(LOAD, STORE) \ 182 do { \ 183 uint64_t pix1 = LOAD(pixels); \ 184 uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 185 uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ 186 + (pix2 & BYTE_VEC(0x03)); \ 187 uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ 188 + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ 189 \ 190 do { \ 191 uint64_t npix1, npix2; \ 192 uint64_t npix_l, npix_h; \ 193 uint64_t avg; \ 194 \ 195 pixels += line_size; \ 196 npix1 = LOAD(pixels); \ 197 npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 198 npix_l = (npix1 & BYTE_VEC(0x03)) \ 199 + (npix2 & BYTE_VEC(0x03)); \ 200 npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ 201 + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ 202 avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ 203 + pix_h + npix_h; \ 204 STORE(avg, block); \ 205 \ 206 block += line_size; \ 207 pix_l = npix_l; \ 208 pix_h = npix_h; \ 209 } while (--h); \ 210 } while (0) 211 212#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ 213static void OPNAME ## _pixels ## SUFF ## _axp \ 214 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 215 int line_size, int h) \ 216{ \ 217 if ((size_t) pixels & 0x7) { \ 218 OPKIND(uldq, STORE); \ 219 } else { \ 220 OPKIND(ldq, STORE); \ 221 } \ 222} \ 223 \ 224static void OPNAME ## _pixels16 ## SUFF ## _axp \ 225 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 226 int line_size, int h) \ 227{ \ 228 OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ 229 OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ 230} 231 232#define PIXOP(OPNAME, STORE) \ 233 MAKE_OP(OPNAME, , OP, STORE) \ 234 MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ 235 MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ 236 MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) 237 238/* Rounding primitives. */ 239#define AVG2 avg2 240#define AVG4 avg4 241#define AVG4_ROUNDER BYTE_VEC(0x02) 242#define STORE(l, b) stq(l, b) 243PIXOP(put, STORE); 244 245#undef STORE 246#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 247PIXOP(avg, STORE); 248 249/* Not rounding primitives. */ 250#undef AVG2 251#undef AVG4 252#undef AVG4_ROUNDER 253#undef STORE 254#define AVG2 avg2_no_rnd 255#define AVG4 avg4_no_rnd 256#define AVG4_ROUNDER BYTE_VEC(0x01) 257#define STORE(l, b) stq(l, b) 258PIXOP(put_no_rnd, STORE); 259 260#undef STORE 261#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 262PIXOP(avg_no_rnd, STORE); 263 264static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, 265 int line_size, int h) 266{ 267 put_pixels_axp_asm(block, pixels, line_size, h); 268 put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); 269} 270 271void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx) 272{ 273 c->put_pixels_tab[0][0] = put_pixels16_axp_asm; 274 c->put_pixels_tab[0][1] = put_pixels16_x2_axp; 275 c->put_pixels_tab[0][2] = put_pixels16_y2_axp; 276 c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; 277 278 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; 279 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; 280 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; 281 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; 282 283 c->avg_pixels_tab[0][0] = avg_pixels16_axp; 284 c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; 285 c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; 286 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; 287 288 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp; 289 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp; 290 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp; 291 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp; 292 293 c->put_pixels_tab[1][0] = put_pixels_axp_asm; 294 c->put_pixels_tab[1][1] = put_pixels_x2_axp; 295 c->put_pixels_tab[1][2] = put_pixels_y2_axp; 296 c->put_pixels_tab[1][3] = put_pixels_xy2_axp; 297 298 c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; 299 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; 300 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; 301 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; 302 303 c->avg_pixels_tab[1][0] = avg_pixels_axp; 304 c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; 305 c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; 306 c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; 307 308 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp; 309 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp; 310 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp; 311 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp; 312 313 c->clear_blocks = clear_blocks_axp; 314 315 /* amask clears all bits that correspond to present features. */ 316 if (amask(AMASK_MVI) == 0) { 317 c->put_pixels_clamped = put_pixels_clamped_mvi_asm; 318 c->add_pixels_clamped = add_pixels_clamped_mvi_asm; 319 320 c->get_pixels = get_pixels_mvi; 321 c->diff_pixels = diff_pixels_mvi; 322 c->sad[0] = pix_abs16x16_mvi_asm; 323 c->sad[1] = pix_abs8x8_mvi; 324 c->pix_abs[0][0] = pix_abs16x16_mvi_asm; 325 c->pix_abs[1][0] = pix_abs8x8_mvi; 326 c->pix_abs[0][1] = pix_abs16x16_x2_mvi; 327 c->pix_abs[0][2] = pix_abs16x16_y2_mvi; 328 c->pix_abs[0][3] = pix_abs16x16_xy2_mvi; 329 } 330 331 put_pixels_clamped_axp_p = c->put_pixels_clamped; 332 add_pixels_clamped_axp_p = c->add_pixels_clamped; 333 334 if (!avctx->lowres && 335 (avctx->idct_algo == FF_IDCT_AUTO || 336 avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) { 337 c->idct_put = ff_simple_idct_put_axp; 338 c->idct_add = ff_simple_idct_add_axp; 339 c->idct = ff_simple_idct_axp; 340 } 341} 342