1/* 2 * Alpha optimized DSP utils 3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavcodec/dsputil.h" 23#include "asm.h" 24 25void ff_simple_idct_axp(DCTELEM *block); 26void ff_simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block); 27void ff_simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block); 28 29void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, 30 int line_size, int h); 31void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 32 int line_size); 33void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 34 int line_size); 35void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 36 int line_size); 37void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, 38 int line_size); 39 40void get_pixels_mvi(DCTELEM *restrict block, 41 const uint8_t *restrict pixels, int line_size); 42void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, 43 int stride); 44int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 45int pix_abs16x16_mvi_asm(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 46int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 47int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 48int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h); 49 50#if 0 51/* These functions were the base for the optimized assembler routines, 52 and remain here for documentation purposes. */ 53static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 54 int line_size) 55{ 56 int i = 8; 57 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 58 59 do { 60 uint64_t shorts0, shorts1; 61 62 shorts0 = ldq(block); 63 shorts0 = maxsw4(shorts0, 0); 64 shorts0 = minsw4(shorts0, clampmask); 65 stl(pkwb(shorts0), pixels); 66 67 shorts1 = ldq(block + 4); 68 shorts1 = maxsw4(shorts1, 0); 69 shorts1 = minsw4(shorts1, clampmask); 70 stl(pkwb(shorts1), pixels + 4); 71 72 pixels += line_size; 73 block += 8; 74 } while (--i); 75} 76 77void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 78 int line_size) 79{ 80 int h = 8; 81 /* Keep this function a leaf function by generating the constants 82 manually (mainly for the hack value ;-). */ 83 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ 84 uint64_t signmask = zap(-1, 0x33); 85 signmask ^= signmask >> 1; /* 0x8000800080008000 */ 86 87 do { 88 uint64_t shorts0, pix0, signs0; 89 uint64_t shorts1, pix1, signs1; 90 91 shorts0 = ldq(block); 92 shorts1 = ldq(block + 4); 93 94 pix0 = unpkbw(ldl(pixels)); 95 /* Signed subword add (MMX paddw). */ 96 signs0 = shorts0 & signmask; 97 shorts0 &= ~signmask; 98 shorts0 += pix0; 99 shorts0 ^= signs0; 100 /* Clamp. */ 101 shorts0 = maxsw4(shorts0, 0); 102 shorts0 = minsw4(shorts0, clampmask); 103 104 /* Next 4. */ 105 pix1 = unpkbw(ldl(pixels + 4)); 106 signs1 = shorts1 & signmask; 107 shorts1 &= ~signmask; 108 shorts1 += pix1; 109 shorts1 ^= signs1; 110 shorts1 = maxsw4(shorts1, 0); 111 shorts1 = minsw4(shorts1, clampmask); 112 113 stl(pkwb(shorts0), pixels); 114 stl(pkwb(shorts1), pixels + 4); 115 116 pixels += line_size; 117 block += 8; 118 } while (--h); 119} 120#endif 121 122static void clear_blocks_axp(DCTELEM *blocks) { 123 uint64_t *p = (uint64_t *) blocks; 124 int n = sizeof(DCTELEM) * 6 * 64; 125 126 do { 127 p[0] = 0; 128 p[1] = 0; 129 p[2] = 0; 130 p[3] = 0; 131 p[4] = 0; 132 p[5] = 0; 133 p[6] = 0; 134 p[7] = 0; 135 p += 8; 136 n -= 8 * 8; 137 } while (n); 138} 139 140static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) 141{ 142 return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 143} 144 145static inline uint64_t avg2(uint64_t a, uint64_t b) 146{ 147 return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 148} 149 150#if 0 151/* The XY2 routines basically utilize this scheme, but reuse parts in 152 each iteration. */ 153static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 154{ 155 uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 156 + ((l2 & ~BYTE_VEC(0x03)) >> 2) 157 + ((l3 & ~BYTE_VEC(0x03)) >> 2) 158 + ((l4 & ~BYTE_VEC(0x03)) >> 2); 159 uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 160 + (l2 & BYTE_VEC(0x03)) 161 + (l3 & BYTE_VEC(0x03)) 162 + (l4 & BYTE_VEC(0x03)) 163 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 164 return r1 + r2; 165} 166#endif 167 168#define OP(LOAD, STORE) \ 169 do { \ 170 STORE(LOAD(pixels), block); \ 171 pixels += line_size; \ 172 block += line_size; \ 173 } while (--h) 174 175#define OP_X2(LOAD, STORE) \ 176 do { \ 177 uint64_t pix1, pix2; \ 178 \ 179 pix1 = LOAD(pixels); \ 180 pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 181 STORE(AVG2(pix1, pix2), block); \ 182 pixels += line_size; \ 183 block += line_size; \ 184 } while (--h) 185 186#define OP_Y2(LOAD, STORE) \ 187 do { \ 188 uint64_t pix = LOAD(pixels); \ 189 do { \ 190 uint64_t next_pix; \ 191 \ 192 pixels += line_size; \ 193 next_pix = LOAD(pixels); \ 194 STORE(AVG2(pix, next_pix), block); \ 195 block += line_size; \ 196 pix = next_pix; \ 197 } while (--h); \ 198 } while (0) 199 200#define OP_XY2(LOAD, STORE) \ 201 do { \ 202 uint64_t pix1 = LOAD(pixels); \ 203 uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 204 uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ 205 + (pix2 & BYTE_VEC(0x03)); \ 206 uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ 207 + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ 208 \ 209 do { \ 210 uint64_t npix1, npix2; \ 211 uint64_t npix_l, npix_h; \ 212 uint64_t avg; \ 213 \ 214 pixels += line_size; \ 215 npix1 = LOAD(pixels); \ 216 npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 217 npix_l = (npix1 & BYTE_VEC(0x03)) \ 218 + (npix2 & BYTE_VEC(0x03)); \ 219 npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ 220 + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ 221 avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ 222 + pix_h + npix_h; \ 223 STORE(avg, block); \ 224 \ 225 block += line_size; \ 226 pix_l = npix_l; \ 227 pix_h = npix_h; \ 228 } while (--h); \ 229 } while (0) 230 231#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ 232static void OPNAME ## _pixels ## SUFF ## _axp \ 233 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 234 int line_size, int h) \ 235{ \ 236 if ((size_t) pixels & 0x7) { \ 237 OPKIND(uldq, STORE); \ 238 } else { \ 239 OPKIND(ldq, STORE); \ 240 } \ 241} \ 242 \ 243static void OPNAME ## _pixels16 ## SUFF ## _axp \ 244 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 245 int line_size, int h) \ 246{ \ 247 OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ 248 OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ 249} 250 251#define PIXOP(OPNAME, STORE) \ 252 MAKE_OP(OPNAME, , OP, STORE) \ 253 MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ 254 MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ 255 MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) 256 257/* Rounding primitives. */ 258#define AVG2 avg2 259#define AVG4 avg4 260#define AVG4_ROUNDER BYTE_VEC(0x02) 261#define STORE(l, b) stq(l, b) 262PIXOP(put, STORE); 263 264#undef STORE 265#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 266PIXOP(avg, STORE); 267 268/* Not rounding primitives. */ 269#undef AVG2 270#undef AVG4 271#undef AVG4_ROUNDER 272#undef STORE 273#define AVG2 avg2_no_rnd 274#define AVG4 avg4_no_rnd 275#define AVG4_ROUNDER BYTE_VEC(0x01) 276#define STORE(l, b) stq(l, b) 277PIXOP(put_no_rnd, STORE); 278 279#undef STORE 280#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 281PIXOP(avg_no_rnd, STORE); 282 283void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, 284 int line_size, int h) 285{ 286 put_pixels_axp_asm(block, pixels, line_size, h); 287 put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); 288} 289 290void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx) 291{ 292 c->put_pixels_tab[0][0] = put_pixels16_axp_asm; 293 c->put_pixels_tab[0][1] = put_pixels16_x2_axp; 294 c->put_pixels_tab[0][2] = put_pixels16_y2_axp; 295 c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; 296 297 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; 298 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; 299 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; 300 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; 301 302 c->avg_pixels_tab[0][0] = avg_pixels16_axp; 303 c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; 304 c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; 305 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; 306 307 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp; 308 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp; 309 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp; 310 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp; 311 312 c->put_pixels_tab[1][0] = put_pixels_axp_asm; 313 c->put_pixels_tab[1][1] = put_pixels_x2_axp; 314 c->put_pixels_tab[1][2] = put_pixels_y2_axp; 315 c->put_pixels_tab[1][3] = put_pixels_xy2_axp; 316 317 c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; 318 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; 319 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; 320 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; 321 322 c->avg_pixels_tab[1][0] = avg_pixels_axp; 323 c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; 324 c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; 325 c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; 326 327 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp; 328 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp; 329 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp; 330 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp; 331 332 c->clear_blocks = clear_blocks_axp; 333 334 /* amask clears all bits that correspond to present features. */ 335 if (amask(AMASK_MVI) == 0) { 336 c->put_pixels_clamped = put_pixels_clamped_mvi_asm; 337 c->add_pixels_clamped = add_pixels_clamped_mvi_asm; 338 339 c->get_pixels = get_pixels_mvi; 340 c->diff_pixels = diff_pixels_mvi; 341 c->sad[0] = pix_abs16x16_mvi_asm; 342 c->sad[1] = pix_abs8x8_mvi; 343 c->pix_abs[0][0] = pix_abs16x16_mvi_asm; 344 c->pix_abs[1][0] = pix_abs8x8_mvi; 345 c->pix_abs[0][1] = pix_abs16x16_x2_mvi; 346 c->pix_abs[0][2] = pix_abs16x16_y2_mvi; 347 c->pix_abs[0][3] = pix_abs16x16_xy2_mvi; 348 } 349 350 put_pixels_clamped_axp_p = c->put_pixels_clamped; 351 add_pixels_clamped_axp_p = c->add_pixels_clamped; 352 353 if (!avctx->lowres && 354 (avctx->idct_algo == FF_IDCT_AUTO || 355 avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) { 356 c->idct_put = ff_simple_idct_put_axp; 357 c->idct_add = ff_simple_idct_add_axp; 358 c->idct = ff_simple_idct_axp; 359 } 360} 361