1/* 2 * Alpha optimized DSP utils 3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/attributes.h" 23#include "libavcodec/hpeldsp.h" 24#include "hpeldsp_alpha.h" 25#include "asm.h" 26 27static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) 28{ 29 return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 30} 31 32static inline uint64_t avg2(uint64_t a, uint64_t b) 33{ 34 return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); 35} 36 37#if 0 38/* The XY2 routines basically utilize this scheme, but reuse parts in 39 each iteration. */ 40static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) 41{ 42 uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) 43 + ((l2 & ~BYTE_VEC(0x03)) >> 2) 44 + ((l3 & ~BYTE_VEC(0x03)) >> 2) 45 + ((l4 & ~BYTE_VEC(0x03)) >> 2); 46 uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) 47 + (l2 & BYTE_VEC(0x03)) 48 + (l3 & BYTE_VEC(0x03)) 49 + (l4 & BYTE_VEC(0x03)) 50 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); 51 return r1 + r2; 52} 53#endif 54 55#define OP(LOAD, STORE) \ 56 do { \ 57 STORE(LOAD(pixels), block); \ 58 pixels += line_size; \ 59 block += line_size; \ 60 } while (--h) 61 62#define OP_X2(LOAD, STORE) \ 63 do { \ 64 uint64_t pix1, pix2; \ 65 \ 66 pix1 = LOAD(pixels); \ 67 pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 68 STORE(AVG2(pix1, pix2), block); \ 69 pixels += line_size; \ 70 block += line_size; \ 71 } while (--h) 72 73#define OP_Y2(LOAD, STORE) \ 74 do { \ 75 uint64_t pix = LOAD(pixels); \ 76 do { \ 77 uint64_t next_pix; \ 78 \ 79 pixels += line_size; \ 80 next_pix = LOAD(pixels); \ 81 STORE(AVG2(pix, next_pix), block); \ 82 block += line_size; \ 83 pix = next_pix; \ 84 } while (--h); \ 85 } while (0) 86 87#define OP_XY2(LOAD, STORE) \ 88 do { \ 89 uint64_t pix1 = LOAD(pixels); \ 90 uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 91 uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ 92 + (pix2 & BYTE_VEC(0x03)); \ 93 uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ 94 + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ 95 \ 96 do { \ 97 uint64_t npix1, npix2; \ 98 uint64_t npix_l, npix_h; \ 99 uint64_t avg; \ 100 \ 101 pixels += line_size; \ 102 npix1 = LOAD(pixels); \ 103 npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ 104 npix_l = (npix1 & BYTE_VEC(0x03)) \ 105 + (npix2 & BYTE_VEC(0x03)); \ 106 npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ 107 + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ 108 avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ 109 + pix_h + npix_h; \ 110 STORE(avg, block); \ 111 \ 112 block += line_size; \ 113 pix_l = npix_l; \ 114 pix_h = npix_h; \ 115 } while (--h); \ 116 } while (0) 117 118#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ 119static void OPNAME ## _pixels ## SUFF ## _axp \ 120 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 121 ptrdiff_t line_size, int h) \ 122{ \ 123 if ((size_t) pixels & 0x7) { \ 124 OPKIND(uldq, STORE); \ 125 } else { \ 126 OPKIND(ldq, STORE); \ 127 } \ 128} \ 129 \ 130static void OPNAME ## _pixels16 ## SUFF ## _axp \ 131 (uint8_t *restrict block, const uint8_t *restrict pixels, \ 132 ptrdiff_t line_size, int h) \ 133{ \ 134 OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ 135 OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ 136} 137 138#define PIXOP(OPNAME, STORE) \ 139 MAKE_OP(OPNAME, , OP, STORE) \ 140 MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ 141 MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ 142 MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) 143 144/* Rounding primitives. */ 145#define AVG2 avg2 146#define AVG4 avg4 147#define AVG4_ROUNDER BYTE_VEC(0x02) 148#define STORE(l, b) stq(l, b) 149PIXOP(put, STORE); 150 151#undef STORE 152#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 153PIXOP(avg, STORE); 154 155/* Not rounding primitives. */ 156#undef AVG2 157#undef AVG4 158#undef AVG4_ROUNDER 159#undef STORE 160#define AVG2 avg2_no_rnd 161#define AVG4 avg4_no_rnd 162#define AVG4_ROUNDER BYTE_VEC(0x01) 163#define STORE(l, b) stq(l, b) 164PIXOP(put_no_rnd, STORE); 165 166#undef STORE 167#define STORE(l, b) stq(AVG2(l, ldq(b)), b); 168PIXOP(avg_no_rnd, STORE); 169 170static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, 171 ptrdiff_t line_size, int h) 172{ 173 put_pixels_axp_asm(block, pixels, line_size, h); 174 put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); 175} 176 177av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags) 178{ 179 c->put_pixels_tab[0][0] = put_pixels16_axp_asm; 180 c->put_pixels_tab[0][1] = put_pixels16_x2_axp; 181 c->put_pixels_tab[0][2] = put_pixels16_y2_axp; 182 c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; 183 184 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; 185 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; 186 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; 187 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; 188 189 c->avg_pixels_tab[0][0] = avg_pixels16_axp; 190 c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; 191 c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; 192 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; 193 194 c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; 195 c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; 196 c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; 197 c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; 198 199 c->put_pixels_tab[1][0] = put_pixels_axp_asm; 200 c->put_pixels_tab[1][1] = put_pixels_x2_axp; 201 c->put_pixels_tab[1][2] = put_pixels_y2_axp; 202 c->put_pixels_tab[1][3] = put_pixels_xy2_axp; 203 204 c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; 205 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; 206 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; 207 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; 208 209 c->avg_pixels_tab[1][0] = avg_pixels_axp; 210 c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; 211 c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; 212 c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; 213} 214