1/* 2 * AltiVec-enhanced yuv2yuvX 3 * 4 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> 5 * based on the equivalent C code in swscale.c 6 * 7 * This file is part of Libav. 8 * 9 * Libav is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU Lesser General Public 11 * License as published by the Free Software Foundation; either 12 * version 2.1 of the License, or (at your option) any later version. 13 * 14 * Libav is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * Lesser General Public License for more details. 18 * 19 * You should have received a copy of the GNU Lesser General Public 20 * License along with Libav; if not, write to the Free Software 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 22 */ 23 24#include <inttypes.h> 25#include "config.h" 26#include "libswscale/swscale.h" 27#include "libswscale/swscale_internal.h" 28#include "libavutil/cpu.h" 29#include "yuv2rgb_altivec.h" 30 31#define vzero vec_splat_s32(0) 32 33static inline void 34altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) 35{ 36 register int i; 37 vector unsigned int altivec_vectorShiftInt19 = 38 vec_add(vec_splat_u32(10), vec_splat_u32(9)); 39 if ((uintptr_t)dest % 16) { 40 /* badly aligned store, we force store alignment */ 41 /* and will handle load misalignment on val w/ vec_perm */ 42 vector unsigned char perm1; 43 vector signed int v1; 44 for (i = 0 ; (i < dstW) && 45 (((uintptr_t)dest + i) % 16) ; i++) { 46 int t = val[i] >> 19; 47 dest[i] = (t < 0) ? 0 : ((t > 255) ? 
255 : t); 48 } 49 perm1 = vec_lvsl(i << 2, val); 50 v1 = vec_ld(i << 2, val); 51 for ( ; i < (dstW - 15); i+=16) { 52 int offset = i << 2; 53 vector signed int v2 = vec_ld(offset + 16, val); 54 vector signed int v3 = vec_ld(offset + 32, val); 55 vector signed int v4 = vec_ld(offset + 48, val); 56 vector signed int v5 = vec_ld(offset + 64, val); 57 vector signed int v12 = vec_perm(v1, v2, perm1); 58 vector signed int v23 = vec_perm(v2, v3, perm1); 59 vector signed int v34 = vec_perm(v3, v4, perm1); 60 vector signed int v45 = vec_perm(v4, v5, perm1); 61 62 vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19); 63 vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19); 64 vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19); 65 vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19); 66 vector unsigned short vs1 = vec_packsu(vA, vB); 67 vector unsigned short vs2 = vec_packsu(vC, vD); 68 vector unsigned char vf = vec_packsu(vs1, vs2); 69 vec_st(vf, i, dest); 70 v1 = v5; 71 } 72 } else { // dest is properly aligned, great 73 for (i = 0; i < (dstW - 15); i+=16) { 74 int offset = i << 2; 75 vector signed int v1 = vec_ld(offset, val); 76 vector signed int v2 = vec_ld(offset + 16, val); 77 vector signed int v3 = vec_ld(offset + 32, val); 78 vector signed int v4 = vec_ld(offset + 48, val); 79 vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19); 80 vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19); 81 vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19); 82 vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19); 83 vector unsigned short vs1 = vec_packsu(v5, v6); 84 vector unsigned short vs2 = vec_packsu(v7, v8); 85 vector unsigned char vf = vec_packsu(vs1, vs2); 86 vec_st(vf, i, dest); 87 } 88 } 89 for ( ; i < dstW ; i++) { 90 int t = val[i] >> 19; 91 dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t); 92 } 93} 94 95//FIXME remove the usage of scratch buffers. 
/**
 * AltiVec vertical scaler/ditherer for one plane.
 *
 * For every output pixel i this computes
 *     val[i] = (dither[(i + offset) & 7] << 12) + sum_j src[j][i] * filter[j]
 * and then delegates the >> 19 shift, the clip to [0, 255] and the store
 * to altivec_packIntArrayToCharArray().
 *
 * @param filter     vertical filter coefficients, filterSize int16_t entries
 * @param filterSize number of source lines blended per output line
 * @param src        filterSize pointers to the source lines
 * @param dest       output line
 * @param dstW       width in pixels
 * @param dither     8-entry dither table seeding the accumulators
 * @param offset     phase into the dither table
 */
static void
yuv2planeX_altivec(const int16_t *filter, int filterSize,
                   const int16_t **src, uint8_t *dest, int dstW,
                   const uint8_t *dither, int offset)
{
    register int i, j;
    {
        /* Scratch accumulator line (see FIXME above about removing it). */
        DECLARE_ALIGNED(16, int, val)[dstW];

        /* Seed every accumulator with its dither amplitude. */
        for (i=0; i<dstW; i++)
            val[i] = dither[(i + offset) & 7] << 12;

        /* One pass over the whole line per filter tap. */
        for (j = 0; j < filterSize; j++) {
            /* Load filter[j], realign it, then broadcast it to all
             * 8 short lanes of vLumFilter. */
            vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
            vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
            vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
            vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

            /* Software-pipelined realignment of src[j]. */
            perm = vec_lvsl(0, src[j]);
            l1 = vec_ld(0, src[j]);

            /* 8 pixels per iteration: widen short*short products to
             * int and add into the two val vectors. */
            for (i = 0; i < (dstW - 7); i+=8) {
                /* NOTE(review): shadows the function's 'offset'
                 * parameter (the dither phase); here it is the byte
                 * offset of val[i]. */
                int offset = i << 2;
                vector signed short l2 = vec_ld((i << 1) + 16, src[j]);

                vector signed int v1 = vec_ld(offset, val);
                vector signed int v2 = vec_ld(offset + 16, val);

                vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]

                /* mule/mulo give even/odd 32-bit products; the merges
                 * restore the original element order. */
                vector signed int i1 = vec_mule(vLumFilter, ls);
                vector signed int i2 = vec_mulo(vLumFilter, ls);

                vector signed int vf1 = vec_mergeh(i1, i2);
                vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]

                vector signed int vo1 = vec_add(v1, vf1);
                vector signed int vo2 = vec_add(v2, vf2);

                vec_st(vo1, offset, val);
                vec_st(vo2, offset + 16, val);

                l1 = l2; /* carry the lookahead into the next iteration */
            }
            /* Scalar tail for the last (dstW % 8) pixels. */
            for ( ; i < dstW; i++) {
                val[i] += src[j][i] * filter[j];
            }
        }
        /* Shift, clip and store the accumulated line. */
        altivec_packIntArrayToCharArray(val, dest, dstW);
    }
}

/**
 * AltiVec horizontal scaler. For each output pixel i:
 *     dst[i] = min((sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7,
 *                  (1 << 15) - 1)
 *
 * Dedicated vector paths cover filterSize 4, 8 and 16; the default
 * vector path handles larger sizes in chunks of 16 taps plus one
 * 8-tap tail. Any filterSize not divisible by 4 uses scalar code.
 *
 * NOTE(review): the default path only covers filterSize of the form
 * 16k or 16k + 8; a filterSize of e.g. 12 or 20 (divisible by 4 but
 * not by 8) would silently drop the last 4 taps -- presumably callers
 * never request such sizes, but this should be confirmed.
 */
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
                                const uint8_t *src, const int16_t *filter,
                                const int32_t *filterPos, int filterSize)
{
    register int i;
    /* 16-byte-aligned spill slot: vec_sums() leaves the total in the
     * last int element, read back through tempo[3]. */
    DECLARE_ALIGNED(16, int, tempo)[4];

    if (filterSize % 4) {
        /* Scalar fallback for filter sizes the vector code can't do. */
        for (i=0; i<dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val = 0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>7, (1<<15)-1);
        }
    }
    else
        switch (filterSize) {
        case 4:
        {
            for (i=0; i<dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1, src_vF;
                vector signed short src_v, filter_v;
                vector signed int val_vEven, val_s;
                /* NOTE(review): src_v1 is deliberately left unset when
                 * the 4 source bytes fit inside src_v0 (offset <= 12);
                 * vec_perm then takes all used lanes from src_v0, but
                 * this is formally a read of an uninitialized value. */
                if ((((uintptr_t)src + srcPos) % 16) > 12) {
                    src_v1 = vec_ld(srcPos + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                // now put our elements in the even slots
                src_v = vec_mergeh(src_v, (vector signed short)vzero);

                filter_v = vec_ld(i << 3, filter);
                // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

                // The neat trick: We only care for half the elements,
                // high or low depending on (i<<3)%16 (it's 0 or 8 here),
                // and we're going to use vec_mule, so we choose
                // carefully how to "unpack" the elements into the even slots.
                if ((i << 3) % 16)
                    filter_v = vec_mergel(filter_v, (vector signed short)vzero);
                else
                    filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

                val_vEven = vec_mule(src_v, filter_v);
                val_s = vec_sums(val_vEven, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
            }
        }
        break;

        case 8:
        {
            for (i=0; i<dstW; i++) {
                register int srcPos = filterPos[i];

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1, src_vF;
                vector signed short src_v, filter_v;
                vector signed int val_v, val_s;
                /* Same intentional conditional load as in case 4,
                 * here for the 8 bytes needed (offset > 8). */
                if ((((uintptr_t)src + srcPos) % 16) > 8) {
                    src_v1 = vec_ld(srcPos + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                filter_v = vec_ld(i << 4, filter);
                // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)

                /* 8 multiply-accumulates in one vec_msums(). */
                val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
                val_s = vec_sums(val_v, vzero);
                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
            }
        }
        break;

        case 16:
        {
            for (i=0; i<dstW; i++) {
                register int srcPos = filterPos[i];

                /* 16 taps always span two vectors; both loads are
                 * unconditional here. */
                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
                vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

                vector signed short src_vA = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                vector signed short src_vB = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                vector signed short filter_v0 = vec_ld(i << 5, filter);
                vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
                // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)

                vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
                vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);

                vector signed int val_s = vec_sums(val_v, vzero);

                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
            }
        }
        break;

        default:
        {
            for (i=0; i<dstW; i++) {
                register int j;
                register int srcPos = filterPos[i];

                vector signed int val_s, val_v = (vector signed int)vzero;
                /* Software-pipelined realignment for both the filter
                 * row and the source window. */
                vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
                vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);

                vector unsigned char src_v0 = vec_ld(srcPos, src);
                vector unsigned char permS = vec_lvsl(srcPos, src);

                /* Main loop: 16 taps per iteration. */
                for (j = 0 ; j < filterSize - 15; j += 16) {
                    vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
                    vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

                    vector signed short src_vA = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                    vector signed short src_vB = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                    vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                    vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
                    vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
                    vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);

                    vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
                    val_v = vec_msums(src_vB, filter_v1, val_acc);

                    /* Carry the lookahead vectors into the next round. */
                    filter_v0R = filter_v2R;
                    src_v0 = src_v1;
                }

                /* 8-tap tail (filterSize == 16k + 8). */
                if (j < filterSize-7) {
                    // loading src_v0 is useless, it's already done above
                    //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
                    vector unsigned char src_v1, src_vF;
                    vector signed short src_v, filter_v1R, filter_v;
                    /* NOTE(review): same intentional conditional load /
                     * uninitialized src_v1 pattern as in case 8. */
                    if ((((uintptr_t)src + srcPos) % 16) > 8) {
                        src_v1 = vec_ld(srcPos + j + 16, src);
                    }
                    src_vF = vec_perm(src_v0, src_v1, permS);

                    src_v = // vec_unpackh sign-extends...
                        (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                    // loading filter_v0R is useless, it's already done above
                    //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
                    filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                    filter_v = vec_perm(filter_v0R, filter_v1R, permF);

                    val_v = vec_msums(src_v, filter_v, val_v);
                }

                /* Horizontal sum, spill, clip to signed 15 bits. */
                val_s = vec_sums(val_v, vzero);

                vec_st(val_s, 0, tempo);
                dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
            }

        }
    }
}

/**
 * Install the AltiVec-accelerated entry points on the SwsContext.
 * Does nothing unless runtime CPU detection reports AltiVec support.
 */
void ff_sws_init_swScale_altivec(SwsContext *c)
{
    enum PixelFormat dstFormat = c->dstFormat;

    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    /* Horizontal scaling: 8-bit input, up to 10-bit output only. */
    if (c->srcBpc == 8 && c->dstBpc <= 10) {
        c->hyScale = c->hcScale = hScale_altivec_real;
    }
    /* Vertical scaling: 8-bit planar formats without alpha only
     * (the AltiVec path has no NV12/NV21 interleaving support). */
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
        dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21 &&
        !c->alpPixBuf) {
        c->yuv2planeX = yuv2planeX_altivec;
    }

    /* The following list of supported dstFormat values should
     * match what's found in the body of ff_yuv2packedX_altivec() */
    if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
        switch (c->dstFormat) {
        case PIX_FMT_ABGR: c->yuv2packedX = ff_yuv2abgr_X_altivec; break;
        case PIX_FMT_BGRA: c->yuv2packedX = ff_yuv2bgra_X_altivec; break;
        case PIX_FMT_ARGB: c->yuv2packedX = ff_yuv2argb_X_altivec; break;
        case PIX_FMT_RGBA: c->yuv2packedX = ff_yuv2rgba_X_altivec; break;
        case PIX_FMT_BGR24: c->yuv2packedX = ff_yuv2bgr24_X_altivec; break;
        case PIX_FMT_RGB24: c->yuv2packedX = ff_yuv2rgb24_X_altivec; break;
        }
    }
}