/*
 * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#if !defined(JAVA2D_NO_MLIB) || defined(MLIB_ADD_SUFF)

#include <vis_proto.h>

#include "java2d_Mlib.h"
#include "vis_AlphaMacros.h"

/***************************************************************/

extern mlib_d64 vis_d64_div_tbl[256];

/***************************************************************/

/* 16-bit gray from 8-bit r, g, b; the weights sum to 65793, so */
/* (255, 255, 255) maps to 65535 after the >> 8                 */
#define RGB2GRAY(r, g, b)    \
    (((19672 * (r)) + (38621 * (g)) + (7500 * (b))) >> 8)

/***************************************************************/

/* halved RGB2GRAY weights duplicated into both 16-bit halves of each */
/* word; the last entry is the bias added before vis_fpackfix in      */
/* GRAY_U16                                                            */
static const mlib_s32 RGB_weight[] = {
    (19672/2) | ((19672/2) << 16),
    (38621/2) | ((38621/2) << 16),
    ( 7500/2) | (( 7500/2) << 16),
    /*(1 << 6)*/ - (1 << 22)
};

/***************************************************************/

/* common VIS locals: the GSR scale factor (16 - 7) drives vis_fpackfix */
/* in GRAY_U16, the alignaddr offset 6 lets vis_faligndata shift in one */
/* 16-bit lane per load                                                  */
#define RGB_VARS                                        \
    mlib_d64 r, g, b, ar, gb, s02, s13;                 \
    mlib_f32 ff;                                        \
    mlib_f32 alpha = ((mlib_f32*)RGB_weight)[0];        \
    mlib_f32 beta  = ((mlib_f32*)RGB_weight)[1];        \
    mlib_f32 gamma = ((mlib_f32*)RGB_weight)[2];        \
    mlib_f32 fzeros = vis_fzeros();                     \
    mlib_d64 d_half = vis_to_double_dup(RGB_weight[3]); \
    mlib_f32 mask8000 = vis_to_float(0x80008000);       \
                                                        \
    vis_write_gsr(((16 - 7) << 3) | 6)

/***************************************************************/

/* weighted sum of two pixels' r, g, b lanes; the d_half bias maps the */
/* sum into the signed range handled by vis_fpackfix, and the final    */
/* fxors flips the sign bit, leaving two unsigned 16-bit gray values   */
#define GRAY_U16(ff, r, g, b)           \
{                                       \
    mlib_d64 dr, dg, db;                \
    dr = vis_fmuld8ulx16(r, alpha);     \
    dg = vis_fmuld8ulx16(g, beta);      \
    db = vis_fmuld8ulx16(b, gamma);     \
    dr = vis_fpadd32(dr, dg);           \
    db = vis_fpadd32(db, d_half);       \
    dr = vis_fpadd32(dr, db);           \
    ff = vis_fpackfix(dr);              \
    ff = vis_fxors(ff, mask8000);       \
}

/***************************************************************/

/* shift the b, g, r bytes of the 3-byte pixel at src[ind] into the */
/* low lanes of the b, g, r accumulators                            */
#define LOAD_BGR(ind)                                      \
    b = vis_faligndata(vis_ld_u8(src + (ind    )), b);     \
    g = vis_faligndata(vis_ld_u8(src + (ind + 1)), g);     \
    r = vis_faligndata(vis_ld_u8(src + (ind + 2)), r)

/***************************************************************/

void ADD_SUFF(IntArgbToUshortGrayConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_s32 j;
    RGB_VARS;

    if (srcScan == 4*width && dstScan == 2*width) {
        width *= height;
        height = 1;
    }

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;

        dst_end = dst + width;

        while (((mlib_s32)dst & 3) && dst < dst_end) {
            r = vis_ld_u8((mlib_u8*)src + 1);
            g = vis_ld_u8((mlib_u8*)src + 2);
            b = vis_ld_u8((mlib_u8*)src + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
            src++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 2); dst += 2) {
            s02 = vis_fpmerge(src[0], src[1]);
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
            *(mlib_f32*)dst = ff;
            src += 2;
        }

        while (dst < dst_end) {
            r = vis_ld_u8((mlib_u8*)src + 1);
            g = vis_ld_u8((mlib_u8*)src + 2);
            b = vis_ld_u8((mlib_u8*)src + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
            src++;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}

/***************************************************************/

void ADD_SUFF(ThreeByteBgrToUshortGrayConvert)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_s32 j;
    RGB_VARS;

    if (srcScan == 3*width && dstScan == 2*width) {
        width *= height;
        height = 1;
    }

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_u16 *dst = dstBase;

        dst_end = dst + width;

        while (((mlib_s32)dst & 3) && dst < dst_end) {
            b = vis_ld_u8(src);
            g = vis_ld_u8(src + 1);
            r = vis_ld_u8(src + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
            src += 3;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 2); dst += 2) {
            LOAD_BGR(3);
            LOAD_BGR(0);
            GRAY_U16(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
            *(mlib_f32*)dst = ff;
            src += 3*2;
        }

        while (dst < dst_end) {
            b = vis_ld_u8(src);
            g = vis_ld_u8(src + 1);
            r = vis_ld_u8(src + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
            src += 3;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}

/***************************************************************/

void ADD_SUFF(IntArgbToUshortGrayScaleConvert)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_s32 i, j;
    RGB_VARS;

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_s32 tmpsxloc = sxloc;

        PTR_ADD(src, (syloc >> shift) * srcScan);

        dst_end = dst + width;

        while (((mlib_s32)dst & 3) && dst < dst_end) {
            i = tmpsxloc >> shift;
            tmpsxloc += sxinc;
            r = vis_ld_u8((mlib_u8*)(src + i) + 1);
            g = vis_ld_u8((mlib_u8*)(src + i) + 2);
            b = vis_ld_u8((mlib_u8*)(src + i) + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 2); dst += 2) {
            s02 = vis_fpmerge(src[(tmpsxloc        ) >> shift],
                              src[(tmpsxloc + sxinc) >> shift]);
            tmpsxloc += 2*sxinc;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
            *(mlib_f32*)dst = ff;
        }

        while (dst < dst_end) {
            i = tmpsxloc >> shift;
            tmpsxloc += sxinc;
            r = vis_ld_u8((mlib_u8*)(src + i) + 1);
            g = vis_ld_u8((mlib_u8*)(src + i) + 2);
            b = vis_ld_u8((mlib_u8*)(src + i) + 3);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
        }

        PTR_ADD(dstBase, dstScan);
        syloc += syinc;
    }
}

/***************************************************************/

void ADD_SUFF(ThreeByteBgrToUshortGrayScaleConvert)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_s32 j, i0, i1;
    RGB_VARS;

    for (j = 0; j < height; j++) {
        mlib_u8  *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_s32 tmpsxloc = sxloc;

        PTR_ADD(src, (syloc >> shift) * srcScan);

        dst_end = dst + width;
        while (((mlib_s32)dst & 3) && dst < dst_end) {
            i0 = 3*(tmpsxloc >> shift);
            tmpsxloc += sxinc;
            b = vis_ld_u8(src + i0);
            g = vis_ld_u8(src + i0 + 1);
            r = vis_ld_u8(src + i0 + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 2); dst += 2) {
            i0 = 3*(tmpsxloc >> shift);
            tmpsxloc += sxinc;
            i1 = 3*(tmpsxloc >> shift);
            tmpsxloc += sxinc;
            LOAD_BGR(i1);
            LOAD_BGR(i0);
            GRAY_U16(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
            *(mlib_f32*)dst = ff;
        }

        while (dst < dst_end) {
            i0 = 3*(tmpsxloc >> shift);
            tmpsxloc += sxinc;
            b = vis_ld_u8(src + i0);
            g = vis_ld_u8(src + i0 + 1);
            r = vis_ld_u8(src + i0 + 2);
            GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
            vis_st_u16(D64_FROM_F32x2(ff), dst);
            dst++;
        }

        PTR_ADD(dstBase, dstScan);
        syloc += syinc;
    }
}

/***************************************************************/

#if 0

void ADD_SUFF(IntArgbBmToUshortGrayXparOver)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dzero = vis_fzero();
    mlib_f32 f0, f1;
    mlib_s32 i, j, mask0, mask1;
    RGB_VARS;

    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_u16 *dst = dstBase;

            for (i = 0; i < width; i++) {
                if (src[4*i]) {
                    dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2],
                                      src[4*i + 3]);
                }
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;

        dst_end = dst + width;

        while (((mlib_s32)dst & 7) && dst < dst_end) {
            if (*(mlib_u8*)src) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            }
            dst++;
            src++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask0 = vis_fcmpne16(ar, dzero) & 0xC;
            GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask1 = vis_fcmpne16(ar, dzero) >> 2;
            GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
        }

        while (dst < dst_end) {
            if (*(mlib_u8*)src) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            }
            dst++;
            src++;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}

/***************************************************************/

void ADD_SUFF(IntArgbBmToUshortGrayXparBgCopy)(BCOPY_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dzero = vis_fzero(), d_bgpixel;
    mlib_f32 f0, f1;
    mlib_s32 i, j, mask0, mask1;
    RGB_VARS;

    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_u8  *src = srcBase;
            mlib_u16 *dst = dstBase;
            mlib_s32 srcpixel, r, g, b;

            for (i = 0; i < width; i++) {
                if (src[4*i]) {
                    dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2],
                                      src[4*i + 3]);
                } else {
                    dst[i] = bgpixel;
                }
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    D64_FROM_U16x4(d_bgpixel, bgpixel);

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;

        dst_end = dst + width;
        while (((mlib_s32)dst & 7) && dst < dst_end) {
            if (*(mlib_u8*)src) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            } else {
                *dst = bgpixel;
            }
            dst++;
            src++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask0 = vis_fcmpne16(ar, dzero) & 0xC;
            GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask1 = vis_fcmpne16(ar, dzero) >> 2;
            GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            *(mlib_d64*)dst = d_bgpixel;
            vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
        }

        while (dst < dst_end) {
            if (*(mlib_u8*)src) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            } else {
                *dst = bgpixel;
            }
            dst++;
            src++;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}

#endif

/***************************************************************/

void ADD_SUFF(IntArgbToUshortGrayXorBlit)(BLIT_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dd, d_xorpixel, d_alphamask, dzero = vis_fzero();
    mlib_f32 f0, f1;
    mlib_s32 i, j, mask0, mask1;
    jint  xorpixel = pCompInfo->details.xorPixel;
    juint alphamask = pCompInfo->alphaMask;
    RGB_VARS;

    if (width < 8) {
        for (j = 0; j < height; j++) {
            mlib_s32 *src = srcBase;
            mlib_u16 *dst = dstBase;
            mlib_s32 srcpixel, r, g, b;

            for (i = 0; i < width; i++) {
                srcpixel = src[i];
                if (srcpixel >= 0) continue;
                b = (srcpixel) & 0xff;
                g = (srcpixel >> 8) & 0xff;
                r = (srcpixel >> 16) & 0xff;
                srcpixel = (77*r + 150*g + 29*b + 128) / 256;
                dst[i] ^= (((srcpixel) ^ (xorpixel)) & ~(alphamask));
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
        return;
    }

    D64_FROM_U16x4(d_xorpixel, xorpixel);
    D64_FROM_U16x4(d_alphamask, alphamask);

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;

        dst_end = dst + width;

        while (((mlib_s32)dst & 7) && dst < dst_end) {
            if ((*(mlib_u8*)src) & 0x80) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
                dd = vis_fandnot(d_alphamask, dd);
                vis_st_u16(vis_fxor(vis_ld_u8(dst), dd), dst);
            }
            dst++;
            src++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask0 = vis_fcmplt16(ar, dzero) & 0xC;
            GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            s02 = vis_fpmerge(src[0], src[1]);
            src += 2;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask1 = vis_fcmplt16(ar, dzero) >> 2;
            GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            dd = vis_freg_pair(f0, f1);
            dd = vis_fandnot(d_alphamask, vis_fxor(dd, d_xorpixel));
            vis_pst_16(vis_fxor(*(mlib_d64*)dst, dd), dst, mask0 | mask1);
        }

        while (dst < dst_end) {
            if ((*(mlib_u8*)src) & 0x80) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
                dd = vis_fandnot(d_alphamask, dd);
                vis_st_u16(vis_fxor(vis_ld_u8(dst), dd), dst);
            }
            dst++;
            src++;
        }

        PTR_ADD(dstBase, dstScan);
        PTR_ADD(srcBase, srcScan);
    }
}

/***************************************************************/

void ADD_SUFF(IntArgbBmToUshortGrayScaleXparOver)(SCALE_PARAMS)
{
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_d64 dzero = vis_fzero();
    mlib_f32 f0, f1;
    mlib_s32 i, j, mask0, mask1;
    RGB_VARS;

    for (j = 0; j < height; j++) {
        mlib_f32 *src = srcBase;
        mlib_u16 *dst = dstBase;
        mlib_u16 *dst_end;
        mlib_s32 tmpsxloc = sxloc;

        PTR_ADD(src, (syloc >> shift) * srcScan);

        dst_end = dst + width;

        while (((mlib_s32)dst & 7) && dst < dst_end) {
            i = tmpsxloc >> shift;
            tmpsxloc += sxinc;
            if (*(mlib_u8*)(src + i)) {
                r = vis_ld_u8((mlib_u8*)(src + i) + 1);
                g = vis_ld_u8((mlib_u8*)(src + i) + 2);
                b = vis_ld_u8((mlib_u8*)(src + i) + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            }
            dst++;
        }

#pragma pipeloop(0)
        for (; dst <= (dst_end - 4); dst += 4) {
            s02 = vis_fpmerge(src[(tmpsxloc        ) >> shift],
                              src[(tmpsxloc + sxinc) >> shift]);
            tmpsxloc += 2*sxinc;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask0 = vis_fcmpne16(ar, dzero) & 0xC;
            GRAY_U16(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            s02 = vis_fpmerge(src[(tmpsxloc        ) >> shift],
                              src[(tmpsxloc + sxinc) >> shift]);
            tmpsxloc += 2*sxinc;
            ar = vis_fpmerge(fzeros, vis_read_hi(s02));
            gb = vis_fpmerge(fzeros, vis_read_lo(s02));
            mask1 = vis_fcmpne16(ar, dzero) >> 2;
            GRAY_U16(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

            vis_pst_16(vis_freg_pair(f0, f1), dst, mask0 | mask1);
        }

        while (dst < dst_end) {
            i = tmpsxloc >> shift;
            tmpsxloc += sxinc;
            if (*(mlib_u8*)(src + i)) {
                r = vis_ld_u8((mlib_u8*)(src + i) + 1);
                g = vis_ld_u8((mlib_u8*)(src + i) + 2);
                b = vis_ld_u8((mlib_u8*)(src + i) + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                vis_st_u16(D64_FROM_F32x2(ff), dst);
            }
            dst++;
        }

        PTR_ADD(dstBase, dstScan);
        syloc += syinc;
    }
}

/***************************************************************/

#define TBL_MUL ((mlib_s16*)vis_mul8s_tbl + 1)
#define TBL_DIV ((mlib_u8*)vis_div8_tbl + 2)

void ADD_SUFF(IntArgbToUshortGraySrcOverMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u8  *mul8_extra;
    mlib_u16 *dst_end;
    mlib_d64 srcAx4, dd, d0, d1;
    mlib_d64 done = vis_to_double_dup(0x7fff7fff);
    mlib_s32 j, srcA0, srcA1, srcA2, srcA3;
    RGB_VARS;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);

    mul8_extra = mul8table[extraA];

    if (pMask != NULL) {
        pMask += maskOff;

        if (srcScan == 4*width && dstScan == 2*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        maskScan -= width;

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (((mlib_s32)dst & 3) && dst < dst_end) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                srcA1 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 1)];
                srcA2 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 2)];
                srcA3 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 3)];

                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);

                s02 = vis_fpmerge(src[0], src[1]);
                ar = vis_fpmerge(fzeros, vis_read_hi(s02));
                gb = vis_fpmerge(fzeros, vis_read_lo(s02));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
                dd = vis_fpadd16(d0, d1);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            while (dst < dst_end) {
                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        if (dstScan == width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (((mlib_s32)dst & 3) && dst < dst_end) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                srcA1 = mul8_extra[*(mlib_u8*)(src + 1)];
                srcA2 = mul8_extra[*(mlib_u8*)(src + 2)];
                srcA3 = mul8_extra[*(mlib_u8*)(src + 3)];

                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);

                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
                dd = vis_fpadd16(d0, d1);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            while (dst < dst_end) {
                srcA0 = mul8_extra[*(mlib_u8*)src];
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}

/***************************************************************/
/* per-pixel blending coefficients for the current alpha rule: computes */
/* the source and destination factors and shifts the srcA multiplier    */
/* and resA divisor table entries into srcAx4 and divAx4                */
#define GET_COEF(i)                                                    \
    pathA = pMask[i];                                                  \
    srcA = *(mlib_u8*)(src + i);                                       \
    srcA = mul8table[extraA][srcA];                                    \
    dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);              \
    srcF = mul8table[pathA][srcFbase];                                 \
    dstA = 0xff - pathA + mul8table[pathA][dstF];                      \
    srcA = mul8table[srcF][srcA];                                      \
    resA = srcA + dstA;                                                \
    srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA), srcAx4);     \
    divAx4 = vis_faligndata(vis_ld_u16(TBL_DIV + 8*resA), divAx4)

/***************************************************************/

void ADD_SUFF(IntArgbToUshortGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_d64 srcAx4, dstAx4, divAx4, dd, ds;
    mlib_d64 done = vis_to_double_dup(0x01000100);
    mlib_f32 fscale = vis_to_float(0x02020202);
    mlib_s32 j;
    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
    mlib_s32 pathA, srcFbase, resA, resG, srcF, dstF, srcA, dstA;
    RGB_VARS;

    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
    SrcOpAdd = (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;

    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
    DstOpAdd = (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);

    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);

    vis_write_gsr((7 << 3) | 6);

    if (pMask != NULL) {
        pMask += maskOff;

        if (dstScan == width && srcScan == 4*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        maskScan -= width;

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (((mlib_s32)dst & 3) && dst < dst_end) {
                pathA = *pMask++;
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcF = mul8table[pathA][srcFbase];
                dstA = 0xff - pathA + mul8table[pathA][dstF];
                srcA = mul8table[srcF][srcA];
                resA = srcA + dstA;

                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);

                dd = vis_freg_pair(vis_fzeros(),
                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
                DIV_ALPHA(dd, resA);
                ds = vis_fpsub16(done, dd);

                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
                ds = vis_fmul8x16(ff, ds);
                dd = vis_fpadd16(dd, ds);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);

                dst++;
                src++;
            }

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                GET_COEF(3);
                GET_COEF(2);
                GET_COEF(1);
                GET_COEF(0);
                pMask += 4;

                srcAx4 = FMUL_16x16(srcAx4, divAx4);
                dstAx4 = vis_fpsub16(done, srcAx4);

                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(dd, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);

                dd = vis_fmul8x16(*(mlib_f32*)dst, dstAx4);
                ds = vis_fmul8x16(ff, srcAx4);
                dd = vis_fpadd16(dd, ds);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            while (dst < dst_end) {
                pathA = *pMask++;
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcF = mul8table[pathA][srcFbase];
                dstA = 0xff - pathA + mul8table[pathA][dstF];
                srcA = mul8table[srcF][srcA];
                resA = srcA + dstA;

                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);

                dd = vis_freg_pair(vis_fzeros(),
                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
                DIV_ALPHA(dd, resA);
                ds = vis_fpsub16(done, dd);

                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
                ds = vis_fmul8x16(ff, ds);
                dd = vis_fpadd16(dd, ds);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);

                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        if (dstScan == width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (dst < dst_end) {
                srcA = *(mlib_u8*)src;
                srcA = mul8table[extraA][srcA];
                dstA = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
                srcA = mul8table[srcFbase][srcA];
                resA = srcA + dstA;

                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                dd = vis_fmul8x16(fscale, dd);
                ff = vis_fpack16(dd);

                resG = mul8table[dstA][*dst] +
                       mul8table[srcA][((mlib_u8*)&ff)[3]];
                *dst = div8table[resA][resG];

                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}

/***************************************************************/

void ADD_SUFF(IntRgbToUshortGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
{
    mlib_s32 extraA;
    mlib_s32 dstScan = pDstInfo->scanStride;
    mlib_s32 srcScan = pSrcInfo->scanStride;
    mlib_u16 *dst_end;
    mlib_d64 srcA_d, dstA_d, dd, d0, d1;
    mlib_s32 i, j, srcG;
    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
    mlib_s32 pathA, srcFbase, dstFbase, resA, resG, srcA, dstA;
    RGB_VARS;

    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
    SrcOpAdd = (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;

    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
    DstOpAdd = (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;

    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);

    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);
    dstFbase = (((extraA & DstOpAnd) ^ DstOpXor) + DstOpAdd);

    srcFbase = mul8table[srcFbase][extraA];

    if (width < 16) {
        if (pMask != NULL) {
            pMask += maskOff;

            for (j = 0; j < height; j++) {
                mlib_u16 *dst = dstBase;
                mlib_u8  *src = srcBase;

                for (i = 0; i < width; i++) {
                    pathA = pMask[i];
                    dstA = 0xff - pathA + mul8table[dstFbase][pathA];
                    srcA = mul8table[srcFbase][pathA];
                    resA = srcA + dstA;
                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
                    resG = div8table[resA][resG];
                    dst[i] = resG;
                }

                PTR_ADD(dstBase, dstScan);
                PTR_ADD(srcBase, srcScan);
                PTR_ADD(pMask, maskScan);
            }
        } else {
            dstA = dstFbase;
            srcA = srcFbase;
            resA = srcA + dstA;

            for (j = 0; j < height; j++) {
                mlib_u16 *dst = dstBase;
                mlib_u8  *src = srcBase;

                for (i = 0; i < width; i++) {
                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
                    resG = div8table[resA][resG];
                    dst[i] = resG;
                }

                PTR_ADD(dstBase, dstScan);
                PTR_ADD(srcBase, srcScan);
            }
        }
        return;
    }

    if (pMask != NULL) {
        mlib_s32 srcA_buff[256];
        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);

        srcA_buff[0] = 0;
#pragma pipeloop(0)
        for (pathA = 1; pathA < 256; pathA++) {
            dstA = 0xff - pathA + mul8table[dstFbase][pathA];
            srcA = mul8table[srcFbase][pathA];
            resA = dstA + srcA;
            ddiv = dscale*vis_d64_div_tbl[resA];
            srcA_buff[pathA] = srcA*ddiv + (1 << 15);
        }

        pMask += maskOff;
        maskScan -= width;

        if (dstScan == width && srcScan == 4*width && maskScan == width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (((mlib_s32)dst & 3) && dst < dst_end) {
                pathA = *pMask++;
                srcA_d = vis_ld_u16(srcA_buff + pathA);
                dstA_d = vis_fpsub16(d_one, srcA_d);

                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[3]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[2]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[1]);
                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[0]);
                dstA_d = vis_fpsub16(d_one, srcA_d);
                pMask += 4;

                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            while (dst < dst_end) {
                pathA = *pMask++;
                srcA_d = vis_ld_u16(srcA_buff + pathA);
                dstA_d = vis_fpsub16(d_one, srcA_d);

                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
            PTR_ADD(pMask, maskScan);
        }
    } else {
        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);

        dstA = dstFbase;
        srcA = srcFbase;
        resA = dstA + srcA;

        ddiv = dscale*vis_d64_div_tbl[resA];
        srcA = (mlib_s32)(srcA*ddiv + (1 << 15)) >> 16;
        srcA_d = vis_to_double_dup((srcA << 16) | srcA);
        dstA_d = vis_fpsub16(d_one, srcA_d);

        if (dstScan == width && srcScan == 4*width) {
            width *= height;
            height = 1;
        }

        for (j = 0; j < height; j++) {
            mlib_f32 *src = srcBase;
            mlib_u16 *dst = dstBase;

            dst_end = dst + width;

            while (((mlib_s32)dst & 3) && dst < dst_end) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                vis_st_u16(D64_FROM_F32x2(vis_fpack16(dd)), dst);
                dst++;
                src++;
            }

#pragma pipeloop(0)
            for (; dst <= (dst_end - 4); dst += 4) {
                s02 = vis_fpmerge(src[0], src[2]);
                s13 = vis_fpmerge(src[1], src[3]);
                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
                GRAY_U16(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));

                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
                *(mlib_f32*)dst = vis_fpack16(dd);
                src += 4;
            }

            while (dst < dst_end) {
                r = vis_ld_u8((mlib_u8*)src + 1);
                g = vis_ld_u8((mlib_u8*)src + 2);
                b = vis_ld_u8((mlib_u8*)src + 3);
                GRAY_U16(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
                dd = vis_fpadd16(d0, d1);
                ff = vis_fpack16(dd);
                vis_st_u16(D64_FROM_F32x2(ff), dst);
                dst++;
                src++;
            }

            PTR_ADD(dstBase, dstScan);
            PTR_ADD(srcBase, srcScan);
        }
    }
}

/***************************************************************/

#endif