1/*
2 * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26#if !defined(JAVA2D_NO_MLIB) || defined(MLIB_ADD_SUFF)
27
28#include <vis_proto.h>
29#include "java2d_Mlib.h"
30#include "vis_AlphaMacros.h"
31
32/***************************************************************/
33
34extern mlib_d64 vis_d64_div_tbl[256];
35
36/***************************************************************/
37
/*
 * Scalar RGB -> 8-bit gray.  The integer weights 77/150/29 add up to 256;
 * 128 is added for rounding before the sum is shifted right by 8.
 */
#define RGB2GRAY(r, g, b) \
    ((128 + 77 * (r) + 150 * (g) + 29 * (b)) >> 8)
40
41/***************************************************************/
42
43static const mlib_s32 RGB_weight[] = {
44    128*77,
45    128*150,
46    128*29,
47    (1 << (16 + 6)) | (1 << 6)
48};
49
50/***************************************************************/
51
/*
 * Declares the working variables shared by the gray conversion loops
 * (r, g, b, ar, gb, s02, s13, ff), reads the three channel weights out of
 * RGB_weight reinterpreted as mlib_f32, duplicates RGB_weight[3] into
 * d_half, and finally writes (0 << 3) | 6 to the GSR.
 *
 * The expansion ends with a statement, so it is placed after the other
 * local declarations of the function that uses it.
 */
#define RGB_VARS                                               \
    mlib_d64 r, g, b;                                          \
    mlib_d64 ar, gb;                                           \
    mlib_d64 s02, s13;                                         \
    mlib_f32 ff;                                               \
    mlib_f32 alpha = *((mlib_f32*)RGB_weight + 0);             \
    mlib_f32 beta  = *((mlib_f32*)RGB_weight + 1);             \
    mlib_f32 gamma = *((mlib_f32*)RGB_weight + 2);             \
    mlib_d64 d_half = vis_to_double_dup(RGB_weight[3]);        \
                                                               \
    vis_write_gsr((0 << 3) | 6)
61
62/***************************************************************/
63
/*
 * ff = vis_fpack16((r*alpha + g*beta) + (b*gamma + d_half)).
 *
 * r, g and b are the packed channel operands; alpha, beta, gamma and
 * d_half are the names declared by RGB_VARS in the enclosing function.
 */
#define GRAY_U8(ff, r, g, b)                                   \
{                                                              \
    mlib_d64 dr = vis_fmul8x16al(r, alpha);                    \
    mlib_d64 dg = vis_fmul8x16al(g, beta);                     \
    mlib_d64 db = vis_fmul8x16al(b, gamma);                    \
    ff = vis_fpack16(vis_fpadd16(vis_fpadd16(dr, dg),          \
                                 vis_fpadd16(db, d_half)));    \
}
75
76/***************************************************************/
77
/*
 * Same weighted sum as GRAY_U8 but without the final vis_fpack16:
 * dd = (r*alpha + g*beta) + (b*gamma + d_half), left as an mlib_d64.
 *
 * alpha, beta, gamma and d_half are the names declared by RGB_VARS in the
 * enclosing function.
 */
#define GRAY_S16(dd, r, g, b)                                  \
{                                                              \
    mlib_d64 dr = vis_fmul8x16al(r, alpha);                    \
    mlib_d64 dg = vis_fmul8x16al(g, beta);                     \
    mlib_d64 db = vis_fmul8x16al(b, gamma);                    \
    dd = vis_fpadd16(vis_fpadd16(dr, dg),                      \
                     vis_fpadd16(db, d_half));                 \
}
88
89/***************************************************************/
90
/*
 * Loads the three bytes at src + ind (B), src + ind + 1 (G) and
 * src + ind + 2 (R) and feeds each one, together with the previous value
 * of its accumulator, through vis_faligndata into b, g and r.
 *
 * Uses the names src, b, g and r from the enclosing scope.
 */
#define LOAD_BGR(ind)                                          \
    r = vis_faligndata(vis_ld_u8(src + (ind + 2)), r);         \
    g = vis_faligndata(vis_ld_u8(src + (ind + 1)), g);         \
    b = vis_faligndata(vis_ld_u8(src + (ind    )), b)
95
96/***************************************************************/
97
98void ADD_SUFF(IntArgbToByteGrayConvert)(BLIT_PARAMS)
99{
100    mlib_s32 dstScan = pDstInfo->scanStride;
101    mlib_s32 srcScan = pSrcInfo->scanStride;
102    mlib_u8  *dst_end;
103    mlib_s32 j;
104    RGB_VARS;
105
106    if (dstScan == width && srcScan == 4*width) {
107        width *= height;
108        height = 1;
109    }
110
111    for (j = 0; j < height; j++) {
112        mlib_f32 *src = srcBase;
113        mlib_u8  *dst = dstBase;
114
115        dst_end = dst + width;
116
117        while (((mlib_s32)dst & 3) && dst < dst_end) {
118            r = vis_ld_u8((mlib_u8*)src + 1);
119            g = vis_ld_u8((mlib_u8*)src + 2);
120            b = vis_ld_u8((mlib_u8*)src + 3);
121            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
122            vis_st_u8(D64_FROM_F32x2(ff), dst);
123            dst++;
124            src++;
125        }
126
127#pragma pipeloop(0)
128        for (; dst <= (dst_end - 4); dst += 4) {
129            s02 = vis_fpmerge(src[0], src[2]);
130            s13 = vis_fpmerge(src[1], src[3]);
131            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
132            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
133            GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
134            *(mlib_f32*)dst = ff;
135            src += 4;
136        }
137
138        while (dst < dst_end) {
139            r = vis_ld_u8((mlib_u8*)src + 1);
140            g = vis_ld_u8((mlib_u8*)src + 2);
141            b = vis_ld_u8((mlib_u8*)src + 3);
142            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
143            vis_st_u8(D64_FROM_F32x2(ff), dst);
144            dst++;
145            src++;
146        }
147
148        PTR_ADD(dstBase, dstScan);
149        PTR_ADD(srcBase, srcScan);
150    }
151}
152
153/***************************************************************/
154
155void ADD_SUFF(ThreeByteBgrToByteGrayConvert)(BLIT_PARAMS)
156{
157    mlib_s32 dstScan = pDstInfo->scanStride;
158    mlib_s32 srcScan = pSrcInfo->scanStride;
159    mlib_u8  *dst_end;
160    mlib_s32 j;
161    RGB_VARS;
162
163    vis_alignaddr(NULL, 7);
164
165    if (dstScan == width && srcScan == 3*width) {
166        width *= height;
167        height = 1;
168    }
169
170    for (j = 0; j < height; j++) {
171        mlib_u8 *src = srcBase;
172        mlib_u8 *dst = dstBase;
173
174        dst_end = dst + width;
175
176        while (((mlib_s32)dst & 3) && dst < dst_end) {
177            b = vis_ld_u8(src);
178            g = vis_ld_u8(src + 1);
179            r = vis_ld_u8(src + 2);
180            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
181            vis_st_u8(D64_FROM_F32x2(ff), dst);
182            dst++;
183            src += 3;
184        }
185
186#pragma pipeloop(0)
187        for (; dst <= (dst_end - 4); dst += 4) {
188            LOAD_BGR(9);
189            LOAD_BGR(6);
190            LOAD_BGR(3);
191            LOAD_BGR(0);
192            GRAY_U8(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
193            *(mlib_f32*)dst = ff;
194            src += 3*4;
195        }
196
197        while (dst < dst_end) {
198            b = vis_ld_u8(src);
199            g = vis_ld_u8(src + 1);
200            r = vis_ld_u8(src + 2);
201            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
202            vis_st_u8(D64_FROM_F32x2(ff), dst);
203            dst++;
204            src += 3;
205        }
206
207        PTR_ADD(dstBase, dstScan);
208        PTR_ADD(srcBase, srcScan);
209    }
210}
211
212/***************************************************************/
213
214void ADD_SUFF(IntArgbToByteGrayScaleConvert)(SCALE_PARAMS)
215{
216    mlib_s32 dstScan = pDstInfo->scanStride;
217    mlib_s32 srcScan = pSrcInfo->scanStride;
218    mlib_u8  *dst_end;
219    mlib_s32 i, j;
220    RGB_VARS;
221
222    for (j = 0; j < height; j++) {
223        mlib_f32 *src = srcBase;
224        mlib_u8  *dst = dstBase;
225        mlib_s32 tmpsxloc = sxloc;
226
227        PTR_ADD(src, (syloc >> shift) * srcScan);
228
229        dst_end = dst + width;
230
231        while (((mlib_s32)dst & 3) && dst < dst_end) {
232            i = tmpsxloc >> shift;
233            tmpsxloc += sxinc;
234            r = vis_ld_u8((mlib_u8*)(src + i) + 1);
235            g = vis_ld_u8((mlib_u8*)(src + i) + 2);
236            b = vis_ld_u8((mlib_u8*)(src + i) + 3);
237            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
238            vis_st_u8(D64_FROM_F32x2(ff), dst);
239            dst++;
240        }
241
242#pragma pipeloop(0)
243        for (; dst <= (dst_end - 4); dst += 4) {
244            s02 = vis_fpmerge(src[(tmpsxloc          ) >> shift],
245                              src[(tmpsxloc + 2*sxinc) >> shift]);
246            s13 = vis_fpmerge(src[(tmpsxloc +   sxinc) >> shift],
247                              src[(tmpsxloc + 3*sxinc) >> shift]);
248            tmpsxloc += 4*sxinc;
249            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
250            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
251            GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
252            *(mlib_f32*)dst = ff;
253        }
254
255        while (dst < dst_end) {
256            i = tmpsxloc >> shift;
257            tmpsxloc += sxinc;
258            r = vis_ld_u8((mlib_u8*)(src + i) + 1);
259            g = vis_ld_u8((mlib_u8*)(src + i) + 2);
260            b = vis_ld_u8((mlib_u8*)(src + i) + 3);
261            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
262            vis_st_u8(D64_FROM_F32x2(ff), dst);
263            dst++;
264        }
265
266        PTR_ADD(dstBase, dstScan);
267        syloc += syinc;
268    }
269}
270
271/***************************************************************/
272
273void ADD_SUFF(ThreeByteBgrToByteGrayScaleConvert)(SCALE_PARAMS)
274{
275    mlib_s32 dstScan = pDstInfo->scanStride;
276    mlib_s32 srcScan = pSrcInfo->scanStride;
277    mlib_u8  *dst_end;
278    mlib_s32 j, i0, i1, i2, i3;
279    RGB_VARS;
280
281    vis_alignaddr(NULL, 7);
282
283    for (j = 0; j < height; j++) {
284        mlib_u8  *src = srcBase;
285        mlib_u8  *dst = dstBase;
286        mlib_s32 tmpsxloc = sxloc;
287
288        PTR_ADD(src, (syloc >> shift) * srcScan);
289
290        dst_end = dst + width;
291
292        while (((mlib_s32)dst & 3) && dst < dst_end) {
293            i0 = 3*(tmpsxloc >> shift);
294            tmpsxloc += sxinc;
295            b = vis_ld_u8(src + i0);
296            g = vis_ld_u8(src + i0 + 1);
297            r = vis_ld_u8(src + i0 + 2);
298            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
299            vis_st_u8(D64_FROM_F32x2(ff), dst);
300            dst++;
301        }
302
303#pragma pipeloop(0)
304        for (; dst <= (dst_end - 4); dst += 4) {
305            i0 = 3*(tmpsxloc >> shift);
306            tmpsxloc += sxinc;
307            i1 = 3*(tmpsxloc >> shift);
308            tmpsxloc += sxinc;
309            i2 = 3*(tmpsxloc >> shift);
310            tmpsxloc += sxinc;
311            i3 = 3*(tmpsxloc >> shift);
312            tmpsxloc += sxinc;
313            LOAD_BGR(i3);
314            LOAD_BGR(i2);
315            LOAD_BGR(i1);
316            LOAD_BGR(i0);
317            GRAY_U8(ff, vis_read_hi(r), vis_read_hi(g), vis_read_hi(b));
318            *(mlib_f32*)dst = ff;
319        }
320
321        while (dst < dst_end) {
322            i0 = 3*(tmpsxloc >> shift);
323            tmpsxloc += sxinc;
324            b = vis_ld_u8(src + i0);
325            g = vis_ld_u8(src + i0 + 1);
326            r = vis_ld_u8(src + i0 + 2);
327            GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
328            vis_st_u8(D64_FROM_F32x2(ff), dst);
329            dst++;
330        }
331
332        PTR_ADD(dstBase, dstScan);
333        syloc += syinc;
334    }
335}
336
337/***************************************************************/
338
339void ADD_SUFF(IntArgbBmToByteGrayXparOver)(BLIT_PARAMS)
340{
341    mlib_s32 dstScan = pDstInfo->scanStride;
342    mlib_s32 srcScan = pSrcInfo->scanStride;
343    mlib_u8  *dst_end;
344    mlib_d64 dzero = vis_fzero();
345    mlib_f32 f0, f1;
346    mlib_s32 i, j, mask0, mask1;
347    RGB_VARS;
348
349    if (width < 8) {
350        for (j = 0; j < height; j++) {
351            mlib_u8  *src = srcBase;
352            mlib_u8  *dst = dstBase;
353
354            for (i = 0; i < width; i++) {
355                if (src[4*i]) {
356                    dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
357                }
358            }
359
360            PTR_ADD(dstBase, dstScan);
361            PTR_ADD(srcBase, srcScan);
362        }
363        return;
364    }
365
366    for (j = 0; j < height; j++) {
367        mlib_f32 *src = srcBase;
368        mlib_u8  *dst = dstBase;
369
370        dst_end = dst + width;
371
372        while (((mlib_s32)dst & 7) && dst < dst_end) {
373            if (*(mlib_u8*)src) {
374                r = vis_ld_u8((mlib_u8*)src + 1);
375                g = vis_ld_u8((mlib_u8*)src + 2);
376                b = vis_ld_u8((mlib_u8*)src + 3);
377                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
378                vis_st_u8(D64_FROM_F32x2(ff), dst);
379            }
380            dst++;
381            src++;
382        }
383
384#pragma pipeloop(0)
385        for (; dst <= (dst_end - 8); dst += 8) {
386            s02 = vis_fpmerge(src[0], src[2]);
387            s13 = vis_fpmerge(src[1], src[3]);
388            src += 4;
389            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
390            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
391            mask0 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
392                                 dzero);
393            GRAY_U8(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
394
395            s02 = vis_fpmerge(src[0], src[2]);
396            s13 = vis_fpmerge(src[1], src[3]);
397            src += 4;
398            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
399            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
400            mask1 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
401                                 dzero);
402            GRAY_U8(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
403
404            vis_pst_8(vis_freg_pair(f0, f1), dst, (mask0 << 4) | mask1);
405        }
406
407        while (dst < dst_end) {
408            if (*(mlib_u8*)src) {
409                r = vis_ld_u8((mlib_u8*)src + 1);
410                g = vis_ld_u8((mlib_u8*)src + 2);
411                b = vis_ld_u8((mlib_u8*)src + 3);
412                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
413                vis_st_u8(D64_FROM_F32x2(ff), dst);
414            }
415            dst++;
416            src++;
417        }
418
419        PTR_ADD(dstBase, dstScan);
420        PTR_ADD(srcBase, srcScan);
421    }
422}
423
424/***************************************************************/
425
426void ADD_SUFF(IntArgbBmToByteGrayXparBgCopy)(BCOPY_PARAMS)
427{
428    mlib_s32 dstScan = pDstInfo->scanStride;
429    mlib_s32 srcScan = pSrcInfo->scanStride;
430    mlib_u8  *dst_end;
431    mlib_d64 dzero = vis_fzero(), d_bgpixel;
432    mlib_f32 f0, f1;
433    mlib_s32 i, j, mask0, mask1;
434    RGB_VARS;
435
436    if (width < 8) {
437        for (j = 0; j < height; j++) {
438            mlib_u8  *src = srcBase;
439            mlib_u8  *dst = dstBase;
440
441            for (i = 0; i < width; i++) {
442                if (src[4*i]) {
443                    dst[i] = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
444                } else {
445                    dst[i] = bgpixel;
446                }
447            }
448
449            PTR_ADD(dstBase, dstScan);
450            PTR_ADD(srcBase, srcScan);
451        }
452        return;
453    }
454
455    D64_FROM_U8x8(d_bgpixel, bgpixel);
456
457    for (j = 0; j < height; j++) {
458        mlib_f32 *src = srcBase;
459        mlib_u8  *dst = dstBase;
460
461        dst_end = dst + width;
462
463        while (((mlib_s32)dst & 7) && dst < dst_end) {
464            if (*(mlib_u8*)src) {
465                r = vis_ld_u8((mlib_u8*)src + 1);
466                g = vis_ld_u8((mlib_u8*)src + 2);
467                b = vis_ld_u8((mlib_u8*)src + 3);
468                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
469                vis_st_u8(D64_FROM_F32x2(ff), dst);
470            } else {
471                *dst = bgpixel;
472            }
473            dst++;
474            src++;
475        }
476
477#pragma pipeloop(0)
478        for (; dst <= (dst_end - 8); dst += 8) {
479            s02 = vis_fpmerge(src[0], src[2]);
480            s13 = vis_fpmerge(src[1], src[3]);
481            src += 4;
482            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
483            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
484            mask0 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
485                                 dzero);
486            GRAY_U8(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
487
488            s02 = vis_fpmerge(src[0], src[2]);
489            s13 = vis_fpmerge(src[1], src[3]);
490            src += 4;
491            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
492            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
493            mask1 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
494                                 dzero);
495            GRAY_U8(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
496
497            *(mlib_d64*)dst = d_bgpixel;
498            vis_pst_8(vis_freg_pair(f0, f1), dst, (mask0 << 4) | mask1);
499        }
500
501        while (dst < dst_end) {
502            if (*(mlib_u8*)src) {
503                r = vis_ld_u8((mlib_u8*)src + 1);
504                g = vis_ld_u8((mlib_u8*)src + 2);
505                b = vis_ld_u8((mlib_u8*)src + 3);
506                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
507                vis_st_u8(D64_FROM_F32x2(ff), dst);
508            } else {
509                *dst = bgpixel;
510            }
511            dst++;
512            src++;
513        }
514
515        PTR_ADD(dstBase, dstScan);
516        PTR_ADD(srcBase, srcScan);
517    }
518}
519
520/***************************************************************/
521
522void ADD_SUFF(IntArgbToByteGrayXorBlit)(BLIT_PARAMS)
523{
524    mlib_s32 dstScan = pDstInfo->scanStride;
525    mlib_s32 srcScan = pSrcInfo->scanStride;
526    mlib_u8  *dst_end;
527    mlib_d64 dd, d_xorpixel, d_alphamask, dzero = vis_fzero();
528    mlib_f32 f0, f1;
529    mlib_s32 i, j, mask0, mask1;
530    jint  xorpixel = pCompInfo->details.xorPixel;
531    juint alphamask = pCompInfo->alphaMask;
532    RGB_VARS;
533
534    if (width < 8) {
535        for (j = 0; j < height; j++) {
536            mlib_s32 *src = srcBase;
537            mlib_u8  *dst = dstBase;
538            mlib_s32 srcpixel, r, g, b;
539
540            for (i = 0; i < width; i++) {
541                srcpixel = src[i];
542                if (srcpixel >= 0) continue;
543                b = (srcpixel) & 0xff;
544                g = (srcpixel >> 8) & 0xff;
545                r = (srcpixel >> 16) & 0xff;
546                srcpixel = (77*r + 150*g + 29*b + 128) / 256;
547                dst[i]  ^= (((srcpixel) ^ (xorpixel)) & ~(alphamask));
548            }
549
550            PTR_ADD(dstBase, dstScan);
551            PTR_ADD(srcBase, srcScan);
552        }
553        return;
554    }
555
556    D64_FROM_U8x8(d_xorpixel,  xorpixel);
557    D64_FROM_U8x8(d_alphamask, alphamask);
558
559    for (j = 0; j < height; j++) {
560        mlib_f32 *src = srcBase;
561        mlib_u8  *dst = dstBase;
562
563        dst_end = dst + width;
564
565        while (((mlib_s32)dst & 7) && dst < dst_end) {
566            if ((*(mlib_u8*)src) & 0x80) {
567                r = vis_ld_u8((mlib_u8*)src + 1);
568                g = vis_ld_u8((mlib_u8*)src + 2);
569                b = vis_ld_u8((mlib_u8*)src + 3);
570                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
571                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
572                dd = vis_fandnot(d_alphamask, dd);
573                vis_st_u8(vis_fxor(vis_ld_u8(dst), dd), dst);
574            }
575            dst++;
576            src++;
577        }
578
579#pragma pipeloop(0)
580        for (; dst <= (dst_end - 8); dst += 8) {
581            s02 = vis_fpmerge(src[0], src[2]);
582            s13 = vis_fpmerge(src[1], src[3]);
583            src += 4;
584            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
585            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
586            mask0 = vis_fcmplt16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
587                                 dzero);
588            GRAY_U8(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
589
590            s02 = vis_fpmerge(src[0], src[2]);
591            s13 = vis_fpmerge(src[1], src[3]);
592            src += 4;
593            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
594            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
595            mask1 = vis_fcmplt16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
596                                 dzero);
597            GRAY_U8(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
598
599            dd = vis_freg_pair(f0, f1);
600            dd = vis_fandnot(d_alphamask, vis_fxor(dd, d_xorpixel));
601            vis_pst_8(vis_fxor(*(mlib_d64*)dst, dd), dst, (mask0 << 4) | mask1);
602        }
603
604        while (dst < dst_end) {
605            if ((*(mlib_u8*)src) & 0x80) {
606                r = vis_ld_u8((mlib_u8*)src + 1);
607                g = vis_ld_u8((mlib_u8*)src + 2);
608                b = vis_ld_u8((mlib_u8*)src + 3);
609                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
610                dd = vis_fxor(D64_FROM_F32x2(ff), d_xorpixel);
611                dd = vis_fandnot(d_alphamask, dd);
612                vis_st_u8(vis_fxor(vis_ld_u8(dst), dd), dst);
613            }
614            dst++;
615            src++;
616        }
617
618        PTR_ADD(dstBase, dstScan);
619        PTR_ADD(srcBase, srcScan);
620    }
621}
622
623/***************************************************************/
624
625void ADD_SUFF(IntArgbBmToByteGrayScaleXparOver)(SCALE_PARAMS)
626{
627    mlib_s32 dstScan = pDstInfo->scanStride;
628    mlib_s32 srcScan = pSrcInfo->scanStride;
629    mlib_u8  *dst_end;
630    mlib_d64 dzero = vis_fzero();
631    mlib_f32 f0, f1;
632    mlib_s32 i, j, mask0, mask1;
633    RGB_VARS;
634
635    for (j = 0; j < height; j++) {
636        mlib_f32 *src = srcBase;
637        mlib_u8  *dst = dstBase;
638        mlib_s32 tmpsxloc = sxloc;
639
640        PTR_ADD(src, (syloc >> shift) * srcScan);
641
642        dst_end = dst + width;
643
644        while (((mlib_s32)dst & 7) && dst < dst_end) {
645            i = tmpsxloc >> shift;
646            tmpsxloc += sxinc;
647            if (*(mlib_u8*)(src + i)) {
648                r = vis_ld_u8((mlib_u8*)(src + i) + 1);
649                g = vis_ld_u8((mlib_u8*)(src + i) + 2);
650                b = vis_ld_u8((mlib_u8*)(src + i) + 3);
651                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
652                vis_st_u8(D64_FROM_F32x2(ff), dst);
653            }
654            dst++;
655        }
656
657#pragma pipeloop(0)
658        for (; dst <= (dst_end - 8); dst += 8) {
659            s02 = vis_fpmerge(src[(tmpsxloc          ) >> shift],
660                              src[(tmpsxloc + 2*sxinc) >> shift]);
661            s13 = vis_fpmerge(src[(tmpsxloc +   sxinc) >> shift],
662                              src[(tmpsxloc + 3*sxinc) >> shift]);
663            tmpsxloc += 4*sxinc;
664            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
665            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
666            mask0 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
667                                 dzero);
668            GRAY_U8(f0, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
669
670            s02 = vis_fpmerge(src[(tmpsxloc          ) >> shift],
671                              src[(tmpsxloc + 2*sxinc) >> shift]);
672            s13 = vis_fpmerge(src[(tmpsxloc +   sxinc) >> shift],
673                              src[(tmpsxloc + 3*sxinc) >> shift]);
674            tmpsxloc += 4*sxinc;
675            ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
676            gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
677            mask1 = vis_fcmpne16(vis_fpmerge(vis_read_hi(ar), vis_read_hi(ar)),
678                                 dzero);
679            GRAY_U8(f1, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
680
681            vis_pst_8(vis_freg_pair(f0, f1), dst, (mask0 << 4) | mask1);
682        }
683
684        while (dst < dst_end) {
685            i = tmpsxloc >> shift;
686            tmpsxloc += sxinc;
687            if (*(mlib_u8*)(src + i)) {
688                r = vis_ld_u8((mlib_u8*)(src + i) + 1);
689                g = vis_ld_u8((mlib_u8*)(src + i) + 2);
690                b = vis_ld_u8((mlib_u8*)(src + i) + 3);
691                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
692                vis_st_u8(D64_FROM_F32x2(ff), dst);
693            }
694            dst++;
695        }
696
697        PTR_ADD(dstBase, dstScan);
698        syloc += syinc;
699    }
700}
701
702/***************************************************************/
703
/*
 * Base pointers into the lookup tables used by the blending loops:
 * TBL_MUL is vis_mul8s_tbl viewed as mlib_s16 and advanced by one element,
 * TBL_DIV is vis_div8_tbl viewed as mlib_u8 and advanced by two bytes.
 */
#define TBL_MUL (&((mlib_s16*)vis_mul8s_tbl)[1])
#define TBL_DIV (&((mlib_u8*)vis_div8_tbl)[2])
706
707void ADD_SUFF(IntArgbToByteGraySrcOverMaskBlit)(MASKBLIT_PARAMS)
708{
709    mlib_s32 extraA;
710    mlib_s32 dstScan = pDstInfo->scanStride;
711    mlib_s32 srcScan = pSrcInfo->scanStride;
712    mlib_u8  *mul8_extra;
713    mlib_u8  *dst_end;
714    mlib_d64 srcAx4, dd, d0, d1;
715    mlib_d64 done = vis_to_double_dup(0x7fff7fff);
716    mlib_s32 j, srcA0, srcA1, srcA2, srcA3;
717    RGB_VARS;
718
719    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);
720    mul8_extra = mul8table[extraA];
721
722    if (pMask != NULL) {
723        pMask += maskOff;
724
725        if (dstScan == width && srcScan == 4*width && maskScan == width) {
726            width *= height;
727            height = 1;
728        }
729
730        maskScan -= width;
731
732        for (j = 0; j < height; j++) {
733            mlib_f32 *src = srcBase;
734            mlib_u8  *dst = dstBase;
735
736            dst_end = dst + width;
737
738            while (((mlib_s32)dst & 3) && dst < dst_end) {
739                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
740                r = vis_ld_u8((mlib_u8*)src + 1);
741                g = vis_ld_u8((mlib_u8*)src + 2);
742                b = vis_ld_u8((mlib_u8*)src + 3);
743                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
744                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
745                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
746                dd = vis_fpadd16(d0, d1);
747                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
748                dst++;
749                src++;
750            }
751
752#pragma pipeloop(0)
753            for (; dst <= (dst_end - 4); dst += 4) {
754                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
755                srcA1 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 1)];
756                srcA2 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 2)];
757                srcA3 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)(src + 3)];
758                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
759                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
760                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
761                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);
762
763                s02 = vis_fpmerge(src[0], src[2]);
764                s13 = vis_fpmerge(src[1], src[3]);
765                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
766                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
767                GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
768                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
769                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
770                dd = vis_fpadd16(d0, d1);
771                *(mlib_f32*)dst = vis_fpack16(dd);
772                src += 4;
773            }
774
775            while (dst < dst_end) {
776                srcA0 = mul8table[mul8_extra[*pMask++]][*(mlib_u8*)src];
777                r = vis_ld_u8((mlib_u8*)src + 1);
778                g = vis_ld_u8((mlib_u8*)src + 2);
779                b = vis_ld_u8((mlib_u8*)src + 3);
780                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
781                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
782                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
783                dd = vis_fpadd16(d0, d1);
784                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
785                dst++;
786                src++;
787            }
788
789            PTR_ADD(dstBase, dstScan);
790            PTR_ADD(srcBase, srcScan);
791            PTR_ADD(pMask,  maskScan);
792        }
793    } else {
794
795        if (dstScan == width && srcScan == 4*width) {
796            width *= height;
797            height = 1;
798        }
799
800        for (j = 0; j < height; j++) {
801            mlib_f32 *src = srcBase;
802            mlib_u8  *dst = dstBase;
803
804            dst_end = dst + width;
805
806            while (((mlib_s32)dst & 3) && dst < dst_end) {
807                srcA0 = mul8_extra[*(mlib_u8*)src];
808                r = vis_ld_u8((mlib_u8*)src + 1);
809                g = vis_ld_u8((mlib_u8*)src + 2);
810                b = vis_ld_u8((mlib_u8*)src + 3);
811                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
812                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
813                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
814                dd = vis_fpadd16(d0, d1);
815                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
816                dst++;
817                src++;
818            }
819
820#pragma pipeloop(0)
821            for (; dst <= (dst_end - 4); dst += 4) {
822                srcA0 = mul8_extra[*(mlib_u8*)src];
823                srcA1 = mul8_extra[*(mlib_u8*)(src + 1)];
824                srcA2 = mul8_extra[*(mlib_u8*)(src + 2)];
825                srcA3 = mul8_extra[*(mlib_u8*)(src + 3)];
826                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA3), srcAx4);
827                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA2), srcAx4);
828                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA1), srcAx4);
829                srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA0), srcAx4);
830
831                s02 = vis_fpmerge(src[0], src[2]);
832                s13 = vis_fpmerge(src[1], src[3]);
833                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
834                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
835                GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
836                d0 = vis_fpadd16(vis_fmul8x16(ff, srcAx4), d_half);
837                d1 = vis_fmul8x16(*(mlib_f32*)dst, vis_fpsub16(done, srcAx4));
838                dd = vis_fpadd16(d0, d1);
839                *(mlib_f32*)dst = vis_fpack16(dd);
840                src += 4;
841            }
842
843            while (dst < dst_end) {
844                srcA0 = mul8_extra[*(mlib_u8*)src];
845                r = vis_ld_u8((mlib_u8*)src + 1);
846                g = vis_ld_u8((mlib_u8*)src + 2);
847                b = vis_ld_u8((mlib_u8*)src + 3);
848                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
849                d0 = vis_fpadd16(MUL8_VIS(ff, srcA0), d_half);
850                d1 = MUL8_VIS(vis_read_lo(vis_ld_u8(dst)), 255 - srcA0);
851                dd = vis_fpadd16(d0, d1);
852                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
853                dst++;
854                src++;
855            }
856
857            PTR_ADD(dstBase, dstScan);
858            PTR_ADD(srcBase, srcScan);
859        }
860    }
861}
862
863/***************************************************************/
864
/*
 * GET_COEF(i) - blend coefficients for pixel i of the current 4-pixel group.
 *
 * Not self-contained: it reads and writes the caller's locals pathA, srcA,
 * srcF, dstF, dstA, resA, srcAx4 and divAx4, and reads pMask, src, extraA,
 * srcFbase, DstOpAnd, DstOpXor and DstOpAdd.
 *
 * Steps, in order:
 *   - pathA = mask value for pixel i;
 *   - srcA  = first byte in memory of source pixel i, scaled by extraA
 *             through mul8table;
 *   - dstF  = destination factor derived from that srcA and the DstOp*
 *             operands;
 *   - srcF  = srcFbase scaled by pathA;
 *   - dstA  = 0xff - pathA + mul8table[pathA][dstF];
 *   - srcA  = srcA scaled by srcF (note: srcA is reassigned, so the order
 *             of the statements matters);
 *   - resA  = srcA + dstA;
 *   - one 16-bit entry of TBL_MUL (2-byte stride, index srcA) is merged
 *     into srcAx4 and one 16-bit entry of TBL_DIV (8-byte stride, index
 *     resA) into divAx4 via vis_faligndata.
 *
 * NOTE(review): the lane each entry ends up in depends on the GSR align
 * offset set by the caller and on the order of the GET_COEF calls; the
 * visible caller sets the offset to 6 and invokes GET_COEF for i = 3, 2,
 * 1, 0 - confirm against the VIS faligndata definition before reordering.
 *
 * The expansion has no trailing semicolon; the call site supplies it.
 */
865#define GET_COEF(i)                                                    \
866    pathA = pMask[i];                                                  \
867    srcA = *(mlib_u8*)(src + i);                                       \
868    srcA = mul8table[extraA][srcA];                                    \
869    dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);              \
870    srcF = mul8table[pathA][srcFbase];                                 \
871    dstA = 0xff - pathA + mul8table[pathA][dstF];                      \
872    srcA = mul8table[srcF][srcA];                                      \
873    resA = srcA + dstA;                                                \
874    srcAx4 = vis_faligndata(vis_ld_u16(TBL_MUL + 2*srcA), srcAx4);     \
875    divAx4 = vis_faligndata(vis_ld_u16(TBL_DIV + 8*resA), divAx4)
876
877/***************************************************************/
878
/*
 * IntArgbToByteGrayAlphaMaskBlit
 *
 * Composites 32-bit source pixels that carry their own alpha onto an 8-bit
 * gray destination, using the blending rule AlphaRules[pCompInfo->rule],
 * the extra alpha from pCompInfo and an optional per-pixel coverage mask.
 *
 * Arguments come from MASKBLIT_PARAMS (declared elsewhere).  This body uses
 * dstBase, srcBase, pMask, maskOff, maskScan, width, height, pDstInfo,
 * pSrcInfo and pCompInfo.  The first byte in memory of every source pixel
 * is read as its alpha and the following three bytes are passed to the gray
 * conversion macros as r, g, b.
 *
 * Two paths:
 *   - pMask != NULL: VIS code; single pixels until dst is 4-byte aligned,
 *     then four pixels per iteration, then a single-pixel tail;
 *   - pMask == NULL: one pixel at a time through mul8table / div8table.
 *
 * Local names must not be changed casually: GET_COEF and the macros
 * RGB_VARS, GRAY_S16, DIV_ALPHA and FMUL_16x16 (defined outside this
 * function) refer to variables of this function by name.
 */
879void ADD_SUFF(IntArgbToByteGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
880{
881    mlib_s32 extraA;
882    mlib_s32 dstScan = pDstInfo->scanStride;
883    mlib_s32 srcScan = pSrcInfo->scanStride;
884    mlib_u8  *dst_end;
885    mlib_d64 srcAx4, dstAx4, divAx4, dd, ds;
886    mlib_d64 done = vis_to_double_dup(0x01000100);
887    mlib_f32 fscale = vis_to_float(0x02020202);
888    mlib_s32 j;
889    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
890    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
891    mlib_s32 pathA, srcFbase, resA, resG, srcF, dstF, srcA, dstA;
892
    /* Declares the working variables used below (r, g, b, ff, s02, s13,
     * ar, gb, ...); the macro body is outside this function. */
893    RGB_VARS;
894
    /* Unpack the source and destination factor operands of the selected
     * rule.  The xor value is subtracted from the add value up front so a
     * factor is later evaluated as ((a & And) ^ Xor) + Add. */
895    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
896    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
897    SrcOpAdd =
898        (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;
899
900    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
901    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
902    DstOpAdd =
903        (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;
904
    /* Extra alpha converted to an integer table index (x * 255, rounded). */
905    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);
906
    /* Source factor evaluated for an alpha of 0xff; presumably the
     * destination alpha, since the gray destination stores none. */
907    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);
908
    /* GSR = (7 << 3) | 6.  NOTE(review): per the VIS GSR layout this is
     * scale 7 (used by vis_fpack16) and align offset 6 (used by
     * vis_faligndata in GET_COEF) - confirm. */
909    vis_write_gsr((7 << 3) | 6);
910
911    if (pMask != NULL) {
912        pMask += maskOff;
913
        /* If destination, source and mask rows are all contiguous, handle
         * the whole area as a single long row. */
914        if (dstScan == width && srcScan == 4*width && maskScan == width) {
915            width *= height;
916            height = 1;
917        }
918
        /* pMask is advanced per pixel inside a row, so only the remainder
         * of the mask stride is added at the end of each row. */
919        maskScan -= width;
920
921        for (j = 0; j < height; j++) {
922            mlib_f32 *src = srcBase;
923            mlib_u8  *dst = dstBase;
924
925            dst_end = dst + width;
926
            /* Head: one pixel at a time until dst is 4-byte aligned, as the
             * main loop loads and stores dst through an mlib_f32 pointer. */
927            while (((mlib_s32)dst & 3) && dst < dst_end) {
                /* Same coefficient computation as GET_COEF, scalar form. */
928                pathA = *pMask++;
929                srcA = *(mlib_u8*)src;
930                srcA = mul8table[extraA][srcA];
931                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
932                srcF = mul8table[pathA][srcFbase];
933                dstA = 0xff - pathA + mul8table[pathA][dstF];
934                srcA = mul8table[srcF][srcA];
935                resA = srcA + dstA;
936
                /* Source gray value: bytes 1..3 of the pixel go through
                 * GRAY_S16, are scaled by fscale and packed into ff. */
937                r = vis_ld_u8((mlib_u8*)src + 1);
938                g = vis_ld_u8((mlib_u8*)src + 2);
939                b = vis_ld_u8((mlib_u8*)src + 3);
940                GRAY_S16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
941                dd = vis_fmul8x16(fscale, dd);
942                ff = vis_fpack16(dd);
943
                /* dd starts from the vis_mul8s_tbl entry for dstA and is
                 * passed through DIV_ALPHA with resA; ds = done - dd is the
                 * complementary weight applied to the source gray. */
944                dd = vis_freg_pair(vis_fzeros(),
945                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
946                DIV_ALPHA(dd, resA);
947                ds = vis_fpsub16(done, dd);
948                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
949                ds = vis_fmul8x16(ff, ds);
950                dd = vis_fpadd16(dd, ds);
951                ff = vis_fpack16(dd);
952                vis_st_u8(D64_FROM_F32x2(ff), dst);
953
954                dst++;
955                src++;
956            }
957
            /* Main loop: four pixels per iteration.  GET_COEF is invoked
             * for pixels 3, 2, 1, 0 so that the four 16-bit coefficients
             * are accumulated in srcAx4 / divAx4. */
958#pragma pipeloop(0)
959            for (; dst <= (dst_end - 4); dst += 4) {
960                GET_COEF(3);
961                GET_COEF(2);
962                GET_COEF(1);
963                GET_COEF(0);
964                pMask += 4;
                /* Source weight = srcA table entry times the resA divide
                 * entry; destination weight is its complement to 'done'. */
965                srcAx4 = FMUL_16x16(srcAx4, divAx4);
966                dstAx4 = vis_fpsub16(done, srcAx4);
967
                /* Two rounds of byte merging regroup the four pixels by
                 * channel: ar holds the first/second bytes, gb the
                 * third/fourth bytes of the four pixels. */
968                s02 = vis_fpmerge(src[0], src[2]);
969                s13 = vis_fpmerge(src[1], src[3]);
970                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
971                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
972                GRAY_S16(dd, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
973                dd = vis_fmul8x16(fscale, dd);
974                ff = vis_fpack16(dd);
975
                /* dst = dst * dstWeight + srcGray * srcWeight, 4 lanes. */
976                dd = vis_fmul8x16(*(mlib_f32*)dst, dstAx4);
977                ds = vis_fmul8x16(ff, srcAx4);
978                dd = vis_fpadd16(dd, ds);
979                *(mlib_f32*)dst = vis_fpack16(dd);
980
981                src += 4;
982            }
983
            /* Tail: remaining 0..3 pixels, same code as the head loop. */
984            while (dst < dst_end) {
985                pathA = *pMask++;
986                srcA = *(mlib_u8*)src;
987                srcA = mul8table[extraA][srcA];
988                dstF = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
989                srcF = mul8table[pathA][srcFbase];
990                dstA = 0xff - pathA + mul8table[pathA][dstF];
991                srcA = mul8table[srcF][srcA];
992                resA = srcA + dstA;
993
994                r = vis_ld_u8((mlib_u8*)src + 1);
995                g = vis_ld_u8((mlib_u8*)src + 2);
996                b = vis_ld_u8((mlib_u8*)src + 3);
997                GRAY_S16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
998                dd = vis_fmul8x16(fscale, dd);
999                ff = vis_fpack16(dd);
1000
1001                dd = vis_freg_pair(vis_fzeros(),
1002                                   ((mlib_f32*)vis_mul8s_tbl)[dstA]);
1003                DIV_ALPHA(dd, resA);
1004                ds = vis_fpsub16(done, dd);
1005                dd = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dd);
1006                ds = vis_fmul8x16(ff, ds);
1007                dd = vis_fpadd16(dd, ds);
1008                ff = vis_fpack16(dd);
1009                vis_st_u8(D64_FROM_F32x2(ff), dst);
1010
1011                dst++;
1012                src++;
1013            }
1014
            /* Step the base pointers to the next row. */
1015            PTR_ADD(dstBase, dstScan);
1016            PTR_ADD(srcBase, srcScan);
1017            PTR_ADD(pMask,  maskScan);
1018        }
1019    } else {
1020
        /* No mask: contiguous source and destination rows are merged into
         * one long row. */
1021        if (dstScan == width && srcScan == 4*width) {
1022            width *= height;
1023            height = 1;
1024        }
1025
1026        for (j = 0; j < height; j++) {
1027            mlib_f32 *src = srcBase;
1028            mlib_u8  *dst = dstBase;
1029
1030            dst_end = dst + width;
1031
            /* One pixel per iteration; the blend itself is done with the
             * mul8table / div8table lookup tables. */
1032            while (dst < dst_end) {
                /* Without a mask the destination factor is used directly
                 * as dstA and srcFbase directly as the source factor. */
1033                srcA = *(mlib_u8*)src;
1034                srcA = mul8table[extraA][srcA];
1035                dstA = ((((srcA) & DstOpAnd) ^ DstOpXor) + DstOpAdd);
1036                srcA = mul8table[srcFbase][srcA];
1037                resA = srcA + dstA;
1038
1039                r = vis_ld_u8((mlib_u8*)src + 1);
1040                g = vis_ld_u8((mlib_u8*)src + 2);
1041                b = vis_ld_u8((mlib_u8*)src + 3);
1042                GRAY_S16(dd, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
1043                dd = vis_fmul8x16(fscale, dd);
1044                ff = vis_fpack16(dd);
1045
                /* The source gray value is the last byte in memory of the
                 * packed result ff. */
1046                resG = mul8table[dstA][*dst] +
1047                       mul8table[srcA][((mlib_u8*)&ff)[3]];
1048                *dst = div8table[resA][resG];
1049
1050                dst++;
1051                src++;
1052            }
1053
1054            PTR_ADD(dstBase, dstScan);
1055            PTR_ADD(srcBase, srcScan);
1056        }
1057    }
1058}
1059
1060/***************************************************************/
1061
1062void ADD_SUFF(IntRgbToByteGrayAlphaMaskBlit)(MASKBLIT_PARAMS)
1063{
1064    mlib_s32 extraA;
1065    mlib_s32 dstScan = pDstInfo->scanStride;
1066    mlib_s32 srcScan = pSrcInfo->scanStride;
1067    mlib_u8  *dst_end;
1068    mlib_d64 srcA_d, dstA_d, dd, d0, d1;
1069    mlib_s32 i, j, srcG;
1070    mlib_s32 SrcOpAnd, SrcOpXor, SrcOpAdd;
1071    mlib_s32 DstOpAnd, DstOpXor, DstOpAdd;
1072    mlib_s32 pathA, srcFbase, dstFbase, resA, resG, srcA, dstA;
1073
1074    RGB_VARS;
1075
1076    SrcOpAnd = (AlphaRules[pCompInfo->rule].srcOps).andval;
1077    SrcOpXor = (AlphaRules[pCompInfo->rule].srcOps).xorval;
1078    SrcOpAdd =
1079        (jint) (AlphaRules[pCompInfo->rule].srcOps).addval - SrcOpXor;
1080
1081    DstOpAnd = (AlphaRules[pCompInfo->rule].dstOps).andval;
1082    DstOpXor = (AlphaRules[pCompInfo->rule].dstOps).xorval;
1083    DstOpAdd =
1084        (jint) (AlphaRules[pCompInfo->rule].dstOps).addval - DstOpXor;
1085
1086    extraA = (mlib_s32)(pCompInfo->details.extraAlpha * 255.0 + 0.5);
1087
1088    srcFbase = ((((0xff) & SrcOpAnd) ^ SrcOpXor) + SrcOpAdd);
1089    dstFbase = (((extraA & DstOpAnd) ^ DstOpXor) + DstOpAdd);
1090
1091    srcFbase = mul8table[srcFbase][extraA];
1092
1093    if (width < 16) {
1094        if (pMask != NULL) {
1095            pMask += maskOff;
1096
1097            for (j = 0; j < height; j++) {
1098                mlib_u8 *dst = dstBase;
1099                mlib_u8 *src = srcBase;
1100
1101                for (i = 0; i < width; i++) {
1102                    pathA = pMask[i];
1103                    dstA = 0xff - pathA + mul8table[dstFbase][pathA];
1104                    srcA = mul8table[srcFbase][pathA];
1105                    resA = srcA + dstA;
1106
1107                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
1108                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
1109                    resG = div8table[resA][resG];
1110                    dst[i] = resG;
1111                }
1112
1113                PTR_ADD(dstBase, dstScan);
1114                PTR_ADD(srcBase, srcScan);
1115                PTR_ADD(pMask,  maskScan);
1116            }
1117        } else {
1118            dstA = dstFbase;
1119            srcA = srcFbase;
1120            resA = srcA + dstA;
1121
1122            for (j = 0; j < height; j++) {
1123                mlib_u8 *dst = dstBase;
1124                mlib_u8 *src = srcBase;
1125
1126                for (i = 0; i < width; i++) {
1127                    srcG = RGB2GRAY(src[4*i + 1], src[4*i + 2], src[4*i + 3]);
1128                    resG = mul8table[dstA][dst[i]] + mul8table[srcA][srcG];
1129                    resG = div8table[resA][resG];
1130                    dst[i] = resG;
1131                }
1132
1133                PTR_ADD(dstBase, dstScan);
1134                PTR_ADD(srcBase, srcScan);
1135            }
1136        }
1137        return;
1138    }
1139
1140    if (pMask != NULL) {
1141        mlib_s32 srcA_buff[256];
1142        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
1143        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);
1144
1145        srcA_buff[0] = 0;
1146#pragma pipeloop(0)
1147        for (pathA = 1; pathA < 256; pathA++) {
1148            dstA = 0xff - pathA + mul8table[dstFbase][pathA];
1149            srcA = mul8table[srcFbase][pathA];
1150            resA = dstA + srcA;
1151            ddiv = dscale*vis_d64_div_tbl[resA];
1152            srcA_buff[pathA] = srcA*ddiv + (1 << 15);
1153        }
1154
1155        pMask += maskOff;
1156        maskScan -= width;
1157
1158        if (dstScan == width && srcScan == 4*width && maskScan == width) {
1159            width *= height;
1160            height = 1;
1161        }
1162
1163        for (j = 0; j < height; j++) {
1164            mlib_f32 *src = srcBase;
1165            mlib_u8  *dst = dstBase;
1166
1167            dst_end = dst + width;
1168
1169            while (((mlib_s32)dst & 3) && dst < dst_end) {
1170                pathA = *pMask++;
1171                srcA_d = vis_ld_u16(srcA_buff + pathA);
1172                dstA_d = vis_fpsub16(d_one, srcA_d);
1173                r = vis_ld_u8((mlib_u8*)src + 1);
1174                g = vis_ld_u8((mlib_u8*)src + 2);
1175                b = vis_ld_u8((mlib_u8*)src + 3);
1176                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
1177                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1178                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
1179                dd = vis_fpadd16(d0, d1);
1180                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
1181                dst++;
1182                src++;
1183            }
1184
1185#pragma pipeloop(0)
1186            for (; dst <= (dst_end - 4); dst += 4) {
1187                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[3]);
1188                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[2]);
1189                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[1]);
1190                LOAD_NEXT_U16(srcA_d, srcA_buff + pMask[0]);
1191                dstA_d = vis_fpsub16(d_one, srcA_d);
1192                pMask += 4;
1193
1194                s02 = vis_fpmerge(src[0], src[2]);
1195                s13 = vis_fpmerge(src[1], src[3]);
1196                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
1197                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
1198                GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
1199                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1200                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
1201                *(mlib_f32*)dst = vis_fpack16(dd);
1202                src += 4;
1203            }
1204
1205            while (dst < dst_end) {
1206                pathA = *pMask++;
1207                srcA_d = vis_ld_u16(srcA_buff + pathA);
1208                dstA_d = vis_fpsub16(d_one, srcA_d);
1209                r = vis_ld_u8((mlib_u8*)src + 1);
1210                g = vis_ld_u8((mlib_u8*)src + 2);
1211                b = vis_ld_u8((mlib_u8*)src + 3);
1212                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
1213                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1214                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
1215                dd = vis_fpadd16(d0, d1);
1216                ff = vis_fpack16(dd);
1217                vis_st_u8(D64_FROM_F32x2(ff), dst);
1218                dst++;
1219                src++;
1220            }
1221
1222            PTR_ADD(dstBase, dstScan);
1223            PTR_ADD(srcBase, srcScan);
1224            PTR_ADD(pMask,  maskScan);
1225        }
1226    } else {
1227        mlib_d64 dscale = (mlib_d64)(1 << 15)*(1 << 16), ddiv;
1228        mlib_d64 d_one = vis_to_double_dup(0x7FFF7FFF);
1229
1230        dstA = dstFbase;
1231        srcA = srcFbase;
1232        resA = dstA + srcA;
1233        ddiv = dscale*vis_d64_div_tbl[resA];
1234        srcA = (mlib_s32)(srcA*ddiv + (1 << 15)) >> 16;
1235        srcA_d = vis_to_double_dup((srcA << 16) | srcA);
1236        dstA_d = vis_fpsub16(d_one, srcA_d);
1237
1238        if (dstScan == width && srcScan == 4*width) {
1239            width *= height;
1240            height = 1;
1241        }
1242
1243        for (j = 0; j < height; j++) {
1244            mlib_f32 *src = srcBase;
1245            mlib_u8  *dst = dstBase;
1246
1247            dst_end = dst + width;
1248
1249            while (((mlib_s32)dst & 3) && dst < dst_end) {
1250                r = vis_ld_u8((mlib_u8*)src + 1);
1251                g = vis_ld_u8((mlib_u8*)src + 2);
1252                b = vis_ld_u8((mlib_u8*)src + 3);
1253                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
1254                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1255                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
1256                dd = vis_fpadd16(d0, d1);
1257                vis_st_u8(D64_FROM_F32x2(vis_fpack16(dd)), dst);
1258                dst++;
1259                src++;
1260            }
1261
1262#pragma pipeloop(0)
1263            for (; dst <= (dst_end - 4); dst += 4) {
1264                s02 = vis_fpmerge(src[0], src[2]);
1265                s13 = vis_fpmerge(src[1], src[3]);
1266                ar = vis_fpmerge(vis_read_hi(s02), vis_read_hi(s13));
1267                gb = vis_fpmerge(vis_read_lo(s02), vis_read_lo(s13));
1268                GRAY_U8(ff, vis_read_lo(ar), vis_read_hi(gb), vis_read_lo(gb));
1269                dd = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1270                dd = vis_fpadd16(vis_fmul8x16(*(mlib_f32*)dst, dstA_d), dd);
1271                *(mlib_f32*)dst = vis_fpack16(dd);
1272                src += 4;
1273            }
1274
1275            while (dst < dst_end) {
1276                r = vis_ld_u8((mlib_u8*)src + 1);
1277                g = vis_ld_u8((mlib_u8*)src + 2);
1278                b = vis_ld_u8((mlib_u8*)src + 3);
1279                GRAY_U8(ff, vis_read_lo(r), vis_read_lo(g), vis_read_lo(b));
1280                d0 = vis_fpadd16(vis_fmul8x16(ff, srcA_d), d_half);
1281                d1 = vis_fmul8x16(vis_read_lo(vis_ld_u8(dst)), dstA_d);
1282                dd = vis_fpadd16(d0, d1);
1283                ff = vis_fpack16(dd);
1284                vis_st_u8(D64_FROM_F32x2(ff), dst);
1285                dst++;
1286                src++;
1287            }
1288
1289            PTR_ADD(dstBase, dstScan);
1290            PTR_ADD(srcBase, srcScan);
1291        }
1292    }
1293}
1294
1295/***************************************************************/
1296
1297#endif
1298