1/*
2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28#include "vis_proto.h"
29#include "mlib_image.h"
30#include "mlib_v_ImageLookUpFunc.h"
31
32/***************************************************************/
33static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(const mlib_u8  *src,
34                                                    mlib_u8        *dst,
35                                                    mlib_s32       xsize,
36                                                    const mlib_u16 *table);
37
38static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(const mlib_u8  *src,
39                                                    mlib_u8        *dst,
40                                                    mlib_s32       xsize,
41                                                    const mlib_u16 *table);
42
43static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(const mlib_u8  *src,
44                                                    mlib_u8        *dst,
45                                                    mlib_s32       xsize,
46                                                    const mlib_u16 *table);
47
48static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(const mlib_u8  *src,
49                                                    mlib_u8        *dst,
50                                                    mlib_s32       xsize,
51                                                    const mlib_u16 *table);
52
53static void mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(const mlib_u8  *src,
54                                                     mlib_u8        *dst,
55                                                     mlib_s32       xsize,
56                                                     const mlib_u16 *table);
57
58static void mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(const mlib_u8 *src,
59                                                       mlib_u8       *dst,
60                                                       mlib_s32      xsize,
61                                                       const mlib_u8 **table);
62
63static void mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(const mlib_u8 *src,
64                                                  mlib_u8       *dst,
65                                                  mlib_s32      xsize,
66                                                  const mlib_u8 **table);
67
68static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(const mlib_u8  *src,
69                                                    mlib_u8        *dst,
70                                                    mlib_s32       xsize,
71                                                    const mlib_d64 *table);
72
73static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(const mlib_u8  *src,
74                                                    mlib_u8        *dst,
75                                                    mlib_s32       xsize,
76                                                    const mlib_d64 *table);
77
78static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(const mlib_u8  *src,
79                                                    mlib_u8        *dst,
80                                                    mlib_s32       xsize,
81                                                    const mlib_d64 *table);
82
83static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(const mlib_u8  *src,
84                                                    mlib_u8        *dst,
85                                                    mlib_s32       xsize,
86                                                    const mlib_d64 *table);
87
88static void mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(const mlib_u8 *src,
89                                                  mlib_u8       *dst,
90                                                  mlib_s32      xsize,
91                                                  const mlib_u8 **table);
92
93static void mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(const mlib_u8  *src,
94                                                    mlib_u8        *dst,
95                                                    mlib_s32       xsize,
96                                                    const mlib_f32 *table);
97
98static void mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(const mlib_u8  *src,
99                                                     mlib_u8        *dst,
100                                                     mlib_s32       xsize,
101                                                     const mlib_f32 *table);
102
103static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(const mlib_u8 *src,
104                                                          mlib_u8       *dst,
105                                                          mlib_s32      xsize,
106                                                          const mlib_u8 **table);
107
108static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(const mlib_u8 *src,
109                                                          mlib_u8       *dst,
110                                                          mlib_s32      xsize,
111                                                          const mlib_u8 **table);
112
113static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(const mlib_u8 *src,
114                                                          mlib_u8       *dst,
115                                                          mlib_s32      xsize,
116                                                          const mlib_u8 **table);
117
118static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(const mlib_u8 *src,
119                                                          mlib_u8       *dst,
120                                                          mlib_s32      xsize,
121                                                          const mlib_u8 **table);
122
123/***************************************************************/
/*
 * Thin wrappers around the VIS indexed loads: the base pointer is cast to
 * (void *) so that const-qualified table pointers can be passed, and the
 * second argument is forwarded unchanged as the offset.
 */
#define VIS_LD_U8_I(BASE, OFFSET)       vis_ld_u8_i((void *)(BASE), (OFFSET))
#define VIS_LD_U16_I(BASE, OFFSET)      vis_ld_u16_i((void *)(BASE), (OFFSET))
126
127/***************************************************************/
128void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(const mlib_u8  *src,
129                                             mlib_u8        *dst,
130                                             mlib_s32       xsize,
131                                             const mlib_u16 *table)
132{
133  mlib_u32 *sa;          /* aligned pointer to source data */
134  mlib_u8  *sp;          /* pointer to source data */
135  mlib_u32 s0;           /* source data */
136  mlib_u16 *dl;          /* pointer to start of destination */
137  mlib_u16 *dend;        /* pointer to end of destination */
138  mlib_d64 *dp;          /* aligned pointer to destination */
139  mlib_d64 t0, t1, t2;   /* destination data */
140  mlib_d64 t3, acc;      /* destination data */
141  mlib_s32 emask;        /* edge mask */
142  mlib_s32 i, num;       /* loop variable */
143
144  sa   = (mlib_u32*)src;
145  dl   = (mlib_u16*)dst;
146  dp   = (mlib_d64 *) dl;
147  dend = dl + xsize - 1;
148
149  vis_alignaddr((void *) 0, 6);
150
151  if (xsize >= 4) {
152
153    s0 = sa[0];
154    sa ++;
155
156#pragma pipeloop(0)
157    for(i = 0; i <= xsize - 8; i+=4, sa++) {
158      t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
159      t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
160      t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
161      t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
162      acc = vis_faligndata(t3, acc);
163      acc = vis_faligndata(t2, acc);
164      acc = vis_faligndata(t1, acc);
165      acc = vis_faligndata(t0, acc);
166      s0 = sa[0];
167      *dp++ = acc;
168    }
169
170    t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
171    t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
172    t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
173    t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
174    acc = vis_faligndata(t3, acc);
175    acc = vis_faligndata(t2, acc);
176    acc = vis_faligndata(t1, acc);
177    acc = vis_faligndata(t0, acc);
178    *dp++ = acc;
179  }
180
181  sp = (mlib_u8*)sa;
182
183  if ((mlib_addr) dp <= (mlib_addr) dend) {
184
185    num = (mlib_u16*) dend - (mlib_u16*) dp;
186    sp  += num;
187    num ++;
188#pragma pipeloop(0)
189    for (i = 0; i < num; i ++) {
190      s0 = (mlib_s32) *sp;
191      sp --;
192
193      t0  = VIS_LD_U16_I(table, 2*s0);
194      acc = vis_faligndata(t0, acc);
195    }
196
197    emask = vis_edge16(dp, dend);
198    vis_pst_16(acc, dp, emask);
199  }
200}
201
202/***************************************************************/
203void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(const mlib_u8  *src,
204                                             mlib_u8        *dst,
205                                             mlib_s32       xsize,
206                                             const mlib_u16 *table)
207{
208  mlib_u32 *sa;          /* aligned pointer to source data */
209  mlib_u8  *sp;          /* pointer to source data */
210  mlib_u32 s0, s1;       /* source data */
211  mlib_u16 *dl;          /* pointer to start of destination */
212  mlib_u16 *dend;        /* pointer to end of destination */
213  mlib_d64 *dp;          /* aligned pointer to destination */
214  mlib_d64 t0, t1, t2;   /* destination data */
215  mlib_d64 t3, acc;      /* destination data */
216  mlib_s32 emask;        /* edge mask */
217  mlib_s32 i, num;       /* loop variable */
218
219  sa   = (mlib_u32*)(src-1);
220  dl   = (mlib_u16*)dst;
221  dp   = (mlib_d64 *) dl;
222  dend = dl + xsize - 1;
223
224  vis_alignaddr((void *) 0, 6);
225
226  s0 = *sa++;
227
228  if (xsize >= 4) {
229
230    s1 = sa[0];
231    sa ++;
232
233#pragma pipeloop(0)
234    for(i = 0; i <= xsize - 8; i+=4, sa++) {
235      t3 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
236      t2 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
237      t1 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
238      t0 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
239      acc = vis_faligndata(t3, acc);
240      acc = vis_faligndata(t2, acc);
241      acc = vis_faligndata(t1, acc);
242      acc = vis_faligndata(t0, acc);
243      s0 = s1;
244      s1 = sa[0];
245      *dp++ = acc;
246    }
247
248    t3 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
249    t2 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
250    t1 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
251    t0 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
252    acc = vis_faligndata(t3, acc);
253    acc = vis_faligndata(t2, acc);
254    acc = vis_faligndata(t1, acc);
255    acc = vis_faligndata(t0, acc);
256    *dp++ = acc;
257  }
258
259  sp = (mlib_u8*)sa;
260  sp -= 3;
261
262  if ((mlib_addr) dp <= (mlib_addr) dend) {
263
264    num = (mlib_u16*) dend - (mlib_u16*) dp;
265    sp  += num;
266    num ++;
267#pragma pipeloop(0)
268    for (i = 0; i < num; i ++) {
269      s0 = (mlib_s32) *sp;
270      sp --;
271
272      t0  = VIS_LD_U16_I(table, 2*s0);
273      acc = vis_faligndata(t0, acc);
274    }
275
276    emask = vis_edge16(dp, dend);
277    vis_pst_16(acc, dp, emask);
278  }
279}
280
281/***************************************************************/
282void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(const mlib_u8  *src,
283                                             mlib_u8        *dst,
284                                             mlib_s32       xsize,
285                                             const mlib_u16 *table)
286{
287  mlib_u32 *sa;          /* pointer to source data */
288  mlib_u8  *sp;          /* pointer to source data */
289  mlib_u32 s0, s1;       /* source data */
290  mlib_u16 *dl;          /* pointer to start of destination */
291  mlib_u16 *dend;        /* pointer to end of destination */
292  mlib_d64 *dp;          /* aligned pointer to destination */
293  mlib_d64 t0, t1, t2;   /* destination data */
294  mlib_d64 t3, acc;      /* destination data */
295  mlib_s32 emask;        /* edge mask */
296  mlib_s32 i, num;       /* loop variable */
297
298  sa   = (mlib_u32*)(src-2);
299  dl   = (mlib_u16*)dst;
300  dp   = (mlib_d64 *) dl;
301  dend = dl + xsize - 1;
302
303  vis_alignaddr((void *) 0, 6);
304
305  s0 = *sa++;
306
307  if (xsize >= 4) {
308
309    s1 = sa[0];
310    sa ++;
311
312#pragma pipeloop(0)
313    for(i = 0; i <= xsize - 8; i+=4, sa++) {
314      t3 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
315      t2 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
316      t1 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
317      t0 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
318      acc = vis_faligndata(t3, acc);
319      acc = vis_faligndata(t2, acc);
320      acc = vis_faligndata(t1, acc);
321      acc = vis_faligndata(t0, acc);
322      s0 = s1;
323      s1 = sa[0];
324      *dp++ = acc;
325    }
326
327    t3 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
328    t2 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
329    t1 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
330    t0 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
331    acc = vis_faligndata(t3, acc);
332    acc = vis_faligndata(t2, acc);
333    acc = vis_faligndata(t1, acc);
334    acc = vis_faligndata(t0, acc);
335    *dp++ = acc;
336  }
337
338  sp = (mlib_u8*)sa;
339  sp -= 2;
340
341  if ((mlib_addr) dp <= (mlib_addr) dend) {
342
343    num = (mlib_u16*) dend - (mlib_u16*) dp;
344    sp  += num;
345    num ++;
346#pragma pipeloop(0)
347    for (i = 0; i < num; i ++) {
348      s0 = (mlib_s32) *sp;
349      sp --;
350
351      t0  = VIS_LD_U16_I(table, 2*s0);
352      acc = vis_faligndata(t0, acc);
353    }
354
355    emask = vis_edge16(dp, dend);
356    vis_pst_16(acc, dp, emask);
357  }
358}
359
360/***************************************************************/
361void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(const mlib_u8  *src,
362                                             mlib_u8        *dst,
363                                             mlib_s32       xsize,
364                                             const mlib_u16 *table)
365{
366  mlib_u32 *sa;          /* aligned pointer to source data */
367  mlib_u8  *sp;          /* pointer to source data */
368  mlib_u32 s0, s1;       /* source data */
369  mlib_u16 *dl;          /* pointer to start of destination */
370  mlib_u16 *dend;        /* pointer to end of destination */
371  mlib_d64 *dp;          /* aligned pointer to destination */
372  mlib_d64 t0, t1, t2;   /* destination data */
373  mlib_d64 t3, acc;      /* destination data */
374  mlib_s32 emask;        /* edge mask */
375  mlib_s32 i, num;       /* loop variable */
376
377  sa   = (mlib_u32*)(src-3);
378  dl   = (mlib_u16*)dst;
379  dp   = (mlib_d64 *) dl;
380  dend = dl + xsize - 1;
381
382  vis_alignaddr((void *) 0, 6);
383
384  s0 = *sa++;
385
386  if (xsize >= 4) {
387
388    s1 = sa[0];
389    sa ++;
390
391#pragma pipeloop(0)
392    for(i = 0; i <= xsize - 8; i+=4, sa++) {
393      t3 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
394      t2 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
395      t1 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
396      t0 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
397      acc = vis_faligndata(t3, acc);
398      acc = vis_faligndata(t2, acc);
399      acc = vis_faligndata(t1, acc);
400      acc = vis_faligndata(t0, acc);
401      s0 = s1;
402      s1 = sa[0];
403      *dp++ = acc;
404    }
405
406    t3 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
407    t2 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
408    t1 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
409    t0 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
410    acc = vis_faligndata(t3, acc);
411    acc = vis_faligndata(t2, acc);
412    acc = vis_faligndata(t1, acc);
413    acc = vis_faligndata(t0, acc);
414    *dp++ = acc;
415  }
416
417  sp = (mlib_u8*)sa;
418  sp -= 1;
419
420  if ((mlib_addr) dp <= (mlib_addr) dend) {
421
422    num = (mlib_u16*) dend - (mlib_u16*) dp;
423    sp  += num;
424    num ++;
425#pragma pipeloop(0)
426    for (i = 0; i < num; i ++) {
427      s0 = (mlib_s32) *sp;
428      sp --;
429
430      t0  = VIS_LD_U16_I(table, 2*s0);
431      acc = vis_faligndata(t0, acc);
432    }
433
434    emask = vis_edge16(dp, dend);
435    vis_pst_16(acc, dp, emask);
436  }
437}
438
439/***************************************************************/
440void mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(const mlib_u8  *src,
441                                              mlib_u8        *dst,
442                                              mlib_s32       xsize,
443                                              const mlib_u16 *table)
444{
445  mlib_u32 *sa;             /* aligned pointer to source data */
446  mlib_u8  *sp;             /* pointer to source data */
447  mlib_u32 s0, s1, s2, s3;  /* source data */
448  mlib_u8  *dl;             /* pointer to start of destination */
449  mlib_u8  *dend;           /* pointer to end of destination */
450  mlib_d64 *dp;             /* aligned pointer to destination */
451  mlib_d64 t0, t1, t2;      /* destination data */
452  mlib_d64 t3, t4, t5;      /* destination data */
453  mlib_d64 t6, t7, acc0;    /* destination data */
454  mlib_d64 acc1, acc2;      /* destination data */
455  mlib_d64 acc3, acc4;      /* destination data */
456  mlib_s32 emask;           /* edge mask */
457  mlib_s32 i, num;          /* loop variable */
458  mlib_s32 off;             /* offset */
459
460  sa   = (mlib_u32*)src;
461  dl   = dst;
462  sp   = (void *)src;
463  dend = dl + 2*xsize - 1;
464  dp   = (mlib_d64 *) ((mlib_addr) dl & (~7));
465  off  = (mlib_addr) dp - (mlib_addr) dl;
466
467  emask = vis_edge8(dl, dend);
468  num = (xsize < 4) ? xsize : 4;
469
470  sp += (num-1);
471
472  vis_alignaddr(dp, 6);
473
474  for (i = 0; i < num; i ++) {
475    s0 = (mlib_s32) *sp;
476    sp --;
477
478    t0  = VIS_LD_U16_I(table, 2*s0);
479    acc0 = vis_faligndata(t0, acc0);
480  }
481
482  vis_alignaddr(dp, off);
483  vis_pst_8(vis_faligndata(acc0, acc0), dp++, emask);
484
485  sa++;
486
487  xsize -= 4;
488
489  i = 0;
490
491  if (xsize >= 16) {
492
493    s0 = sa[0];
494    s1 = sa[1];
495    s2 = sa[2];
496    s3 = sa[3];
497    sa += 4;
498
499#pragma pipeloop(0)
500    for(i = 0; i <= xsize - 32; i+=16, sa+=4) {
501      vis_alignaddr(dp, 6);
502      t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
503      t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
504      t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
505      t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
506      acc1 = vis_faligndata(t3, acc1);
507      acc1 = vis_faligndata(t2, acc1);
508      acc1 = vis_faligndata(t1, acc1);
509      acc1 = vis_faligndata(t0, acc1);
510      t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
511      t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
512      t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
513      t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
514      acc2 = vis_faligndata(t7, acc2);
515      acc2 = vis_faligndata(t6, acc2);
516      acc2 = vis_faligndata(t5, acc2);
517      acc2 = vis_faligndata(t4, acc2);
518      t3 = VIS_LD_U16_I(table, (s2 << 1) & 0x1FE);
519      t2 = VIS_LD_U16_I(table, (s2 >> 7) & 0x1FE);
520      t1 = VIS_LD_U16_I(table, (s2 >> 15) & 0x1FE);
521      t0 = VIS_LD_U16_I(table, (s2 >> 23) & 0x1FE);
522      acc3 = vis_faligndata(t3, acc3);
523      acc3 = vis_faligndata(t2, acc3);
524      acc3 = vis_faligndata(t1, acc3);
525      acc3 = vis_faligndata(t0, acc3);
526      t7 = VIS_LD_U16_I(table, (s3 << 1) & 0x1FE);
527      t6 = VIS_LD_U16_I(table, (s3 >> 7) & 0x1FE);
528      t5 = VIS_LD_U16_I(table, (s3 >> 15) & 0x1FE);
529      t4 = VIS_LD_U16_I(table, (s3 >> 23) & 0x1FE);
530      acc4 = vis_faligndata(t7, acc4);
531      acc4 = vis_faligndata(t6, acc4);
532      acc4 = vis_faligndata(t5, acc4);
533      acc4 = vis_faligndata(t4, acc4);
534      vis_alignaddr(dp, off);
535      s0 = sa[0];
536      s1 = sa[1];
537      s2 = sa[2];
538      s3 = sa[3];
539      *dp++ = vis_faligndata(acc0, acc1);
540      *dp++ = vis_faligndata(acc1, acc2);
541      *dp++ = vis_faligndata(acc2, acc3);
542      *dp++ = vis_faligndata(acc3, acc4);
543      acc0 = acc4;
544    }
545
546    vis_alignaddr(dp, 6);
547    t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
548    t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
549    t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
550    t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
551    acc1 = vis_faligndata(t3, acc1);
552    acc1 = vis_faligndata(t2, acc1);
553    acc1 = vis_faligndata(t1, acc1);
554    acc1 = vis_faligndata(t0, acc1);
555    t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
556    t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
557    t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
558    t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
559    acc2 = vis_faligndata(t7, acc2);
560    acc2 = vis_faligndata(t6, acc2);
561    acc2 = vis_faligndata(t5, acc2);
562    acc2 = vis_faligndata(t4, acc2);
563    t3 = VIS_LD_U16_I(table, (s2 << 1) & 0x1FE);
564    t2 = VIS_LD_U16_I(table, (s2 >> 7) & 0x1FE);
565    t1 = VIS_LD_U16_I(table, (s2 >> 15) & 0x1FE);
566    t0 = VIS_LD_U16_I(table, (s2 >> 23) & 0x1FE);
567    acc3 = vis_faligndata(t3, acc3);
568    acc3 = vis_faligndata(t2, acc3);
569    acc3 = vis_faligndata(t1, acc3);
570    acc3 = vis_faligndata(t0, acc3);
571    t7 = VIS_LD_U16_I(table, (s3 << 1) & 0x1FE);
572    t6 = VIS_LD_U16_I(table, (s3 >> 7) & 0x1FE);
573    t5 = VIS_LD_U16_I(table, (s3 >> 15) & 0x1FE);
574    t4 = VIS_LD_U16_I(table, (s3 >> 23) & 0x1FE);
575    acc4 = vis_faligndata(t7, acc4);
576    acc4 = vis_faligndata(t6, acc4);
577    acc4 = vis_faligndata(t5, acc4);
578    acc4 = vis_faligndata(t4, acc4);
579    vis_alignaddr(dp, off);
580    *dp++ = vis_faligndata(acc0, acc1);
581    *dp++ = vis_faligndata(acc1, acc2);
582    *dp++ = vis_faligndata(acc2, acc3);
583    *dp++ = vis_faligndata(acc3, acc4);
584    acc0 = acc4; i+=16;
585  }
586
587  if (i <= xsize - 8) {
588    s0 = sa[0];
589    s1 = sa[1];
590    vis_alignaddr(dp, 6);
591    t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
592    t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
593    t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
594    t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
595    acc1 = vis_faligndata(t3, acc1);
596    acc1 = vis_faligndata(t2, acc1);
597    acc1 = vis_faligndata(t1, acc1);
598    acc1 = vis_faligndata(t0, acc1);
599    t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
600    t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
601    t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
602    t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
603    acc2 = vis_faligndata(t7, acc2);
604    acc2 = vis_faligndata(t6, acc2);
605    acc2 = vis_faligndata(t5, acc2);
606    acc2 = vis_faligndata(t4, acc2);
607    vis_alignaddr(dp, off);
608    *dp++ = vis_faligndata(acc0, acc1);
609    *dp++ = vis_faligndata(acc1, acc2);
610    acc0 = acc2; i += 8; sa += 2;
611  }
612
613  if (i <= xsize - 4) {
614    s0 = *sa++;
615    vis_alignaddr(dp, 6);
616    t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
617    t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
618    t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
619    t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
620    acc1 = vis_faligndata(t3, acc1);
621    acc1 = vis_faligndata(t2, acc1);
622    acc1 = vis_faligndata(t1, acc1);
623    acc1 = vis_faligndata(t0, acc1);
624    vis_alignaddr(dp, off);
625    *dp++ = vis_faligndata(acc0, acc1);
626    acc0 = acc1;
627  }
628
629  sp = (mlib_u8*)sa;
630
631  if ((mlib_addr) dp <= (mlib_addr) dend) {
632
633    num = (((mlib_u8*) dend - (mlib_u8*) dp) + off + 1) >> 1;
634    sp  += (num - 1);
635    vis_alignaddr(dp, 6);
636#pragma pipeloop(0)
637    for (i = 0; i < num; i ++) {
638      s0 = (mlib_s32) *sp;
639      sp --;
640
641      t0  = VIS_LD_U16_I(table, 2*s0);
642      acc1 = vis_faligndata(t0, acc1);
643    }
644
645    vis_alignaddr(dp, off);
646    emask = vis_edge8(dp, dend);
647    vis_pst_8(vis_faligndata(acc0, acc1), dp++, emask);
648  }
649
650  if ((mlib_addr) dp <= (mlib_addr) dend) {
651    emask = vis_edge8(dp, dend);
652    vis_pst_8(vis_faligndata(acc1, acc1), dp++, emask);
653  }
654}
655
656/***************************************************************/
657void mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(const mlib_u8 *src,
658                                                mlib_u8       *dst,
659                                                mlib_s32      xsize,
660                                                const mlib_u8 **table)
661{
662  mlib_u8  *sp;              /* pointer to source data */
663  mlib_u32 s0, s1, s2, s3;   /* source data */
664  mlib_u16 *dl;              /* pointer to start of destination */
665  mlib_u16 *dend;            /* pointer to end of destination */
666  mlib_d64 *dp;              /* aligned pointer to destination */
667  mlib_d64 t0, t1, t2;       /* destination data */
668  mlib_d64 t3, t4, t5;       /* destination data */
669  mlib_d64 t6, t7, acc;      /* destination data */
670  mlib_s32 emask;            /* edge mask */
671  mlib_s32 i, num;           /* loop variable */
672  const mlib_u8  *tab0 = table[0];
673  const mlib_u8  *tab1 = table[1];
674
675  sp   = (void *)src;
676  dl   = (mlib_u16*)dst;
677  dp   = (mlib_d64 *) dl;
678  dend = dl + xsize - 1;
679
680  vis_alignaddr((void *) 0, 7);
681
682  if (xsize >= 4) {
683
684    s0 = sp[0];
685    s1 = sp[1];
686    s2 = sp[2];
687    s3 = sp[3];
688    sp += 4;
689
690#pragma pipeloop(0)
691    for(i = 0; i <= xsize - 8; i+=4, sp+=4) {
692      t7 = VIS_LD_U8_I(tab1, s3);
693      t6 = VIS_LD_U8_I(tab0, s3);
694      t5 = VIS_LD_U8_I(tab1, s2);
695      t4 = VIS_LD_U8_I(tab0, s2);
696      t3 = VIS_LD_U8_I(tab1, s1);
697      t2 = VIS_LD_U8_I(tab0, s1);
698      t1 = VIS_LD_U8_I(tab1, s0);
699      t0 = VIS_LD_U8_I(tab0, s0);
700      acc = vis_faligndata(t7, acc);
701      acc = vis_faligndata(t6, acc);
702      acc = vis_faligndata(t5, acc);
703      acc = vis_faligndata(t4, acc);
704      acc = vis_faligndata(t3, acc);
705      acc = vis_faligndata(t2, acc);
706      acc = vis_faligndata(t1, acc);
707      acc = vis_faligndata(t0, acc);
708      s0 = sp[0];
709      s1 = sp[1];
710      s2 = sp[2];
711      s3 = sp[3];
712      *dp++ = acc;
713    }
714
715    t7 = VIS_LD_U8_I(tab1, s3);
716    t6 = VIS_LD_U8_I(tab0, s3);
717    t5 = VIS_LD_U8_I(tab1, s2);
718    t4 = VIS_LD_U8_I(tab0, s2);
719    t3 = VIS_LD_U8_I(tab1, s1);
720    t2 = VIS_LD_U8_I(tab0, s1);
721    t1 = VIS_LD_U8_I(tab1, s0);
722    t0 = VIS_LD_U8_I(tab0, s0);
723    acc = vis_faligndata(t7, acc);
724    acc = vis_faligndata(t6, acc);
725    acc = vis_faligndata(t5, acc);
726    acc = vis_faligndata(t4, acc);
727    acc = vis_faligndata(t3, acc);
728    acc = vis_faligndata(t2, acc);
729    acc = vis_faligndata(t1, acc);
730    acc = vis_faligndata(t0, acc);
731    *dp++ = acc;
732  }
733
734  if ((mlib_addr) dp <= (mlib_addr) dend) {
735
736    num = (mlib_u16*) dend - (mlib_u16*) dp;
737    sp  += num;
738    num ++;
739#pragma pipeloop(0)
740    for (i = 0; i < num; i ++) {
741      s0 = (mlib_s32) *sp;
742      sp --;
743
744      t0  = VIS_LD_U8_I(tab1, s0);
745      acc = vis_faligndata(t0, acc);
746
747      t0  = VIS_LD_U8_I(tab0, s0);
748      acc = vis_faligndata(t0, acc);
749    }
750
751    emask = vis_edge16(dp, dend);
752    vis_pst_16(acc, dp, emask);
753  }
754}
755
756/***************************************************************/
757void mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(const mlib_u8 *src,
758                                           mlib_u8       *dst,
759                                           mlib_s32      xsize,
760                                           const mlib_u8 **table)
761{
762  mlib_u8  *sp;                /* pointer to source data */
763  mlib_u32 s0, s1, s2, s3, s4; /* source data */
764  mlib_u8  *dl;                /* pointer to start of destination */
765  mlib_u8  *dend;              /* pointer to end of destination */
766  mlib_d64 *dp;                /* aligned pointer to destination */
767  mlib_d64 t0, t1, t2;         /* destination data */
768  mlib_d64 t3, t4, t5;         /* destination data */
769  mlib_d64 t6, t7, acc;        /* destination data */
770  mlib_s32 emask;              /* edge mask */
771  mlib_s32 i, num;             /* loop variable */
772  const mlib_u8  *tab0 = table[0];
773  const mlib_u8  *tab1 = table[1];
774
775  sp   = (void *)src;
776  dl   = dst;
777
778  dend = dl + 2 * xsize - 1;
779
780  vis_alignaddr((void *) 0, 7);
781
782  s0 = *sp++;
783  *dl++ = tab0[s0];
784  dp   = (mlib_d64 *) dl;
785  xsize--;
786
787  if (xsize >= 4) {
788
789    s1 = sp[0];
790    s2 = sp[1];
791    s3 = sp[2];
792    s4 = sp[3];
793    sp += 4;
794
795#pragma pipeloop(0)
796    for(i = 0; i <= xsize - 8; i+=4, sp+=4) {
797      t7 = VIS_LD_U8_I(tab0, s4);
798      t6 = VIS_LD_U8_I(tab1, s3);
799      t5 = VIS_LD_U8_I(tab0, s3);
800      t4 = VIS_LD_U8_I(tab1, s2);
801      t3 = VIS_LD_U8_I(tab0, s2);
802      t2 = VIS_LD_U8_I(tab1, s1);
803      t1 = VIS_LD_U8_I(tab0, s1);
804      t0 = VIS_LD_U8_I(tab1, s0);
805      acc = vis_faligndata(t7, acc);
806      acc = vis_faligndata(t6, acc);
807      acc = vis_faligndata(t5, acc);
808      acc = vis_faligndata(t4, acc);
809      acc = vis_faligndata(t3, acc);
810      acc = vis_faligndata(t2, acc);
811      acc = vis_faligndata(t1, acc);
812      acc = vis_faligndata(t0, acc);
813      s0 = s4;
814      s1 = sp[0];
815      s2 = sp[1];
816      s3 = sp[2];
817      s4 = sp[3];
818      *dp++ = acc;
819    }
820
821    t7 = VIS_LD_U8_I(tab0, s4);
822    t6 = VIS_LD_U8_I(tab1, s3);
823    t5 = VIS_LD_U8_I(tab0, s3);
824    t4 = VIS_LD_U8_I(tab1, s2);
825    t3 = VIS_LD_U8_I(tab0, s2);
826    t2 = VIS_LD_U8_I(tab1, s1);
827    t1 = VIS_LD_U8_I(tab0, s1);
828    t0 = VIS_LD_U8_I(tab1, s0);
829    acc = vis_faligndata(t7, acc);
830    acc = vis_faligndata(t6, acc);
831    acc = vis_faligndata(t5, acc);
832    acc = vis_faligndata(t4, acc);
833    acc = vis_faligndata(t3, acc);
834    acc = vis_faligndata(t2, acc);
835    acc = vis_faligndata(t1, acc);
836    acc = vis_faligndata(t0, acc);
837    s0 = s4;
838    *dp++ = acc;
839  }
840
841  num = ((mlib_u8*) dend - (mlib_u8*) dp) >> 1;
842  sp  += num;
843  num ++;
844
845#pragma pipeloop(0)
846  for (i = 0; i < num; i ++) {
847    s1 = (mlib_s32) *sp;
848    sp --;
849
850    t0  = VIS_LD_U8_I(tab1, s1);
851    acc = vis_faligndata(t0, acc);
852
853    t0  = VIS_LD_U8_I(tab0, s1);
854    acc = vis_faligndata(t0, acc);
855  }
856
857  t0  = VIS_LD_U8_I(tab1, s0);
858  acc = vis_faligndata(t0, acc);
859  emask = vis_edge8(dp, dend);
860  vis_pst_8(acc, dp, emask);
861}
862
863/***************************************************************/
864void mlib_v_ImageLookUpSI_U8_U8_2(const mlib_u8 *src,
865                                  mlib_s32      slb,
866                                  mlib_u8       *dst,
867                                  mlib_s32      dlb,
868                                  mlib_s32      xsize,
869                                  mlib_s32      ysize,
870                                  const mlib_u8 **table)
871{
872  if ((xsize * ysize) < 650) {
873    mlib_u8  *sl;
874    mlib_u8  *dl;
875    mlib_s32 i, j;
876
877    sl = (void *)src;
878    dl = dst;
879
880    /* row loop */
881    for (j = 0; j < ysize; j ++) {
882      mlib_u8 *sp = sl;
883      mlib_u8 *dp = dl;
884      mlib_s32 off, s0, size = xsize;
885
886      off = ((8 - ((mlib_addr)dp & 7)) & 7) >> 1;
887      off = (off < size) ? off : size;
888
889      for (i = 0; i < off; i++) {
890        s0 = *sp++;
891        *dp++ = table[0][s0];
892        *dp++ = table[1][s0];
893        size--;
894      }
895
896      if (size > 0) {
897
898        if (((mlib_addr)dp & 1) == 0) {
899          mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(sp, dp, size, table);
900        } else {
901          mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(sp, dp, size, table);
902        }
903      }
904
905      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
906      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
907    }
908
909  } else {
910    mlib_u8  *sl;
911    mlib_u8  *dl;
912    mlib_u16 tab[256];
913    const mlib_u8  *tab0 = table[0];
914    const mlib_u8  *tab1 = table[1];
915    mlib_s32 i, j, s0, s1, s2;
916
917    s0 = tab0[0];
918    s1 = tab1[0];
919    for (i = 1; i < 256; i++) {
920      s2 = (s0 << 8) + s1;
921      s0 = tab0[i];
922      s1 = tab1[i];
923      tab[i-1] = (mlib_u16)s2;
924    }
925
926    s2 = (s0 << 8) + s1;
927    tab[255] = (mlib_u16)s2;
928
929    sl = (void *)src;
930    dl = dst;
931
932    /* row loop */
933    for (j = 0; j < ysize; j ++) {
934      mlib_u8 *sp = sl;
935      mlib_u8 *dp = dl;
936      mlib_s32 off, s0, size = xsize;
937
938      if (((mlib_addr)dp & 1) == 0) {
939
940        off = ((8 - ((mlib_addr)dp & 7)) & 7) >> 1;
941        off = (off < size) ? off : size;
942
943        for (i = 0; i < off; i++) {
944          *(mlib_u16*)dp = tab[(*sp)];
945          dp += 2;
946          size--; sp++;
947        }
948
949        if (size > 0) {
950
951          off = (mlib_addr)sp & 3;
952
953          if (off == 0) {
954            mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(sp, dp, size, tab);
955          } else if (off == 1) {
956            mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(sp, dp, size, tab);
957          } else if (off == 2) {
958            mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(sp, dp, size, tab);
959          } else {
960            mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(sp, dp, size, tab);
961          }
962        }
963
964      } else {
965
966        off = ((4 - ((mlib_addr)sp & 3)) & 3);
967        off = (off < size) ? off : size;
968
969        for (i = 0; i < off; i++) {
970          s0 = tab[(*sp)];
971          *dp++ = (s0 >> 8);
972          *dp++ = (s0 & 0xFF);
973          size--; sp++;
974        }
975
976        if (size > 0) {
977          mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(sp, dp, size, tab);
978        }
979      }
980
981      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
982      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
983    }
984  }
985}
986
987/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes three output bytes per
 * sample.  Variant in which the source is read as whole 32-bit words
 * starting at src itself.
 *
 *   src    source row; read through a mlib_u32 pointer, so it is expected
 *          to be 4-byte aligned
 *   dst    destination row; written through a mlib_f32 pointer, so it is
 *          expected to be 4-byte aligned; receives 3 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  lookup table with one 8-byte entry per sample value (entry v is
 *          at byte offset 8 * v); only the first three bytes of an entry
 *          reach dst
 *
 * The most significant byte of each source word is taken as the first
 * sample, i.e. big-endian byte order is assumed.
 */
void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(const mlib_u8  *src,
                                             mlib_u8        *dst,
                                             mlib_s32       xsize,
                                             const mlib_d64 *table)
{
  mlib_u8  *sp;            /* pointer to source data */
  mlib_u32 *sa;            /* aligned pointer to source data */
  mlib_u32 s0;             /* current source word (four samples) */
  mlib_u8  *dl;            /* pointer to start of destination */
  mlib_f32 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3; /* table entries of the four samples */
  mlib_d64 acc0, acc1;     /* packing registers */
  mlib_s32 i;              /* number of samples already converted */
  mlib_u8  *ptr;           /* table entry viewed as bytes (tail loop) */

  dl   =  dst;
  dp   = (mlib_f32 *) dl;
  sp = (void *)src;
  sa = (mlib_u32*)sp;

  /* alignment offset 3: each vis_faligndata below shifts by three bytes */
  vis_alignaddr((void *) 0, 3);

  i = 0;

  if (xsize >= 4) {

    /* preload the first word; the loop always works one word ahead */
    s0 = *sa++;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
      /* (byte << 3) is the byte offset of that sample's 8-byte table entry */
      t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 21) & 0x7F8 ));
      t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
      t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
      t3 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
      /* gather the leading three bytes of t0..t3 into 12 contiguous bytes */
      acc0 = vis_faligndata(t0, t0);
      acc0 = vis_faligndata(acc0, t1);
      acc1 = vis_faligndata(acc0, acc0);
      acc0 = vis_faligndata(acc0, t2);
      acc1 = vis_faligndata(acc1, acc0);
      acc0 = vis_faligndata(acc0, t3);
      /* fetch the next word before storing the current result */
      s0 = *sa++;
      dp[0] = vis_read_lo(acc1);
      dp[1] = vis_read_hi(acc0);
      dp[2] = vis_read_lo(acc0);
    }

    /* drain the pipeline: convert the last preloaded word */
    t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 21) & 0x7F8 ));
    t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
    t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
    t3 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
    acc0 = vis_faligndata(t0, t0);
    acc0 = vis_faligndata(acc0, t1);
    acc1 = vis_faligndata(acc0, acc0);
    acc0 = vis_faligndata(acc0, t2);
    acc1 = vis_faligndata(acc1, acc0);
    acc0 = vis_faligndata(acc0, t3);
    dp[0] = vis_read_lo(acc1);
    dp[1] = vis_read_hi(acc0);
    dp[2] = vis_read_lo(acc0);
    dp += 3;
    i += 4;
  }

  dl = (mlib_u8*)dp;

  /* remaining samples (all of them when xsize < 4), one byte triple each */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_u8*)(table + src[i]);
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3;
  }
}
1062
1063/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes three output bytes per
 * sample.  Variant in which the source words are read starting at src - 1,
 * so each group of four samples is the low three bytes of one word followed
 * by the high byte of the next.
 *
 *   src    source row; src - 1 is read through a mlib_u32 pointer, so it is
 *          expected to be 4-byte aligned
 *   dst    destination row; written through a mlib_f32 pointer, so it is
 *          expected to be 4-byte aligned; receives 3 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  lookup table with one 8-byte entry per sample value (entry v is
 *          at byte offset 8 * v); only the first three bytes of an entry
 *          reach dst
 *
 * The word at src - 1 is loaded even when xsize < 4, and whole-word loads
 * can cover bytes next to the row that share a word with row data.
 * Big-endian byte order is assumed when samples are extracted from words.
 */
void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(const mlib_u8  *src,
                                             mlib_u8        *dst,
                                             mlib_s32       xsize,
                                             const mlib_d64 *table)
{
  mlib_u8  *sp;            /* pointer to source data */
  mlib_u32 *sa;            /* aligned pointer to source data */
  mlib_u32 s0, s1;         /* current and following source words */
  mlib_u8  *dl;            /* pointer to start of destination */
  mlib_f32 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3; /* table entries of the four samples */
  mlib_d64 acc0, acc1;     /* packing registers */
  mlib_s32 i;              /* number of samples already converted */
  mlib_u8  *ptr;           /* table entry viewed as bytes (tail loop) */

  dl   =  dst;
  dp   = (mlib_f32 *) dl;
  sp = (void *)src;
  sa = (mlib_u32*)(sp - 1);

  /* alignment offset 3: each vis_faligndata below shifts by three bytes */
  vis_alignaddr((void *) 0, 3);

  i = 0;
  /* word holding the byte before src and the first three samples */
  s0 = *sa++;

  if (xsize >= 4) {

    s1 = *sa++;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
      /* samples 0..2 come from bytes 1..3 of s0, sample 3 from byte 0 of s1 */
      t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
      t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
      t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
      t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
      /* gather the leading three bytes of t0..t3 into 12 contiguous bytes */
      acc0 = vis_faligndata(t0, t0);
      acc0 = vis_faligndata(acc0, t1);
      acc1 = vis_faligndata(acc0, acc0);
      acc0 = vis_faligndata(acc0, t2);
      acc1 = vis_faligndata(acc1, acc0);
      acc0 = vis_faligndata(acc0, t3);
      /* slide the two-word window and fetch ahead before storing */
      s0 = s1;
      s1 = *sa++;
      dp[0] = vis_read_lo(acc1);
      dp[1] = vis_read_hi(acc0);
      dp[2] = vis_read_lo(acc0);
    }

    /* drain the pipeline: convert the last preloaded group */
    t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
    t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
    t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
    t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
    acc0 = vis_faligndata(t0, t0);
    acc0 = vis_faligndata(acc0, t1);
    acc1 = vis_faligndata(acc0, acc0);
    acc0 = vis_faligndata(acc0, t2);
    acc1 = vis_faligndata(acc1, acc0);
    acc0 = vis_faligndata(acc0, t3);
    dp[0] = vis_read_lo(acc1);
    dp[1] = vis_read_hi(acc0);
    dp[2] = vis_read_lo(acc0);
    dp += 3;
    i += 4;
  }

  dl = (mlib_u8*)dp;

  /* remaining samples (all of them when xsize < 4), one byte triple each */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_u8*)(table + src[i]);
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3;
  }
}
1140
1141/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes three output bytes per
 * sample.  Variant in which the source words are read starting at src - 2,
 * so each group of four samples is the low two bytes of one word followed
 * by the high two bytes of the next.
 *
 *   src    source row; src - 2 is read through a mlib_u32 pointer, so it is
 *          expected to be 4-byte aligned
 *   dst    destination row; written through a mlib_f32 pointer, so it is
 *          expected to be 4-byte aligned; receives 3 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  lookup table with one 8-byte entry per sample value (entry v is
 *          at byte offset 8 * v); only the first three bytes of an entry
 *          reach dst
 *
 * The word at src - 2 is loaded even when xsize < 4, and whole-word loads
 * can cover bytes next to the row that share a word with row data.
 * Big-endian byte order is assumed when samples are extracted from words.
 */
void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(const mlib_u8  *src,
                                             mlib_u8        *dst,
                                             mlib_s32       xsize,
                                             const mlib_d64 *table)
{
  mlib_u8  *sp;            /* pointer to source data */
  mlib_u32 *sa;            /* aligned pointer to source data */
  mlib_u32 s0, s1;         /* current and following source words */
  mlib_u8  *dl;            /* pointer to start of destination */
  mlib_f32 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3; /* table entries of the four samples */
  mlib_d64 acc0, acc1;     /* packing registers */
  mlib_s32 i;              /* number of samples already converted */
  mlib_u8  *ptr;           /* table entry viewed as bytes (tail loop) */

  dl   =  dst;
  dp   = (mlib_f32 *) dl;
  sp = (void *)src;
  sa = (mlib_u32*)(sp - 2);

  /* alignment offset 3: each vis_faligndata below shifts by three bytes */
  vis_alignaddr((void *) 0, 3);

  i = 0;
  /* word holding the two bytes before src and the first two samples */
  s0 = *sa++;

  if (xsize >= 4) {

    s1 = *sa++;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
      /* samples 0..1 come from bytes 2..3 of s0, samples 2..3 from bytes 0..1 of s1 */
      t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
      t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
      t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
      t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
      /* gather the leading three bytes of t0..t3 into 12 contiguous bytes */
      acc0 = vis_faligndata(t0, t0);
      acc0 = vis_faligndata(acc0, t1);
      acc1 = vis_faligndata(acc0, acc0);
      acc0 = vis_faligndata(acc0, t2);
      acc1 = vis_faligndata(acc1, acc0);
      acc0 = vis_faligndata(acc0, t3);
      /* slide the two-word window and fetch ahead before storing */
      s0 = s1;
      s1 = *sa++;
      dp[0] = vis_read_lo(acc1);
      dp[1] = vis_read_hi(acc0);
      dp[2] = vis_read_lo(acc0);
    }

    /* drain the pipeline: convert the last preloaded group */
    t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
    t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
    t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
    t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
    acc0 = vis_faligndata(t0, t0);
    acc0 = vis_faligndata(acc0, t1);
    acc1 = vis_faligndata(acc0, acc0);
    acc0 = vis_faligndata(acc0, t2);
    acc1 = vis_faligndata(acc1, acc0);
    acc0 = vis_faligndata(acc0, t3);
    dp[0] = vis_read_lo(acc1);
    dp[1] = vis_read_hi(acc0);
    dp[2] = vis_read_lo(acc0);
    dp += 3;
    i += 4;
  }

  dl = (mlib_u8*)dp;

  /* remaining samples (all of them when xsize < 4), one byte triple each */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_u8*)(table + src[i]);
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3;
  }
}
1218
1219/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes three output bytes per
 * sample.  Variant in which the source words are read starting at src - 3,
 * so each group of four samples is the low byte of one word followed by the
 * high three bytes of the next.
 *
 *   src    source row; src - 3 is read through a mlib_u32 pointer, so it is
 *          expected to be 4-byte aligned
 *   dst    destination row; written through a mlib_f32 pointer, so it is
 *          expected to be 4-byte aligned; receives 3 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  lookup table with one 8-byte entry per sample value (entry v is
 *          at byte offset 8 * v); only the first three bytes of an entry
 *          reach dst
 *
 * The word at src - 3 is loaded even when xsize < 4, and whole-word loads
 * can cover bytes next to the row that share a word with row data.
 * Big-endian byte order is assumed when samples are extracted from words.
 */
void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(const mlib_u8  *src,
                                             mlib_u8        *dst,
                                             mlib_s32       xsize,
                                             const mlib_d64 *table)
{
  mlib_u8  *sp;            /* pointer to source data */
  mlib_u32 *sa;            /* aligned pointer to source data */
  mlib_u32 s0, s1;         /* current and following source words */
  mlib_u8  *dl;            /* pointer to start of destination */
  mlib_f32 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3; /* table entries of the four samples */
  mlib_d64 acc0, acc1;     /* packing registers */
  mlib_s32 i;              /* number of samples already converted */
  mlib_u8  *ptr;           /* table entry viewed as bytes (tail loop) */

  dl   =  dst;
  dp   = (mlib_f32 *) dl;
  sp = (void *)src;
  sa = (mlib_u32*)(sp - 3);

  /* alignment offset 3: each vis_faligndata below shifts by three bytes */
  vis_alignaddr((void *) 0, 3);

  i = 0;
  /* word holding the three bytes before src and the first sample */
  s0 = *sa++;

  if (xsize >= 4) {

    s1 = *sa++;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
      /* sample 0 comes from byte 3 of s0, samples 1..3 from bytes 0..2 of s1 */
      t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
      t1 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
      t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
      t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 5) & 0x7F8 ));
      /* gather the leading three bytes of t0..t3 into 12 contiguous bytes */
      acc0 = vis_faligndata(t0, t0);
      acc0 = vis_faligndata(acc0, t1);
      acc1 = vis_faligndata(acc0, acc0);
      acc0 = vis_faligndata(acc0, t2);
      acc1 = vis_faligndata(acc1, acc0);
      acc0 = vis_faligndata(acc0, t3);
      /* slide the two-word window and fetch ahead before storing */
      s0 = s1;
      s1 = *sa++;
      dp[0] = vis_read_lo(acc1);
      dp[1] = vis_read_hi(acc0);
      dp[2] = vis_read_lo(acc0);
    }

    /* drain the pipeline: convert the last preloaded group */
    t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
    t1 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
    t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
    t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 5) & 0x7F8 ));
    acc0 = vis_faligndata(t0, t0);
    acc0 = vis_faligndata(acc0, t1);
    acc1 = vis_faligndata(acc0, acc0);
    acc0 = vis_faligndata(acc0, t2);
    acc1 = vis_faligndata(acc1, acc0);
    acc0 = vis_faligndata(acc0, t3);
    dp[0] = vis_read_lo(acc1);
    dp[1] = vis_read_hi(acc0);
    dp[2] = vis_read_lo(acc0);
    dp += 3;
    i += 4;
  }

  dl = (mlib_u8*)dp;

  /* remaining samples (all of them when xsize < 4), one byte triple each */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_u8*)(table + src[i]);
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3;
  }
}
1296
1297/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes three output bytes per
 * sample, reading the three per-channel byte tables directly.
 *
 *   src    source row, read one byte at a time
 *   dst    destination row; written through a mlib_d64 pointer, so it is
 *          expected to be 8-byte aligned; receives 3 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  table[0], table[1], table[2] are the byte tables whose values
 *          are interleaved per sample
 *
 * Samples are converted eight at a time into three 8-byte stores; any
 * remainder (or the whole row when xsize < 8) goes through the byte loop at
 * the end.
 *
 * NOTE(review): VIS_LD_U8_I is defined outside this block; it presumably
 * loads one table byte into the low byte of a 64-bit register, which is what
 * the byte-insertion scheme below relies on - confirm against vis_proto.h.
 */
void mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(const mlib_u8 *src,
                                           mlib_u8       *dst,
                                           mlib_s32      xsize,
                                           const mlib_u8 **table)
{
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u8  *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;       /* looked-up bytes */
  mlib_d64 t3, t4, t5;       /* looked-up bytes */
  mlib_d64 t6, t7;           /* looked-up bytes */
  mlib_d64 acc0, acc1, acc2; /* the three output words of one group */
  mlib_s32 i;                /* number of samples already converted */
  const mlib_u8  *tab0 = table[0];
  const mlib_u8  *tab1 = table[1];
  const mlib_u8  *tab2 = table[2];
  mlib_u32 s00, s01, s02, s03; /* samples 0..3 of the current group */
  mlib_u32 s10, s11, s12, s13; /* samples 4..7 of the current group */

  sp   = (void *)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /*
   * alignment offset 7: vis_faligndata(t, acc) puts the last byte of t in
   * front of the first seven bytes of acc
   */
  vis_alignaddr((void *) 0, 7);

  i = 0;

  if (xsize >= 8) {

    /* preload the first group; the loop always works one group ahead */
    s00 = sp[0];
    s01 = sp[1];
    s02 = sp[2];
    s03 = sp[3];
    s10 = sp[4];
    s11 = sp[5];
    s12 = sp[6];
    s13 = sp[7];
    sp += 8;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 16; i+=8, sp+=8) {
      /*
       * output bytes 0..7; bytes are inserted last-to-first, so after eight
       * insertions the previous contents of acc0 are gone and t0 is first
       */
      t7 = VIS_LD_U8_I(tab1, s02);
      t6 = VIS_LD_U8_I(tab0, s02);
      t5 = VIS_LD_U8_I(tab2, s01);
      t4 = VIS_LD_U8_I(tab1, s01);
      t3 = VIS_LD_U8_I(tab0, s01);
      t2 = VIS_LD_U8_I(tab2, s00);
      t1 = VIS_LD_U8_I(tab1, s00);
      t0 = VIS_LD_U8_I(tab0, s00);
      acc0 = vis_faligndata(t7, acc0);
      acc0 = vis_faligndata(t6, acc0);
      acc0 = vis_faligndata(t5, acc0);
      acc0 = vis_faligndata(t4, acc0);
      acc0 = vis_faligndata(t3, acc0);
      acc0 = vis_faligndata(t2, acc0);
      acc0 = vis_faligndata(t1, acc0);
      acc0 = vis_faligndata(t0, acc0);
      /* output bytes 8..15 */
      t7 = VIS_LD_U8_I(tab0, s11);
      t6 = VIS_LD_U8_I(tab2, s10);
      t5 = VIS_LD_U8_I(tab1, s10);
      t4 = VIS_LD_U8_I(tab0, s10);
      t3 = VIS_LD_U8_I(tab2, s03);
      t2 = VIS_LD_U8_I(tab1, s03);
      t1 = VIS_LD_U8_I(tab0, s03);
      t0 = VIS_LD_U8_I(tab2, s02);
      acc1 = vis_faligndata(t7, acc1);
      acc1 = vis_faligndata(t6, acc1);
      acc1 = vis_faligndata(t5, acc1);
      acc1 = vis_faligndata(t4, acc1);
      acc1 = vis_faligndata(t3, acc1);
      acc1 = vis_faligndata(t2, acc1);
      acc1 = vis_faligndata(t1, acc1);
      acc1 = vis_faligndata(t0, acc1);
      /* output bytes 16..23 */
      t7 = VIS_LD_U8_I(tab2, s13);
      t6 = VIS_LD_U8_I(tab1, s13);
      t5 = VIS_LD_U8_I(tab0, s13);
      t4 = VIS_LD_U8_I(tab2, s12);
      t3 = VIS_LD_U8_I(tab1, s12);
      t2 = VIS_LD_U8_I(tab0, s12);
      t1 = VIS_LD_U8_I(tab2, s11);
      t0 = VIS_LD_U8_I(tab1, s11);
      acc2 = vis_faligndata(t7, acc2);
      acc2 = vis_faligndata(t6, acc2);
      acc2 = vis_faligndata(t5, acc2);
      acc2 = vis_faligndata(t4, acc2);
      acc2 = vis_faligndata(t3, acc2);
      acc2 = vis_faligndata(t2, acc2);
      acc2 = vis_faligndata(t1, acc2);
      acc2 = vis_faligndata(t0, acc2);
      /* fetch the next group before storing the current one */
      s00 = sp[0];
      s01 = sp[1];
      s02 = sp[2];
      s03 = sp[3];
      s10 = sp[4];
      s11 = sp[5];
      s12 = sp[6];
      s13 = sp[7];
      *dp++ = acc0;
      *dp++ = acc1;
      *dp++ = acc2;
    }

    /* drain the pipeline: convert the last preloaded group */
    t7 = VIS_LD_U8_I(tab1, s02);
    t6 = VIS_LD_U8_I(tab0, s02);
    t5 = VIS_LD_U8_I(tab2, s01);
    t4 = VIS_LD_U8_I(tab1, s01);
    t3 = VIS_LD_U8_I(tab0, s01);
    t2 = VIS_LD_U8_I(tab2, s00);
    t1 = VIS_LD_U8_I(tab1, s00);
    t0 = VIS_LD_U8_I(tab0, s00);
    acc0 = vis_faligndata(t7, acc0);
    acc0 = vis_faligndata(t6, acc0);
    acc0 = vis_faligndata(t5, acc0);
    acc0 = vis_faligndata(t4, acc0);
    acc0 = vis_faligndata(t3, acc0);
    acc0 = vis_faligndata(t2, acc0);
    acc0 = vis_faligndata(t1, acc0);
    acc0 = vis_faligndata(t0, acc0);
    t7 = VIS_LD_U8_I(tab0, s11);
    t6 = VIS_LD_U8_I(tab2, s10);
    t5 = VIS_LD_U8_I(tab1, s10);
    t4 = VIS_LD_U8_I(tab0, s10);
    t3 = VIS_LD_U8_I(tab2, s03);
    t2 = VIS_LD_U8_I(tab1, s03);
    t1 = VIS_LD_U8_I(tab0, s03);
    t0 = VIS_LD_U8_I(tab2, s02);
    acc1 = vis_faligndata(t7, acc1);
    acc1 = vis_faligndata(t6, acc1);
    acc1 = vis_faligndata(t5, acc1);
    acc1 = vis_faligndata(t4, acc1);
    acc1 = vis_faligndata(t3, acc1);
    acc1 = vis_faligndata(t2, acc1);
    acc1 = vis_faligndata(t1, acc1);
    acc1 = vis_faligndata(t0, acc1);
    t7 = VIS_LD_U8_I(tab2, s13);
    t6 = VIS_LD_U8_I(tab1, s13);
    t5 = VIS_LD_U8_I(tab0, s13);
    t4 = VIS_LD_U8_I(tab2, s12);
    t3 = VIS_LD_U8_I(tab1, s12);
    t2 = VIS_LD_U8_I(tab0, s12);
    t1 = VIS_LD_U8_I(tab2, s11);
    t0 = VIS_LD_U8_I(tab1, s11);
    acc2 = vis_faligndata(t7, acc2);
    acc2 = vis_faligndata(t6, acc2);
    acc2 = vis_faligndata(t5, acc2);
    acc2 = vis_faligndata(t4, acc2);
    acc2 = vis_faligndata(t3, acc2);
    acc2 = vis_faligndata(t2, acc2);
    acc2 = vis_faligndata(t1, acc2);
    acc2 = vis_faligndata(t0, acc2);
    *dp++ = acc0;
    *dp++ = acc1;
    *dp++ = acc2;
    i += 8;
  }

  dl = (mlib_u8*)dp;

  /* remaining samples (all of them when xsize < 8), one byte triple each */
#pragma pipeloop(0)
  for (; i < xsize; i++) {
    s00 = sp[0];
    dl[0] = tab0[s00];
    dl[1] = tab1[s00];
    dl[2] = tab2[s00];
    dl += 3; sp ++;
  }
}
1465
1466/***************************************************************/
1467void mlib_v_ImageLookUpSI_U8_U8_3(const mlib_u8 *src,
1468                                  mlib_s32      slb,
1469                                  mlib_u8       *dst,
1470                                  mlib_s32      dlb,
1471                                  mlib_s32      xsize,
1472                                  mlib_s32      ysize,
1473                                  const mlib_u8 **table)
1474{
1475  if ((xsize * ysize) < 650) {
1476    mlib_u8  *sl;
1477    mlib_u8  *dl;
1478    mlib_s32 i, j;
1479    const mlib_u8  *tab0 = table[0];
1480    const mlib_u8  *tab1 = table[1];
1481    const mlib_u8  *tab2 = table[2];
1482
1483    sl = (void *)src;
1484    dl = dst;
1485
1486    /* row loop */
1487    for (j = 0; j < ysize; j ++) {
1488      mlib_u8 *sp = sl;
1489      mlib_u8 *dp = dl;
1490      mlib_s32 off, s0, size = xsize;
1491
1492      off = (mlib_addr)dp & 7;
1493      off = (off * 5) & 7;
1494      off = (off < size) ? off : size;
1495
1496      for (i = 0; i < off; i++) {
1497        s0 = *sp++;
1498        *dp++ = tab0[s0];
1499        *dp++ = tab1[s0];
1500        *dp++ = tab2[s0];
1501        size--;
1502      }
1503
1504      if (size > 0) {
1505        mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(sp, dp, size, table);
1506      }
1507
1508      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1509      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
1510    }
1511
1512  } else {
1513    mlib_u8  *sl;
1514    mlib_u8  *dl;
1515    mlib_u32 tab[512];
1516    const mlib_u8  *tab0 = table[0];
1517    const mlib_u8  *tab1 = table[1];
1518    const mlib_u8  *tab2 = table[2];
1519    mlib_s32 i, j;
1520    mlib_u32 s0, s1, s2, s3;
1521
1522    s0 = tab0[0];
1523    s1 = tab1[0];
1524    s2 = tab2[0];
1525    for (i = 1; i < 256; i++) {
1526      s3 = (s0 << 24) + (s1 << 16) + (s2 << 8);
1527      s0 = tab0[i];
1528      s1 = tab1[i];
1529      s2 = tab2[i];
1530      tab[2*i-2] = s3;
1531    }
1532
1533    s3 = (s0 << 24) + (s1 << 16) + (s2 << 8);
1534    tab[510] = s3;
1535
1536    sl = (void *)src;
1537    dl = dst;
1538
1539    /* row loop */
1540    for (j = 0; j < ysize; j ++) {
1541      mlib_u8 *sp = sl;
1542      mlib_u8 *dp = dl;
1543      mlib_s32 off, size = xsize;
1544      mlib_u8  *ptr;
1545
1546      off = ((mlib_addr)dp & 3);
1547      off = (off < size) ? off : size;
1548
1549#pragma pipeloop(0)
1550      for (i = 0; i < off; i++) {
1551        ptr = (mlib_u8*)(tab + 2*sp[i]);
1552        dp[0] = ptr[0];
1553        dp[1] = ptr[1];
1554        dp[2] = ptr[2];
1555        dp += 3;
1556      }
1557
1558      size -= off;
1559      sp += off;
1560
1561      if (size > 0) {
1562        off = (mlib_addr)sp & 3;
1563
1564        if (off == 0) {
1565          mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(sp, dp, size, (mlib_d64*)tab);
1566        } else if (off == 1) {
1567          mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(sp, dp, size, (mlib_d64*)tab);
1568        } else if (off == 2) {
1569          mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(sp, dp, size, (mlib_d64*)tab);
1570        } else if (off == 3) {
1571          mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(sp, dp, size, (mlib_d64*)tab);
1572        }
1573      }
1574
1575      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1576      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
1577    }
1578  }
1579}
1580
1581/***************************************************************/
1582void mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(const mlib_u8  *src,
1583                                             mlib_u8        *dst,
1584                                             mlib_s32       xsize,
1585                                             const mlib_f32 *table)
1586{
1587  mlib_u32 *sa;          /* aligned pointer to source data */
1588  mlib_u8  *sp;          /* pointer to source data */
1589  mlib_u32 s0;           /* source data */
1590  mlib_f32 *dp;          /* aligned pointer to destination */
1591  mlib_f32 acc0, acc1;   /* destination data */
1592  mlib_f32 acc2, acc3;   /* destination data */
1593  mlib_s32 i;            /* loop variable */
1594  mlib_u32 s00, s01, s02, s03;
1595
1596  sa   = (mlib_u32*)src;
1597  dp   = (mlib_f32 *) dst;
1598
1599  i = 0;
1600
1601  if (xsize >= 4) {
1602
1603    s0 = *sa++;
1604    s00 = (s0 >> 22) & 0x3FC;
1605    s01 = (s0 >> 14) & 0x3FC;
1606
1607#pragma pipeloop(0)
1608    for(i = 0; i <= xsize - 8; i+=4, dp += 4) {
1609      s02 = (s0 >> 6) & 0x3FC;
1610      s03 = (s0 << 2) & 0x3FC;
1611      acc0 = *(mlib_f32*)((mlib_u8*)table + s00);
1612      acc1 = *(mlib_f32*)((mlib_u8*)table + s01);
1613      acc2 = *(mlib_f32*)((mlib_u8*)table + s02);
1614      acc3 = *(mlib_f32*)((mlib_u8*)table + s03);
1615      s0 = *sa++;
1616      s00 = (s0 >> 22) & 0x3FC;
1617      s01 = (s0 >> 14) & 0x3FC;
1618      dp[0] = acc0;
1619      dp[1] = acc1;
1620      dp[2] = acc2;
1621      dp[3] = acc3;
1622    }
1623
1624    s02 = (s0 >> 6) & 0x3FC;
1625    s03 = (s0 << 2) & 0x3FC;
1626    acc0 = *(mlib_f32*)((mlib_u8*)table + s00);
1627    acc1 = *(mlib_f32*)((mlib_u8*)table + s01);
1628    acc2 = *(mlib_f32*)((mlib_u8*)table + s02);
1629    acc3 = *(mlib_f32*)((mlib_u8*)table + s03);
1630    dp[0] = acc0;
1631    dp[1] = acc1;
1632    dp[2] = acc2;
1633    dp[3] = acc3;
1634    dp += 4;
1635    i += 4;
1636  }
1637
1638  sp = (mlib_u8*)sa;
1639
1640  if ( i <= xsize - 2) {
1641    *dp++ = table[sp[0]];
1642    *dp++ = table[sp[1]];
1643    i+=2; sp += 2;
1644  }
1645
1646  if ( i < xsize) *dp = table[sp[0]];
1647}
1648
1649/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes one 4-byte table entry per
 * sample to a destination that need not be 8-byte aligned.  Entries are
 * combined in pairs into 8-byte values, shifted with vis_faligndata to the
 * destination's offset and stored to 8-byte aligned addresses; the first and
 * last stores are partial stores limited by an edge mask to [dst, dend].
 *
 *   src    source row; read both bytewise and through a mlib_u32 pointer,
 *          so it is expected to be 4-byte aligned
 *   dst    destination row; receives 4 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  256-entry table of 4-byte values, indexed by sample value
 *
 * Big-endian byte order is assumed when samples are extracted from words.
 *
 * NOTE(review): sp[0] and sp[1] are read at the start regardless of xsize,
 * and the trailing partial store also reads two samples whose entries may be
 * only partly (or not at all) stored - the caller presumably guarantees
 * those bytes are readable; confirm.
 */
void mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(const mlib_u8  *src,
                                              mlib_u8        *dst,
                                              mlib_s32       xsize,
                                              const mlib_f32 *table)
{
  mlib_u32 *sa;              /* aligned pointer to source data */
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u32 s0;               /* current source word (four samples) */
  mlib_u8  *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 acc0, acc1, acc2; /* pairs of table entries */
  mlib_s32 i;                /* samples converted by the word loop and after */
  mlib_u8  *dend;            /* pointer to the last destination byte */
  mlib_s32 emask;            /* edge mask */
  mlib_s32 off;              /* aligned address minus dst (zero or negative) */
  mlib_u32 s00, s01, s02, s03; /* byte offsets of four table entries */

  sa   = (mlib_u32*)src;
  sp = (void *)src;
  dl = dst;
  dend = dl + (xsize << 2) - 1;
  /* round dst down to an 8-byte boundary and set up the matching data shift */
  dp   = (mlib_d64 *) ((mlib_addr) dl & (~7));
  off  = (mlib_addr) dp - (mlib_addr) dl;
  vis_alignaddr(dp, off);

  /* first, partial store: only bytes from dst onwards are written */
  emask = vis_edge8(dl, dend);
  acc0 = vis_freg_pair(table[sp[0]], table[sp[1]]);
  vis_pst_8(vis_faligndata(acc0, acc0), dp++, emask);
  sp += 2;

  xsize -= 2;

  /* second pair, so that a whole source word has been consumed */
  if (xsize >= 2) {
    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
    *dp++ = vis_faligndata(acc0, acc1);
    acc0 = acc1;
    sp += 2; xsize -= 2;
  }

  /* skip the source word covered by the two pairs above */
  sa++;

  i = 0;

  if (xsize >= 4) {

    /* preload the next word; the loop always works one word ahead */
    s0 = *sa++;
    s00 = (s0 >> 22) & 0x3FC;
    s01 = (s0 >> 14) & 0x3FC;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 2) {
      s02 = (s0 >> 6) & 0x3FC;
      s03 = (s0 << 2) & 0x3FC;
      acc1 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s00),
                           *(mlib_f32*)((mlib_u8*)table + s01));
      acc2 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s02),
                           *(mlib_f32*)((mlib_u8*)table + s03));
      s0 = *sa++;
      s00 = (s0 >> 22) & 0x3FC;
      s01 = (s0 >> 14) & 0x3FC;
      /* each store joins the tail of the previous pair with the next one */
      dp[0] = vis_faligndata(acc0, acc1);
      dp[1] = vis_faligndata(acc1, acc2);
      acc0 = acc2;
    }

    /* drain the pipeline: convert the last preloaded word */
    s02 = (s0 >> 6) & 0x3FC;
    s03 = (s0 << 2) & 0x3FC;
    acc1 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s00),
                         *(mlib_f32*)((mlib_u8*)table + s01));
    acc2 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s02),
                         *(mlib_f32*)((mlib_u8*)table + s03));
    dp[0] = vis_faligndata(acc0, acc1);
    dp[1] = vis_faligndata(acc1, acc2);
    acc0 = acc2;
    sp = (mlib_u8*)sa;
    dp += 2;
    i += 4;
  }

  /* one more full pair, if at least two samples remain */
  if ( i <= xsize - 2) {
    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
    *dp++ = vis_faligndata(acc0, acc1);
    acc0 = acc1;
    i+=2; sp += 2;
  }

  /* trailing bytes: partial stores limited to dend */
  if ((mlib_addr) dp <= (mlib_addr) dend) {
    emask = vis_edge8(dp, dend);
    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
    vis_pst_8(vis_faligndata(acc0, acc1), dp++, emask);
  }

  /* acc1 is always set here: this test can only pass if the previous one did */
  if ((mlib_addr) dp <= (mlib_addr) dend) {
    emask = vis_edge8(dp, dend);
    vis_pst_8(vis_faligndata(acc1, acc1), dp++, emask);
  }
}
1747
1748/***************************************************************/
/*
 * Looks up one row of 8-bit samples and writes four output bytes per sample
 * (table[0][v], table[1][v], table[2][v], table[3][v]), reading the four
 * per-channel byte tables directly.
 *
 *   src    source row, read one byte at a time
 *   dst    destination row; written through a mlib_d64 pointer, so it is
 *          expected to be 8-byte aligned; receives 4 * xsize bytes
 *   xsize  number of source samples in the row
 *   table  table[0..3] are the per-channel byte tables
 *
 * Samples are converted two at a time into one 8-byte store; an odd last
 * sample is written with a single 4-byte store.
 *
 * NOTE(review): VIS_LD_U8_I is defined outside this block; it presumably
 * loads one table byte into the low byte of a 64-bit register, which is what
 * the byte-insertion scheme below relies on - confirm against vis_proto.h.
 */
void mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(const mlib_u8 *src,
                                                   mlib_u8       *dst,
                                                   mlib_s32      xsize,
                                                   const mlib_u8 **table)
{
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u32 s0, s1;           /* the two samples of the current pair */
  mlib_u8 *dl;               /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2;       /* looked-up bytes */
  mlib_d64 t3, t4, t5;       /* looked-up bytes */
  mlib_d64 t6, t7, acc;      /* looked-up bytes and the output word */
  mlib_s32 i;                /* loop variable */
  const mlib_u8  *tab0 = table[0];
  const mlib_u8  *tab1 = table[1];
  const mlib_u8  *tab2 = table[2];
  const mlib_u8  *tab3 = table[3];

  sp   = (void *)src;
  dl   = dst;
  dp   = (mlib_d64 *) dl;

  /*
   * alignment offset 7: vis_faligndata(t, acc) puts the last byte of t in
   * front of the first seven bytes of acc
   */
  vis_alignaddr((void *) 0, 7);

  if (xsize >= 2) {

    /* preload the first pair; the loop always works one pair ahead */
    s0 = sp[0];
    s1 = sp[1];
    sp += 2;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
      t7 = VIS_LD_U8_I(tab3, s1);
      t6 = VIS_LD_U8_I(tab2, s1);
      t5 = VIS_LD_U8_I(tab1, s1);
      t4 = VIS_LD_U8_I(tab0, s1);
      t3 = VIS_LD_U8_I(tab3, s0);
      t2 = VIS_LD_U8_I(tab2, s0);
      t1 = VIS_LD_U8_I(tab1, s0);
      t0 = VIS_LD_U8_I(tab0, s0);
      /*
       * bytes are inserted last-to-first, so after eight insertions the
       * previous contents of acc are gone and t0 is the first byte
       */
      acc = vis_faligndata(t7, acc);
      acc = vis_faligndata(t6, acc);
      acc = vis_faligndata(t5, acc);
      acc = vis_faligndata(t4, acc);
      acc = vis_faligndata(t3, acc);
      acc = vis_faligndata(t2, acc);
      acc = vis_faligndata(t1, acc);
      acc = vis_faligndata(t0, acc);
      /* fetch the next pair before storing the current one */
      s0 = sp[0];
      s1 = sp[1];
      *dp++ = acc;
    }

    /* drain the pipeline: convert the last preloaded pair */
    t7 = VIS_LD_U8_I(tab3, s1);
    t6 = VIS_LD_U8_I(tab2, s1);
    t5 = VIS_LD_U8_I(tab1, s1);
    t4 = VIS_LD_U8_I(tab0, s1);
    t3 = VIS_LD_U8_I(tab3, s0);
    t2 = VIS_LD_U8_I(tab2, s0);
    t1 = VIS_LD_U8_I(tab1, s0);
    t0 = VIS_LD_U8_I(tab0, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    acc = vis_faligndata(t3, acc);
    acc = vis_faligndata(t2, acc);
    acc = vis_faligndata(t1, acc);
    acc = vis_faligndata(t0, acc);
    *dp++ = acc;
  }

  /* odd sample count: four insertions fill the high half, which is stored */
  if ((xsize & 1) != 0) {
    s0 = sp[0];
    t7 = VIS_LD_U8_I(tab3, s0);
    t6 = VIS_LD_U8_I(tab2, s0);
    t5 = VIS_LD_U8_I(tab1, s0);
    t4 = VIS_LD_U8_I(tab0, s0);
    acc = vis_faligndata(t7, acc);
    acc = vis_faligndata(t6, acc);
    acc = vis_faligndata(t5, acc);
    acc = vis_faligndata(t4, acc);
    *(mlib_f32*)dp = vis_read_hi(acc);
  }
}
1834
1835/***************************************************************/
1836void mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(const mlib_u8 *src,
1837                                                   mlib_u8       *dst,
1838                                                   mlib_s32      xsize,
1839                                                   const mlib_u8 **table)
1840{
1841  mlib_u8  *sp;              /* pointer to source data */
1842  mlib_u32 s0, s1, s2;       /* source data */
1843  mlib_u8  *dl;              /* pointer to start of destination */
1844  mlib_d64 *dp;              /* aligned pointer to destination */
1845  mlib_d64 t0, t1, t2;       /* destination data */
1846  mlib_d64 t3, t4, t5;       /* destination data */
1847  mlib_d64 t6, t7, acc;      /* destination data */
1848  mlib_s32 i;                /* loop variable */
1849  const mlib_u8  *tab0 = table[0];
1850  const mlib_u8  *tab1 = table[1];
1851  const mlib_u8  *tab2 = table[2];
1852  const mlib_u8  *tab3 = table[3];
1853
1854  sp   = (void *)src;
1855  dl   = dst;
1856  dp   = (mlib_d64 *) dl;
1857
1858  vis_alignaddr((void *) 0, 7);
1859
1860  s0 = *sp++;
1861
1862  if (xsize >= 2) {
1863
1864    s1 = sp[0];
1865    s2 = sp[1];
1866    sp += 2;
1867
1868#pragma pipeloop(0)
1869    for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
1870      t7 = VIS_LD_U8_I(tab0, s2);
1871      t6 = VIS_LD_U8_I(tab3, s1);
1872      t5 = VIS_LD_U8_I(tab2, s1);
1873      t4 = VIS_LD_U8_I(tab1, s1);
1874      t3 = VIS_LD_U8_I(tab0, s1);
1875      t2 = VIS_LD_U8_I(tab3, s0);
1876      t1 = VIS_LD_U8_I(tab2, s0);
1877      t0 = VIS_LD_U8_I(tab1, s0);
1878      acc = vis_faligndata(t7, acc);
1879      acc = vis_faligndata(t6, acc);
1880      acc = vis_faligndata(t5, acc);
1881      acc = vis_faligndata(t4, acc);
1882      acc = vis_faligndata(t3, acc);
1883      acc = vis_faligndata(t2, acc);
1884      acc = vis_faligndata(t1, acc);
1885      acc = vis_faligndata(t0, acc);
1886      s0 = s2;
1887      s1 = sp[0];
1888      s2 = sp[1];
1889      *dp++ = acc;
1890    }
1891
1892    t7 = VIS_LD_U8_I(tab0, s2);
1893    t6 = VIS_LD_U8_I(tab3, s1);
1894    t5 = VIS_LD_U8_I(tab2, s1);
1895    t4 = VIS_LD_U8_I(tab1, s1);
1896    t3 = VIS_LD_U8_I(tab0, s1);
1897    t2 = VIS_LD_U8_I(tab3, s0);
1898    t1 = VIS_LD_U8_I(tab2, s0);
1899    t0 = VIS_LD_U8_I(tab1, s0);
1900    acc = vis_faligndata(t7, acc);
1901    acc = vis_faligndata(t6, acc);
1902    acc = vis_faligndata(t5, acc);
1903    acc = vis_faligndata(t4, acc);
1904    acc = vis_faligndata(t3, acc);
1905    acc = vis_faligndata(t2, acc);
1906    acc = vis_faligndata(t1, acc);
1907    acc = vis_faligndata(t0, acc);
1908    s0 = s2;
1909    *dp++ = acc;
1910  }
1911
1912  dl = (mlib_u8*)dp;
1913
1914  if ((xsize & 1) != 0) {
1915    s1 = sp[0];
1916    t7 = VIS_LD_U8_I(tab0, s1);
1917    t6 = VIS_LD_U8_I(tab3, s0);
1918    t5 = VIS_LD_U8_I(tab2, s0);
1919    t4 = VIS_LD_U8_I(tab1, s0);
1920    acc = vis_faligndata(t7, acc);
1921    acc = vis_faligndata(t6, acc);
1922    acc = vis_faligndata(t5, acc);
1923    acc = vis_faligndata(t4, acc);
1924    *(mlib_f32*)dl = vis_read_hi(acc);
1925    dl += 4;
1926    s0 = s1;
1927  }
1928
1929  dl[0] = tab1[s0];
1930  dl[1] = tab2[s0];
1931  dl[2] = tab3[s0];
1932}
1933
1934/***************************************************************/
1935void mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(const mlib_u8 *src,
1936                                                   mlib_u8       *dst,
1937                                                   mlib_s32      xsize,
1938                                                   const mlib_u8 **table)
1939{
1940  mlib_u8  *sp;              /* pointer to source data */
1941  mlib_u32 s0, s1, s2;       /* source data */
1942  mlib_u8  *dl;              /* pointer to start of destination */
1943  mlib_d64 *dp;              /* aligned pointer to destination */
1944  mlib_d64 t0, t1, t2;       /* destination data */
1945  mlib_d64 t3, t4, t5;       /* destination data */
1946  mlib_d64 t6, t7, acc;      /* destination data */
1947  mlib_s32 i;                /* loop variable */
1948  const mlib_u8  *tab0 = table[0];
1949  const mlib_u8  *tab1 = table[1];
1950  const mlib_u8  *tab2 = table[2];
1951  const mlib_u8  *tab3 = table[3];
1952
1953  sp   = (void *)src;
1954  dl   = dst;
1955  dp   = (mlib_d64 *) dl;
1956
1957  vis_alignaddr((void *) 0, 7);
1958
1959  s0 = *sp++;
1960
1961  if (xsize >= 2) {
1962
1963    s1 = sp[0];
1964    s2 = sp[1];
1965    sp += 2;
1966
1967#pragma pipeloop(0)
1968    for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
1969      t7 = VIS_LD_U8_I(tab1, s2);
1970      t6 = VIS_LD_U8_I(tab0, s2);
1971      t5 = VIS_LD_U8_I(tab3, s1);
1972      t4 = VIS_LD_U8_I(tab2, s1);
1973      t3 = VIS_LD_U8_I(tab1, s1);
1974      t2 = VIS_LD_U8_I(tab0, s1);
1975      t1 = VIS_LD_U8_I(tab3, s0);
1976      t0 = VIS_LD_U8_I(tab2, s0);
1977      acc = vis_faligndata(t7, acc);
1978      acc = vis_faligndata(t6, acc);
1979      acc = vis_faligndata(t5, acc);
1980      acc = vis_faligndata(t4, acc);
1981      acc = vis_faligndata(t3, acc);
1982      acc = vis_faligndata(t2, acc);
1983      acc = vis_faligndata(t1, acc);
1984      acc = vis_faligndata(t0, acc);
1985      s0 = s2;
1986      s1 = sp[0];
1987      s2 = sp[1];
1988      *dp++ = acc;
1989    }
1990
1991    t7 = VIS_LD_U8_I(tab1, s2);
1992    t6 = VIS_LD_U8_I(tab0, s2);
1993    t5 = VIS_LD_U8_I(tab3, s1);
1994    t4 = VIS_LD_U8_I(tab2, s1);
1995    t3 = VIS_LD_U8_I(tab1, s1);
1996    t2 = VIS_LD_U8_I(tab0, s1);
1997    t1 = VIS_LD_U8_I(tab3, s0);
1998    t0 = VIS_LD_U8_I(tab2, s0);
1999    acc = vis_faligndata(t7, acc);
2000    acc = vis_faligndata(t6, acc);
2001    acc = vis_faligndata(t5, acc);
2002    acc = vis_faligndata(t4, acc);
2003    acc = vis_faligndata(t3, acc);
2004    acc = vis_faligndata(t2, acc);
2005    acc = vis_faligndata(t1, acc);
2006    acc = vis_faligndata(t0, acc);
2007    s0 = s2;
2008    *dp++ = acc;
2009  }
2010
2011  dl = (mlib_u8*)dp;
2012
2013  if ((xsize & 1) != 0) {
2014    s1 = sp[0];
2015    t7 = VIS_LD_U8_I(tab1, s1);
2016    t6 = VIS_LD_U8_I(tab0, s1);
2017    t5 = VIS_LD_U8_I(tab3, s0);
2018    t4 = VIS_LD_U8_I(tab2, s0);
2019    acc = vis_faligndata(t7, acc);
2020    acc = vis_faligndata(t6, acc);
2021    acc = vis_faligndata(t5, acc);
2022    acc = vis_faligndata(t4, acc);
2023    *(mlib_f32*)dl = vis_read_hi(acc);
2024    dl += 4;
2025    s0 = s1;
2026  }
2027
2028  dl[0] = tab2[s0];
2029  dl[1] = tab3[s0];
2030}
2031
2032/***************************************************************/
2033void mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(const mlib_u8 *src,
2034                                                   mlib_u8       *dst,
2035                                                   mlib_s32      xsize,
2036                                                   const mlib_u8 **table)
2037{
2038  mlib_u8  *sp;              /* pointer to source data */
2039  mlib_u32 s0, s1, s2;       /* source data */
2040  mlib_u8 *dl;               /* pointer to start of destination */
2041  mlib_d64 *dp;              /* aligned pointer to destination */
2042  mlib_d64 t0, t1, t2;       /* destination data */
2043  mlib_d64 t3, t4, t5;       /* destination data */
2044  mlib_d64 t6, t7, acc;      /* destination data */
2045  mlib_s32 i;                /* loop variable */
2046  const mlib_u8  *tab0 = table[0];
2047  const mlib_u8  *tab1 = table[1];
2048  const mlib_u8  *tab2 = table[2];
2049  const mlib_u8  *tab3 = table[3];
2050
2051  sp   = (void *)src;
2052  dl   = dst;
2053  dp   = (mlib_d64 *) dl;
2054
2055  vis_alignaddr((void *) 0, 7);
2056
2057  s0 = *sp++;
2058
2059  if (xsize >= 2) {
2060
2061    s1 = sp[0];
2062    s2 = sp[1];
2063    sp += 2;
2064
2065#pragma pipeloop(0)
2066    for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
2067      t7 = VIS_LD_U8_I(tab2, s2);
2068      t6 = VIS_LD_U8_I(tab1, s2);
2069      t5 = VIS_LD_U8_I(tab0, s2);
2070      t4 = VIS_LD_U8_I(tab3, s1);
2071      t3 = VIS_LD_U8_I(tab2, s1);
2072      t2 = VIS_LD_U8_I(tab1, s1);
2073      t1 = VIS_LD_U8_I(tab0, s1);
2074      t0 = VIS_LD_U8_I(tab3, s0);
2075      acc = vis_faligndata(t7, acc);
2076      acc = vis_faligndata(t6, acc);
2077      acc = vis_faligndata(t5, acc);
2078      acc = vis_faligndata(t4, acc);
2079      acc = vis_faligndata(t3, acc);
2080      acc = vis_faligndata(t2, acc);
2081      acc = vis_faligndata(t1, acc);
2082      acc = vis_faligndata(t0, acc);
2083      s0 = s2;
2084      s1 = sp[0];
2085      s2 = sp[1];
2086      *dp++ = acc;
2087    }
2088
2089    t7 = VIS_LD_U8_I(tab2, s2);
2090    t6 = VIS_LD_U8_I(tab1, s2);
2091    t5 = VIS_LD_U8_I(tab0, s2);
2092    t4 = VIS_LD_U8_I(tab3, s1);
2093    t3 = VIS_LD_U8_I(tab2, s1);
2094    t2 = VIS_LD_U8_I(tab1, s1);
2095    t1 = VIS_LD_U8_I(tab0, s1);
2096    t0 = VIS_LD_U8_I(tab3, s0);
2097    acc = vis_faligndata(t7, acc);
2098    acc = vis_faligndata(t6, acc);
2099    acc = vis_faligndata(t5, acc);
2100    acc = vis_faligndata(t4, acc);
2101    acc = vis_faligndata(t3, acc);
2102    acc = vis_faligndata(t2, acc);
2103    acc = vis_faligndata(t1, acc);
2104    acc = vis_faligndata(t0, acc);
2105    s0 = s2;
2106    *dp++ = acc;
2107  }
2108
2109  dl = (mlib_u8*)dp;
2110
2111  if ((xsize & 1) != 0) {
2112    s1 = sp[0];
2113    t7 = VIS_LD_U8_I(tab2, s1);
2114    t6 = VIS_LD_U8_I(tab1, s1);
2115    t5 = VIS_LD_U8_I(tab0, s1);
2116    t4 = VIS_LD_U8_I(tab3, s0);
2117    acc = vis_faligndata(t7, acc);
2118    acc = vis_faligndata(t6, acc);
2119    acc = vis_faligndata(t5, acc);
2120    acc = vis_faligndata(t4, acc);
2121    *(mlib_f32*)dl = vis_read_hi(acc);
2122    dl += 4;
2123    s0 = s1;
2124  }
2125
2126  dl[0] = tab3[s0];
2127}
2128
2129/***************************************************************/
2130void mlib_v_ImageLookUpSI_U8_U8_4(const mlib_u8 *src,
2131                                  mlib_s32      slb,
2132                                  mlib_u8       *dst,
2133                                  mlib_s32      dlb,
2134                                  mlib_s32      xsize,
2135                                  mlib_s32      ysize,
2136                                  const mlib_u8 **table)
2137{
2138  if ((xsize * ysize) < 500) {
2139    mlib_u8  *sl;
2140    mlib_u8  *dl;
2141    mlib_s32 j;
2142    const mlib_u8  *tab0 = table[0];
2143    const mlib_u8  *tab1 = table[1];
2144    const mlib_u8  *tab2 = table[2];
2145    const mlib_u8  *tab3 = table[3];
2146
2147    sl = (void *)src;
2148    dl = dst;
2149
2150    /* row loop */
2151    for (j = 0; j < ysize; j ++) {
2152      mlib_u8 *sp = sl;
2153      mlib_u8 *dp = dl;
2154      mlib_s32 off, s0, size = xsize;
2155
2156      off =  (8 - ((mlib_addr)dp & 7)) & 7;
2157
2158      if ((off >= 4) && (size > 0)) {
2159        s0 = *sp++;
2160        *dp++ = tab0[s0];
2161        *dp++ = tab1[s0];
2162        *dp++ = tab2[s0];
2163        *dp++ = tab3[s0];
2164        size--;
2165      }
2166
2167      if (size > 0) {
2168        off =  (4 - ((mlib_addr)dp & 3)) & 3;
2169
2170        if (off == 0) {
2171          mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(sp, dp, size, table);
2172        } else if (off == 1) {
2173          s0 = *sp;
2174          *dp++ = tab0[s0];
2175          size--;
2176          mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(sp, dp, size, table);
2177        } else if (off == 2) {
2178          s0 = *sp;
2179          *dp++ = tab0[s0];
2180          *dp++ = tab1[s0];
2181          size--;
2182          mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(sp, dp, size, table);
2183        } else if (off == 3) {
2184          s0 = *sp;
2185          *dp++ = tab0[s0];
2186          *dp++ = tab1[s0];
2187          *dp++ = tab2[s0];
2188          size--;
2189          mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(sp, dp, size, table);
2190        }
2191      }
2192
2193      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
2194      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
2195    }
2196
2197  } else {
2198    mlib_u8  *sl;
2199    mlib_u8  *dl;
2200    mlib_u32 tab[256];
2201    const mlib_u8  *tab0 = table[0];
2202    const mlib_u8  *tab1 = table[1];
2203    const mlib_u8  *tab2 = table[2];
2204    const mlib_u8  *tab3 = table[3];
2205    mlib_s32 i, j;
2206    mlib_u32 s0, s1, s2, s3, s4;
2207
2208    s0 = tab0[0];
2209    s1 = tab1[0];
2210    s2 = tab2[0];
2211    s3 = tab3[0];
2212    for (i = 1; i < 256; i++) {
2213      s4 = (s0 << 24) + (s1 << 16) + (s2 << 8) + s3;
2214      s0 = tab0[i];
2215      s1 = tab1[i];
2216      s2 = tab2[i];
2217      s3 = tab3[i];
2218      tab[i-1] = s4;
2219    }
2220
2221    s4 = (s0 << 24) + (s1 << 16) + (s2 << 8) + s3;
2222    tab[255] = s4;
2223
2224    sl = (void *)src;
2225    dl = dst;
2226
2227    /* row loop */
2228    for (j = 0; j < ysize; j ++) {
2229      mlib_u8 *sp = sl;
2230      mlib_u8 *dp = dl;
2231      mlib_s32 off, size = xsize;
2232
2233      if (((mlib_addr)dp & 3) == 0) {
2234        off = (4 - (mlib_addr)sp & 3) & 3;
2235
2236        off = (off < size) ? off : size;
2237
2238#pragma pipeloop(0)
2239        for (i = 0; i < off; i++) {
2240          *(mlib_u32*)dp = tab[(*sp)];
2241          dp += 4; sp++;
2242        }
2243
2244        size -= off;
2245
2246        if (size > 0) {
2247          mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(sp, dp, size, (mlib_f32*)tab);
2248        }
2249
2250      } else {
2251
2252        off = ((4 - ((mlib_addr)sp & 3)) & 3);
2253        off = (off < size) ? off : size;
2254
2255        for (i = 0; i < off; i++) {
2256          s0 = tab[(*sp)];
2257          *dp++ = (s0 >> 24);
2258          *dp++ = (s0 >> 16);
2259          *dp++ = (s0 >> 8);
2260          *dp++ = s0;
2261          size--; sp++;
2262        }
2263
2264        if (size > 0) {
2265          mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(sp, dp, size, (mlib_f32*)tab);
2266        }
2267      }
2268
2269      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
2270      dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
2271    }
2272  }
2273}
2274
2275/***************************************************************/
2276