1/*
2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28#include "vis_proto.h"
29#include "mlib_image.h"
30#include "mlib_v_ImageLookUpFunc.h"
31
32/***************************************************************/
33static void mlib_v_ImageLookUpSI_U8_S16_2_SrcOff0_D1(const mlib_u8  *src,
34                                                     mlib_s16       *dst,
35                                                     mlib_s32       xsize,
36                                                     const mlib_f32 *table);
37
38static void mlib_v_ImageLookUpSI_U8_S16_2_DstNonAl_D1(const mlib_u8  *src,
39                                                      mlib_s16       *dst,
40                                                      mlib_s32       xsize,
41                                                      const mlib_f32 *table);
42
43static void mlib_v_ImageLookUpSI_U8_S16_2_DstA8D1_SMALL(const mlib_u8  *src,
44                                                        mlib_s16       *dst,
45                                                        mlib_s32       xsize,
46                                                        const mlib_s16 **table);
47
48static void mlib_v_ImageLookUpSI_U8_S16_2_D1_SMALL(const mlib_u8  *src,
49                                                   mlib_s16       *dst,
50                                                   mlib_s32       xsize,
51                                                   const mlib_s16 **table);
52
53static void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff0_D1(const mlib_u8  *src,
54                                                     mlib_s16       *dst,
55                                                     mlib_s32       xsize,
56                                                     const mlib_d64 *table);
57
58static void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff1_D1(const mlib_u8  *src,
59                                                     mlib_s16       *dst,
60                                                     mlib_s32       xsize,
61                                                     const mlib_d64 *table);
62
63static void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff2_D1(const mlib_u8  *src,
64                                                     mlib_s16       *dst,
65                                                     mlib_s32       xsize,
66                                                     const mlib_d64 *table);
67
68static void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff3_D1(const mlib_u8  *src,
69                                                     mlib_s16       *dst,
70                                                     mlib_s32       xsize,
71                                                     const mlib_d64 *table);
72
73static void mlib_v_ImageLookUpSI_U8_S16_3_D1_SMALL(const mlib_u8  *src,
74                                                   mlib_s16       *dst,
75                                                   mlib_s32       xsize,
76                                                   const mlib_s16 **table);
77
78static void mlib_v_ImageLookUpSI_U8_S16_4_DstA8D1_D1(const mlib_u8  *src,
79                                                     mlib_s16       *dst,
80                                                     mlib_s32       xsize,
81                                                     const mlib_d64 *table);
82
83static void mlib_v_ImageLookUpSI_U8_S16_4_DstNonAl_D1(const mlib_u8  *src,
84                                                      mlib_s16       *dst,
85                                                      mlib_s32       xsize,
86                                                      const mlib_d64 *table);
87
88static void mlib_v_ImageLookUpSI_U8_S16_4_DstOff0_D1_SMALL(const mlib_u8  *src,
89                                                           mlib_s16       *dst,
90                                                           mlib_s32       xsize,
91                                                           const mlib_s16 **table);
92
93static void mlib_v_ImageLookUpSI_U8_S16_4_DstOff1_D1_SMALL(const mlib_u8  *src,
94                                                           mlib_s16       *dst,
95                                                           mlib_s32       xsize,
96                                                           const mlib_s16 **table);
97
98static void mlib_v_ImageLookUpSI_U8_S16_4_DstOff2_D1_SMALL(const mlib_u8  *src,
99                                                           mlib_s16       *dst,
100                                                           mlib_s32       xsize,
101                                                           const mlib_s16 **table);
102
103static void mlib_v_ImageLookUpSI_U8_S16_4_DstOff3_D1_SMALL(const mlib_u8  *src,
104                                                           mlib_s16       *dst,
105                                                           mlib_s32       xsize,
106                                                           const mlib_s16 **table);
107
108/***************************************************************/
/*
 * Wrapper around vis_ld_u16_i(): casts the base pointer to void * (which
 * also drops any const qualifier) and passes the byte offset through as is.
 */
#define VIS_LD_U16_I(base, offset)  vis_ld_u16_i((void *)(base), (offset))
110
111/***************************************************************/
112void mlib_v_ImageLookUpSI_U8_S16_2_SrcOff0_D1(const mlib_u8  *src,
113                                              mlib_s16       *dst,
114                                              mlib_s32       xsize,
115                                              const mlib_f32 *table)
116{
117  mlib_u32 *sa;                        /* aligned pointer to source data */
118  mlib_u8 *sp;                         /* pointer to source data */
119  mlib_u32 s0;                         /* source data */
120  mlib_f32 *dp;                        /* aligned pointer to destination */
121  mlib_f32 acc0, acc1;                 /* destination data */
122  mlib_f32 acc2, acc3;                 /* destination data */
123  mlib_s32 i;                          /* loop variable */
124  mlib_u32 s00, s01, s02, s03;
125
126  sa = (mlib_u32 *) src;
127  dp = (mlib_f32 *) dst;
128
129  i = 0;
130
131  if (xsize >= 4) {
132
133    s0 = *sa++;
134    s00 = (s0 >> 22) & 0x3FC;
135    s01 = (s0 >> 14) & 0x3FC;
136
137#pragma pipeloop(0)
138    for (i = 0; i <= xsize - 8; i += 4, dp += 4) {
139      s02 = (s0 >> 6) & 0x3FC;
140      s03 = (s0 << 2) & 0x3FC;
141      acc0 = *(mlib_f32 *) ((mlib_u8 *) table + s00);
142      acc1 = *(mlib_f32 *) ((mlib_u8 *) table + s01);
143      acc2 = *(mlib_f32 *) ((mlib_u8 *) table + s02);
144      acc3 = *(mlib_f32 *) ((mlib_u8 *) table + s03);
145      s0 = *sa++;
146      s00 = (s0 >> 22) & 0x3FC;
147      s01 = (s0 >> 14) & 0x3FC;
148      dp[0] = acc0;
149      dp[1] = acc1;
150      dp[2] = acc2;
151      dp[3] = acc3;
152    }
153
154    s02 = (s0 >> 6) & 0x3FC;
155    s03 = (s0 << 2) & 0x3FC;
156    acc0 = *(mlib_f32 *) ((mlib_u8 *) table + s00);
157    acc1 = *(mlib_f32 *) ((mlib_u8 *) table + s01);
158    acc2 = *(mlib_f32 *) ((mlib_u8 *) table + s02);
159    acc3 = *(mlib_f32 *) ((mlib_u8 *) table + s03);
160    dp[0] = acc0;
161    dp[1] = acc1;
162    dp[2] = acc2;
163    dp[3] = acc3;
164    dp += 4;
165    i += 4;
166  }
167
168  sp = (mlib_u8 *) sa;
169
170  if (i <= xsize - 2) {
171    *dp++ = table[sp[0]];
172    *dp++ = table[sp[1]];
173    i += 2;
174    sp += 2;
175  }
176
177  if (i < xsize)
178    *dp = table[sp[0]];
179}
180
181/***************************************************************/
182void mlib_v_ImageLookUpSI_U8_S16_2_DstNonAl_D1(const mlib_u8  *src,
183                                               mlib_s16       *dst,
184                                               mlib_s32       xsize,
185                                               const mlib_f32 *table)
186{
187  mlib_u32 *sa;                        /* aligned pointer to source data */
188  mlib_u8 *sp;                         /* pointer to source data */
189  mlib_u32 s0;                         /* source data */
190  mlib_s16 *dl;                        /* pointer to start of destination */
191  mlib_d64 *dp;                        /* aligned pointer to destination */
192  mlib_d64 acc0, acc1, acc2;           /* destination data */
193  mlib_s32 i;                          /* loop variable */
194  mlib_s16 *dend;                      /* pointer to end of destination */
195  mlib_s32 emask;                      /* edge mask */
196  mlib_s32 off;
197  mlib_u32 s00, s01, s02, s03;
198
199  sa = (mlib_u32 *) src;
200  sp = (void *)src;
201  dl = dst;
202  dend = dl + (xsize << 1) - 1;
203  dp = (mlib_d64 *) ((mlib_addr) dl & (~7));
204  off = (mlib_addr) dp - (mlib_addr) dl;
205  vis_alignaddr(dp, off);
206
207  emask = vis_edge16(dl, dend);
208  acc0 = vis_freg_pair(table[sp[0]], table[sp[1]]);
209  vis_pst_16(vis_faligndata(acc0, acc0), dp++, emask);
210  sp += 2;
211
212  xsize -= 2;
213
214  if (xsize >= 2) {
215    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
216    *dp++ = vis_faligndata(acc0, acc1);
217    acc0 = acc1;
218    sp += 2;
219    xsize -= 2;
220  }
221
222  sa++;
223
224  i = 0;
225
226  if (xsize >= 4) {
227
228    s0 = *sa++;
229    s00 = (s0 >> 22) & 0x3FC;
230    s01 = (s0 >> 14) & 0x3FC;
231
232#pragma pipeloop(0)
233    for (i = 0; i <= xsize - 8; i += 4, dp += 2) {
234      s02 = (s0 >> 6) & 0x3FC;
235      s03 = (s0 << 2) & 0x3FC;
236      acc1 = vis_freg_pair(*(mlib_f32 *) ((mlib_u8 *) table + s00),
237                           *(mlib_f32 *) ((mlib_u8 *) table + s01));
238      acc2 = vis_freg_pair(*(mlib_f32 *) ((mlib_u8 *) table + s02),
239                           *(mlib_f32 *) ((mlib_u8 *) table + s03));
240      s0 = *sa++;
241      s00 = (s0 >> 22) & 0x3FC;
242      s01 = (s0 >> 14) & 0x3FC;
243      dp[0] = vis_faligndata(acc0, acc1);
244      dp[1] = vis_faligndata(acc1, acc2);
245      acc0 = acc2;
246    }
247
248    s02 = (s0 >> 6) & 0x3FC;
249    s03 = (s0 << 2) & 0x3FC;
250    acc1 = vis_freg_pair(*(mlib_f32 *) ((mlib_u8 *) table + s00),
251                         *(mlib_f32 *) ((mlib_u8 *) table + s01));
252    acc2 = vis_freg_pair(*(mlib_f32 *) ((mlib_u8 *) table + s02),
253                         *(mlib_f32 *) ((mlib_u8 *) table + s03));
254    dp[0] = vis_faligndata(acc0, acc1);
255    dp[1] = vis_faligndata(acc1, acc2);
256    acc0 = acc2;
257    sp = (mlib_u8 *) sa;
258    dp += 2;
259    i += 4;
260  }
261
262  if (i <= xsize - 2) {
263    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
264    *dp++ = vis_faligndata(acc0, acc1);
265    acc0 = acc1;
266    i += 2;
267    sp += 2;
268  }
269
270  if ((mlib_addr) dp <= (mlib_addr) dend) {
271    emask = vis_edge16(dp, dend);
272    acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
273    vis_pst_16(vis_faligndata(acc0, acc1), dp++, emask);
274  }
275
276  if ((mlib_addr) dp <= (mlib_addr) dend) {
277    emask = vis_edge16(dp, dend);
278    vis_pst_16(vis_faligndata(acc1, acc1), dp++, emask);
279  }
280}
281
282/***************************************************************/
283void mlib_v_ImageLookUpSI_U8_S16_2_DstA8D1_SMALL(const mlib_u8  *src,
284                                                 mlib_s16       *dst,
285                                                 mlib_s32       xsize,
286                                                 const mlib_s16 **table)
287{
288  mlib_u8 *sp;                         /* pointer to source data */
289  mlib_u32 s0, s1;                     /* source data */
290  mlib_s16 *dl;                        /* pointer to start of destination */
291  mlib_d64 *dp;                        /* aligned pointer to destination */
292  mlib_d64 t0, t1, t2;                 /* destination data */
293  mlib_d64 t3, acc;                    /* destination data */
294  mlib_s32 i;                          /* loop variable */
295  const mlib_s16 *tab0 = table[0];
296  const mlib_s16 *tab1 = table[1];
297
298  sp = (void *)src;
299  dl = dst;
300  dp = (mlib_d64 *) dl;
301
302  vis_alignaddr((void *)0, 6);
303
304  if (xsize >= 2) {
305
306    s0 = (sp[0] << 1);
307    s1 = (sp[1] << 1);
308    sp += 2;
309
310#pragma pipeloop(0)
311    for (i = 0; i <= xsize - 4; i += 2, sp += 2) {
312      t3 = VIS_LD_U16_I(tab1, s1);
313      t2 = VIS_LD_U16_I(tab0, s1);
314      t1 = VIS_LD_U16_I(tab1, s0);
315      t0 = VIS_LD_U16_I(tab0, s0);
316      acc = vis_faligndata(t3, acc);
317      acc = vis_faligndata(t2, acc);
318      acc = vis_faligndata(t1, acc);
319      acc = vis_faligndata(t0, acc);
320      s0 = (sp[0] << 1);
321      s1 = (sp[1] << 1);
322      *dp++ = acc;
323    }
324
325    t3 = VIS_LD_U16_I(tab1, s1);
326    t2 = VIS_LD_U16_I(tab0, s1);
327    t1 = VIS_LD_U16_I(tab1, s0);
328    t0 = VIS_LD_U16_I(tab0, s0);
329    acc = vis_faligndata(t3, acc);
330    acc = vis_faligndata(t2, acc);
331    acc = vis_faligndata(t1, acc);
332    acc = vis_faligndata(t0, acc);
333    *dp++ = acc;
334  }
335
336  if ((xsize & 1) != 0) {
337    s0 = (sp[0] << 1);
338    t1 = VIS_LD_U16_I(tab1, s0);
339    t0 = VIS_LD_U16_I(tab0, s0);
340    acc = vis_faligndata(t1, acc);
341    acc = vis_faligndata(t0, acc);
342    *(mlib_f32 *) dp = vis_read_hi(acc);
343  }
344}
345
346/***************************************************************/
347void mlib_v_ImageLookUpSI_U8_S16_2_D1_SMALL(const mlib_u8  *src,
348                                            mlib_s16       *dst,
349                                            mlib_s32       xsize,
350                                            const mlib_s16 **table)
351{
352  mlib_u8 *sp;                         /* pointer to source data */
353  mlib_u32 s0, s1, s2;                 /* source data */
354  mlib_s16 *dl;                        /* pointer to start of destination */
355  mlib_d64 *dp;                        /* aligned pointer to destination */
356  mlib_d64 t0, t1, t2;                 /* destination data */
357  mlib_d64 t3, acc;                    /* destination data */
358  mlib_s32 i;                          /* loop variable */
359  const mlib_s16 *tab0 = table[0];
360  const mlib_s16 *tab1 = table[1];
361
362  sp = (void *)src;
363  dl = dst;
364
365  vis_alignaddr((void *)0, 6);
366
367  s0 = *sp++;
368  *dl++ = tab0[s0];
369  dp = (mlib_d64 *) dl;
370  xsize--;
371  s0 <<= 1;
372
373  if (xsize >= 2) {
374
375    s1 = (sp[0] << 1);
376    s2 = (sp[1] << 1);
377    sp += 2;
378
379#pragma pipeloop(0)
380    for (i = 0; i <= xsize - 4; i += 2, sp += 2) {
381      t3 = VIS_LD_U16_I(tab0, s2);
382      t2 = VIS_LD_U16_I(tab1, s1);
383      t1 = VIS_LD_U16_I(tab0, s1);
384      t0 = VIS_LD_U16_I(tab1, s0);
385      acc = vis_faligndata(t3, acc);
386      acc = vis_faligndata(t2, acc);
387      acc = vis_faligndata(t1, acc);
388      acc = vis_faligndata(t0, acc);
389      s0 = s2;
390      s1 = (sp[0] << 1);
391      s2 = (sp[1] << 1);
392      *dp++ = acc;
393    }
394
395    t3 = VIS_LD_U16_I(tab0, s2);
396    t2 = VIS_LD_U16_I(tab1, s1);
397    t1 = VIS_LD_U16_I(tab0, s1);
398    t0 = VIS_LD_U16_I(tab1, s0);
399    acc = vis_faligndata(t3, acc);
400    acc = vis_faligndata(t2, acc);
401    acc = vis_faligndata(t1, acc);
402    acc = vis_faligndata(t0, acc);
403    s0 = s2;
404    *dp++ = acc;
405  }
406
407  dl = (mlib_s16 *) dp;
408
409  if ((xsize & 1) != 0) {
410    s1 = (sp[0] << 1);
411    t1 = VIS_LD_U16_I(tab0, s1);
412    t0 = VIS_LD_U16_I(tab1, s0);
413    acc = vis_faligndata(t1, acc);
414    acc = vis_faligndata(t0, acc);
415    *(mlib_f32 *) dp = vis_read_hi(acc);
416    s0 = s1;
417    dl += 2;
418  }
419
420  s0 >>= 1;
421  *dl = tab1[s0];
422}
423
424/***************************************************************/
425void mlib_v_ImageLookUpSI_U8_S16_2(const mlib_u8  *src,
426                                   mlib_s32       slb,
427                                   mlib_s16       *dst,
428                                   mlib_s32       dlb,
429                                   mlib_s32       xsize,
430                                   mlib_s32       ysize,
431                                   const mlib_s16 **table)
432{
433  if ((xsize * ysize) < 550) {
434    mlib_u8 *sl;
435    mlib_s16 *dl;
436    mlib_s32 j;
437    const mlib_s16 *tab0 = table[0];
438    const mlib_s16 *tab1 = table[1];
439
440    sl = (void *)src;
441    dl = dst;
442
443    /* row loop */
444    for (j = 0; j < ysize; j++) {
445      mlib_u8 *sp = sl;
446      mlib_s16 *dp = dl;
447      mlib_s32 off, s0, size = xsize;
448
449      off = ((8 - ((mlib_addr) dp & 7)) & 7);
450
451      if ((off >= 4) && (size > 0)) {
452        s0 = *sp++;
453        *dp++ = tab0[s0];
454        *dp++ = tab1[s0];
455        size--;
456      }
457
458      if (size > 0) {
459
460        if (((mlib_addr) dp & 7) == 0) {
461          mlib_v_ImageLookUpSI_U8_S16_2_DstA8D1_SMALL(sp, dp, size, table);
462        }
463        else {
464          mlib_v_ImageLookUpSI_U8_S16_2_D1_SMALL(sp, dp, size, table);
465        }
466      }
467
468      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
469      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
470    }
471  }
472  else {
473    mlib_u8 *sl;
474    mlib_s16 *dl;
475    mlib_u32 tab[256];
476    mlib_u16 *tab0 = (mlib_u16 *) table[0];
477    mlib_u16 *tab1 = (mlib_u16 *) table[1];
478    mlib_s32 i, j;
479    mlib_u32 s0, s1, s2;
480
481    s0 = tab0[0];
482    s1 = tab1[0];
483    for (i = 1; i < 256; i++) {
484      s2 = (s0 << 16) + s1;
485      s0 = tab0[i];
486      s1 = tab1[i];
487      tab[i - 1] = s2;
488    }
489
490    s2 = (s0 << 16) + s1;
491    tab[255] = s2;
492
493    sl = (void *)src;
494    dl = dst;
495
496    /* row loop */
497    for (j = 0; j < ysize; j++) {
498      mlib_u8 *sp = sl;
499      mlib_s16 *dp = dl;
500      mlib_s32 off, s0, size = xsize;
501
502      if (((mlib_addr) dp & 3) == 0) {
503
504        off = (4 - (mlib_addr) sp & 3) & 3;
505
506        off = (off < size) ? off : size;
507
508#pragma pipeloop(0)
509        for (i = 0; i < off; i++, sp++) {
510          *(mlib_u32 *) dp = tab[(*sp)];
511          dp += 2;
512        }
513
514        size -= off;
515
516        if (size > 0) {
517          mlib_v_ImageLookUpSI_U8_S16_2_SrcOff0_D1(sp, dp, size,
518                                                   (mlib_f32 *) tab);
519        }
520      }
521      else {
522
523        off = ((4 - ((mlib_addr) sp & 3)) & 3);
524        off = (off < size) ? off : size;
525
526        for (i = 0; i < off; i++) {
527          s0 = tab[(*sp)];
528          *dp++ = (s0 >> 16);
529          *dp++ = (s0 & 0xFFFF);
530          size--;
531          sp++;
532        }
533
534        if (size > 0) {
535          mlib_v_ImageLookUpSI_U8_S16_2_DstNonAl_D1(sp, dp, size,
536                                                    (mlib_f32 *) tab);
537        }
538      }
539
540      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
541      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
542    }
543  }
544}
545
546/***************************************************************/
547void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff0_D1(const mlib_u8  *src,
548                                              mlib_s16       *dst,
549                                              mlib_s32       xsize,
550                                              const mlib_d64 *table)
551{
552  mlib_u8 *sp;                         /* pointer to source data */
553  mlib_u32 *sa;                        /* aligned pointer to source data */
554  mlib_u32 s0;                         /* source data */
555  mlib_s16 *dl;                        /* pointer to start of destination */
556  mlib_d64 *dp;                        /* aligned pointer to destination */
557  mlib_d64 t0, t1, t2, t3;             /* destination data */
558  mlib_d64 acc0, acc1, acc2;           /* destination data */
559  mlib_s32 i;                          /* loop variable */
560  mlib_s16 *ptr;
561
562  dl = dst;
563  sp = (void *)src;
564  dp = (mlib_d64 *) dl;
565  sa = (mlib_u32 *) sp;
566
567  vis_alignaddr((void *)0, 6);
568
569  i = 0;
570
571  if (xsize >= 4) {
572
573    s0 = *sa++;
574
575#pragma pipeloop(0)
576    for (i = 0; i <= xsize - 8; i += 4, dp += 3) {
577      t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 21) & 0x7F8));
578      t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 13) & 0x7F8));
579      t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
580      t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
581      acc0 = vis_faligndata(t0, t0);
582      acc1 = vis_faligndata(acc0, acc0);
583      acc2 = vis_faligndata(acc0, t1);
584      acc0 = vis_faligndata(acc1, acc1);
585      acc1 = vis_faligndata(acc1, acc2);
586      acc2 = vis_faligndata(acc2, t2);
587      acc0 = vis_faligndata(acc0, acc1);
588      acc1 = vis_faligndata(acc1, acc2);
589      acc2 = vis_faligndata(acc2, t3);
590      s0 = *sa++;
591      dp[0] = acc0;
592      dp[1] = acc1;
593      dp[2] = acc2;
594    }
595
596    t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 21) & 0x7F8));
597    t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 13) & 0x7F8));
598    t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
599    t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
600    acc0 = vis_faligndata(t0, t0);
601    acc1 = vis_faligndata(acc0, acc0);
602    acc2 = vis_faligndata(acc0, t1);
603    acc0 = vis_faligndata(acc1, acc1);
604    acc1 = vis_faligndata(acc1, acc2);
605    acc2 = vis_faligndata(acc2, t2);
606    acc0 = vis_faligndata(acc0, acc1);
607    acc1 = vis_faligndata(acc1, acc2);
608    acc2 = vis_faligndata(acc2, t3);
609    dp[0] = acc0;
610    dp[1] = acc1;
611    dp[2] = acc2;
612    i += 4;
613    dp += 3;
614  }
615
616  dl = (mlib_s16 *) dp;
617
618#pragma pipeloop(0)
619  for (; i < xsize; i++) {
620    ptr = (mlib_s16 *) (table + src[i]);
621    dl[0] = ptr[0];
622    dl[1] = ptr[1];
623    dl[2] = ptr[2];
624    dl += 3;
625  }
626}
627
628/***************************************************************/
629void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff1_D1(const mlib_u8  *src,
630                                              mlib_s16       *dst,
631                                              mlib_s32       xsize,
632                                              const mlib_d64 *table)
633{
634  mlib_u8 *sp;                         /* pointer to source data */
635  mlib_u32 *sa;                        /* aligned pointer to source data */
636  mlib_u32 s0, s1;                     /* source data */
637  mlib_s16 *dl;                        /* pointer to start of destination */
638  mlib_d64 *dp;                        /* aligned pointer to destination */
639  mlib_d64 t0, t1, t2, t3;             /* destination data */
640  mlib_d64 acc0, acc1, acc2;           /* destination data */
641  mlib_s32 i;                          /* loop variable */
642  mlib_s16 *ptr;
643
644  dl = dst;
645  sp = (void *)src;
646  dp = (mlib_d64 *) dl;
647  sa = (mlib_u32 *) (sp - 1);
648
649  i = 0;
650  s0 = *sa++;
651
652  vis_alignaddr((void *)0, 6);
653
654  if (xsize >= 4) {
655
656    s1 = *sa++;
657
658#pragma pipeloop(0)
659    for (i = 0; i <= xsize - 8; i += 4, dp += 3) {
660      t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 13) & 0x7F8));
661      t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
662      t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
663      t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
664      acc0 = vis_faligndata(t0, t0);
665      acc1 = vis_faligndata(acc0, acc0);
666      acc2 = vis_faligndata(acc0, t1);
667      acc0 = vis_faligndata(acc1, acc1);
668      acc1 = vis_faligndata(acc1, acc2);
669      acc2 = vis_faligndata(acc2, t2);
670      acc0 = vis_faligndata(acc0, acc1);
671      acc1 = vis_faligndata(acc1, acc2);
672      acc2 = vis_faligndata(acc2, t3);
673      s0 = s1;
674      s1 = *sa++;
675      dp[0] = acc0;
676      dp[1] = acc1;
677      dp[2] = acc2;
678    }
679
680    t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 13) & 0x7F8));
681    t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
682    t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
683    t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
684    acc0 = vis_faligndata(t0, t0);
685    acc1 = vis_faligndata(acc0, acc0);
686    acc2 = vis_faligndata(acc0, t1);
687    acc0 = vis_faligndata(acc1, acc1);
688    acc1 = vis_faligndata(acc1, acc2);
689    acc2 = vis_faligndata(acc2, t2);
690    acc0 = vis_faligndata(acc0, acc1);
691    acc1 = vis_faligndata(acc1, acc2);
692    acc2 = vis_faligndata(acc2, t3);
693    dp[0] = acc0;
694    dp[1] = acc1;
695    dp[2] = acc2;
696    i += 4;
697    dp += 3;
698  }
699
700  dl = (mlib_s16 *) dp;
701
702#pragma pipeloop(0)
703  for (; i < xsize; i++) {
704    ptr = (mlib_s16 *) (table + src[i]);
705    dl[0] = ptr[0];
706    dl[1] = ptr[1];
707    dl[2] = ptr[2];
708    dl += 3;
709  }
710}
711
712/***************************************************************/
713void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff2_D1(const mlib_u8  *src,
714                                              mlib_s16       *dst,
715                                              mlib_s32       xsize,
716                                              const mlib_d64 *table)
717{
718  mlib_u8 *sp;                         /* pointer to source data */
719  mlib_u32 *sa;                        /* aligned pointer to source data */
720  mlib_u32 s0, s1;                     /* source data */
721  mlib_s16 *dl;                        /* pointer to start of destination */
722  mlib_d64 *dp;                        /* aligned pointer to destination */
723  mlib_d64 t0, t1, t2, t3;             /* destination data */
724  mlib_d64 acc0, acc1, acc2;           /* destination data */
725  mlib_s32 i;                          /* loop variable */
726  mlib_s16 *ptr;
727
728  dl = dst;
729  sp = (void *)src;
730  dp = (mlib_d64 *) dl;
731  sa = (mlib_u32 *) (sp - 2);
732
733  i = 0;
734  s0 = *sa++;
735
736  vis_alignaddr((void *)0, 6);
737
738  if (xsize >= 4) {
739
740    s1 = *sa++;
741
742#pragma pipeloop(0)
743    for (i = 0; i <= xsize - 8; i += 4, dp += 3) {
744      t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
745      t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
746      t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
747      t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 13) & 0x7F8));
748      acc0 = vis_faligndata(t0, t0);
749      acc1 = vis_faligndata(acc0, acc0);
750      acc2 = vis_faligndata(acc0, t1);
751      acc0 = vis_faligndata(acc1, acc1);
752      acc1 = vis_faligndata(acc1, acc2);
753      acc2 = vis_faligndata(acc2, t2);
754      acc0 = vis_faligndata(acc0, acc1);
755      acc1 = vis_faligndata(acc1, acc2);
756      acc2 = vis_faligndata(acc2, t3);
757      s0 = s1;
758      s1 = *sa++;
759      dp[0] = acc0;
760      dp[1] = acc1;
761      dp[2] = acc2;
762    }
763
764    t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 >> 5) & 0x7F8));
765    t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
766    t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
767    t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 13) & 0x7F8));
768    acc0 = vis_faligndata(t0, t0);
769    acc1 = vis_faligndata(acc0, acc0);
770    acc2 = vis_faligndata(acc0, t1);
771    acc0 = vis_faligndata(acc1, acc1);
772    acc1 = vis_faligndata(acc1, acc2);
773    acc2 = vis_faligndata(acc2, t2);
774    acc0 = vis_faligndata(acc0, acc1);
775    acc1 = vis_faligndata(acc1, acc2);
776    acc2 = vis_faligndata(acc2, t3);
777    dp[0] = acc0;
778    dp[1] = acc1;
779    dp[2] = acc2;
780    i += 4;
781    dp += 3;
782  }
783
784  dl = (mlib_s16 *) dp;
785
786#pragma pipeloop(0)
787  for (; i < xsize; i++) {
788    ptr = (mlib_s16 *) (table + src[i]);
789    dl[0] = ptr[0];
790    dl[1] = ptr[1];
791    dl[2] = ptr[2];
792    dl += 3;
793  }
794}
795
796/***************************************************************/
797void mlib_v_ImageLookUpSI_U8_S16_3_SrcOff3_D1(const mlib_u8  *src,
798                                              mlib_s16       *dst,
799                                              mlib_s32       xsize,
800                                              const mlib_d64 *table)
801{
802  mlib_u8 *sp;                         /* pointer to source data */
803  mlib_u32 *sa;                        /* aligned pointer to source data */
804  mlib_u32 s0, s1;                     /* source data */
805  mlib_s16 *dl;                        /* pointer to start of destination */
806  mlib_d64 *dp;                        /* aligned pointer to destination */
807  mlib_d64 t0, t1, t2, t3;             /* destination data */
808  mlib_d64 acc0, acc1, acc2;           /* destination data */
809  mlib_s32 i;                          /* loop variable */
810  mlib_s16 *ptr;
811
812  dl = dst;
813  sp = (void *)src;
814  dp = (mlib_d64 *) dl;
815  sa = (mlib_u32 *) (sp - 3);
816
817  i = 0;
818  s0 = *sa++;
819
820  vis_alignaddr((void *)0, 6);
821
822  if (xsize >= 4) {
823
824    s1 = *sa++;
825
826#pragma pipeloop(0)
827    for (i = 0; i <= xsize - 8; i += 4, dp += 3) {
828      t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
829      t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
830      t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 13) & 0x7F8));
831      t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 5) & 0x7F8));
832      acc0 = vis_faligndata(t0, t0);
833      acc1 = vis_faligndata(acc0, acc0);
834      acc2 = vis_faligndata(acc0, t1);
835      acc0 = vis_faligndata(acc1, acc1);
836      acc1 = vis_faligndata(acc1, acc2);
837      acc2 = vis_faligndata(acc2, t2);
838      acc0 = vis_faligndata(acc0, acc1);
839      acc1 = vis_faligndata(acc1, acc2);
840      acc2 = vis_faligndata(acc2, t3);
841      s0 = s1;
842      s1 = *sa++;
843      dp[0] = acc0;
844      dp[1] = acc1;
845      dp[2] = acc2;
846    }
847
848    t0 = *(mlib_d64 *) ((mlib_u8 *) table + ((s0 << 3) & 0x7F8));
849    t1 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 21) & 0x7F8));
850    t2 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 13) & 0x7F8));
851    t3 = *(mlib_d64 *) ((mlib_u8 *) table + ((s1 >> 5) & 0x7F8));
852    acc0 = vis_faligndata(t0, t0);
853    acc1 = vis_faligndata(acc0, acc0);
854    acc2 = vis_faligndata(acc0, t1);
855    acc0 = vis_faligndata(acc1, acc1);
856    acc1 = vis_faligndata(acc1, acc2);
857    acc2 = vis_faligndata(acc2, t2);
858    acc0 = vis_faligndata(acc0, acc1);
859    acc1 = vis_faligndata(acc1, acc2);
860    acc2 = vis_faligndata(acc2, t3);
861    dp[0] = acc0;
862    dp[1] = acc1;
863    dp[2] = acc2;
864    i += 4;
865    dp += 3;
866  }
867
868  dl = (mlib_s16 *) dp;
869
870#pragma pipeloop(0)
871  for (; i < xsize; i++) {
872    ptr = (mlib_s16 *) (table + src[i]);
873    dl[0] = ptr[0];
874    dl[1] = ptr[1];
875    dl[2] = ptr[2];
876    dl += 3;
877  }
878}
879
880/***************************************************************/
881void mlib_v_ImageLookUpSI_U8_S16_3_D1_SMALL(const mlib_u8  *src,
882                                            mlib_s16       *dst,
883                                            mlib_s32       xsize,
884                                            const mlib_s16 **table)
885{
886  mlib_u8 *sp;                         /* pointer to source data */
887  mlib_s16 *dl;                        /* pointer to start of destination */
888  mlib_d64 *dp;                        /* aligned pointer to destination */
889  mlib_d64 t0, t1, t2, t3;             /* destination data */
890  mlib_d64 acc0, acc1, acc2;           /* destination data */
891  mlib_s32 i;                          /* loop variable */
892  const mlib_s16 *tab0 = table[0];
893  const mlib_s16 *tab1 = table[1];
894  const mlib_s16 *tab2 = table[2];
895  mlib_u32 s00, s01, s02, s03;
896
897  sp = (void *)src;
898  dl = dst;
899  dp = (mlib_d64 *) dl;
900
901  vis_alignaddr((void *)0, 6);
902
903  i = 0;
904
905  if (xsize >= 4) {
906
907    s00 = (sp[0] << 1);
908    s01 = (sp[1] << 1);
909    s02 = (sp[2] << 1);
910    s03 = (sp[3] << 1);
911    sp += 4;
912
913#pragma pipeloop(0)
914    for (i = 0; i <= xsize - 8; i += 4, sp += 4) {
915      t3 = VIS_LD_U16_I(tab0, s01);
916      t2 = VIS_LD_U16_I(tab2, s00);
917      t1 = VIS_LD_U16_I(tab1, s00);
918      t0 = VIS_LD_U16_I(tab0, s00);
919      acc0 = vis_faligndata(t3, acc0);
920      acc0 = vis_faligndata(t2, acc0);
921      acc0 = vis_faligndata(t1, acc0);
922      acc0 = vis_faligndata(t0, acc0);
923      t3 = VIS_LD_U16_I(tab1, s02);
924      t2 = VIS_LD_U16_I(tab0, s02);
925      t1 = VIS_LD_U16_I(tab2, s01);
926      t0 = VIS_LD_U16_I(tab1, s01);
927      acc1 = vis_faligndata(t3, acc1);
928      acc1 = vis_faligndata(t2, acc1);
929      acc1 = vis_faligndata(t1, acc1);
930      acc1 = vis_faligndata(t0, acc1);
931      t3 = VIS_LD_U16_I(tab2, s03);
932      t2 = VIS_LD_U16_I(tab1, s03);
933      t1 = VIS_LD_U16_I(tab0, s03);
934      t0 = VIS_LD_U16_I(tab2, s02);
935      acc2 = vis_faligndata(t3, acc2);
936      acc2 = vis_faligndata(t2, acc2);
937      acc2 = vis_faligndata(t1, acc2);
938      acc2 = vis_faligndata(t0, acc2);
939      s00 = (sp[0] << 1);
940      s01 = (sp[1] << 1);
941      s02 = (sp[2] << 1);
942      s03 = (sp[3] << 1);
943      *dp++ = acc0;
944      *dp++ = acc1;
945      *dp++ = acc2;
946    }
947
948    t3 = VIS_LD_U16_I(tab0, s01);
949    t2 = VIS_LD_U16_I(tab2, s00);
950    t1 = VIS_LD_U16_I(tab1, s00);
951    t0 = VIS_LD_U16_I(tab0, s00);
952    acc0 = vis_faligndata(t3, acc0);
953    acc0 = vis_faligndata(t2, acc0);
954    acc0 = vis_faligndata(t1, acc0);
955    acc0 = vis_faligndata(t0, acc0);
956    t3 = VIS_LD_U16_I(tab1, s02);
957    t2 = VIS_LD_U16_I(tab0, s02);
958    t1 = VIS_LD_U16_I(tab2, s01);
959    t0 = VIS_LD_U16_I(tab1, s01);
960    acc1 = vis_faligndata(t3, acc1);
961    acc1 = vis_faligndata(t2, acc1);
962    acc1 = vis_faligndata(t1, acc1);
963    acc1 = vis_faligndata(t0, acc1);
964    t3 = VIS_LD_U16_I(tab2, s03);
965    t2 = VIS_LD_U16_I(tab1, s03);
966    t1 = VIS_LD_U16_I(tab0, s03);
967    t0 = VIS_LD_U16_I(tab2, s02);
968    acc2 = vis_faligndata(t3, acc2);
969    acc2 = vis_faligndata(t2, acc2);
970    acc2 = vis_faligndata(t1, acc2);
971    acc2 = vis_faligndata(t0, acc2);
972    *dp++ = acc0;
973    *dp++ = acc1;
974    *dp++ = acc2;
975    i += 4;
976  }
977
978  dl = (mlib_s16 *) dp;
979
980#pragma pipeloop(0)
981  for (; i < xsize; i++) {
982    s00 = sp[0];
983    dl[0] = tab0[s00];
984    dl[1] = tab1[s00];
985    dl[2] = tab2[s00];
986    dl += 3;
987    sp++;
988  }
989}
990
991/***************************************************************/
992void mlib_v_ImageLookUpSI_U8_S16_3(const mlib_u8  *src,
993                                   mlib_s32       slb,
994                                   mlib_s16       *dst,
995                                   mlib_s32       dlb,
996                                   mlib_s32       xsize,
997                                   mlib_s32       ysize,
998                                   const mlib_s16 **table)
999{
1000  if ((xsize * ysize) < 550) {
1001    mlib_u8 *sl;
1002    mlib_s16 *dl;
1003    mlib_s32 i, j;
1004    const mlib_s16 *tab0 = table[0];
1005    const mlib_s16 *tab1 = table[1];
1006    const mlib_s16 *tab2 = table[2];
1007
1008    sl = (void *)src;
1009    dl = dst;
1010
1011    /* row loop */
1012    for (j = 0; j < ysize; j++) {
1013      mlib_u8 *sp = sl;
1014      mlib_s16 *dp = dl;
1015      mlib_s32 off, s0, size = xsize;
1016
1017      off = ((mlib_addr) dp & 7) >> 1;
1018      off = (off < size) ? off : size;
1019
1020      for (i = 0; i < off; i++) {
1021        s0 = *sp++;
1022        *dp++ = tab0[s0];
1023        *dp++ = tab1[s0];
1024        *dp++ = tab2[s0];
1025        size--;
1026      }
1027
1028      if (size > 0) {
1029        mlib_v_ImageLookUpSI_U8_S16_3_D1_SMALL(sp, dp, size, table);
1030      }
1031
1032      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1033      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1034    }
1035  }
1036  else {
1037    mlib_u8 *sl;
1038    mlib_s16 *dl;
1039    mlib_u32 tab[512];
1040    mlib_u16 *tab0 = (mlib_u16 *) table[0];
1041    mlib_u16 *tab1 = (mlib_u16 *) table[1];
1042    mlib_u16 *tab2 = (mlib_u16 *) table[2];
1043    mlib_s32 i, j;
1044    mlib_u32 s0, s1, s2, s3;
1045
1046    s0 = tab0[0];
1047    s1 = tab1[0];
1048    s2 = tab2[0];
1049    for (i = 1; i < 256; i++) {
1050      s3 = (s0 << 16) + s1;
1051      s0 = tab0[i];
1052      s1 = tab1[i];
1053      tab[2 * i - 2] = s3;
1054      tab[2 * i - 1] = (s2 << 16);
1055      s2 = tab2[i];
1056    }
1057
1058    s3 = (s0 << 16) + s1;
1059    tab[510] = s3;
1060    tab[511] = (s2 << 16);
1061
1062    sl = (void *)src;
1063    dl = dst;
1064
1065    /* row loop */
1066    for (j = 0; j < ysize; j++) {
1067      mlib_u8 *sp = sl;
1068      mlib_s16 *dp = dl;
1069      mlib_s32 off, size = xsize;
1070      mlib_s16 *ptr;
1071
1072      off = ((mlib_addr) dp & 7) >> 1;
1073      off = (off < size) ? off : size;
1074
1075#pragma pipeloop(0)
1076      for (i = 0; i < off; i++) {
1077        ptr = (mlib_s16 *) (tab + 2 * sp[i]);
1078        dp[0] = ptr[0];
1079        dp[1] = ptr[1];
1080        dp[2] = ptr[2];
1081        dp += 3;
1082      }
1083
1084      size -= off;
1085      sp += off;
1086
1087      if (size > 0) {
1088        off = (mlib_addr) sp & 3;
1089
1090        if (off == 0) {
1091          mlib_v_ImageLookUpSI_U8_S16_3_SrcOff0_D1(sp, dp, size,
1092                                                   (mlib_d64 *) tab);
1093        }
1094        else if (off == 1) {
1095          mlib_v_ImageLookUpSI_U8_S16_3_SrcOff1_D1(sp, dp, size,
1096                                                   (mlib_d64 *) tab);
1097        }
1098        else if (off == 2) {
1099          mlib_v_ImageLookUpSI_U8_S16_3_SrcOff2_D1(sp, dp, size,
1100                                                   (mlib_d64 *) tab);
1101        }
1102        else if (off == 3) {
1103          mlib_v_ImageLookUpSI_U8_S16_3_SrcOff3_D1(sp, dp, size,
1104                                                   (mlib_d64 *) tab);
1105        }
1106      }
1107
1108      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1109      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1110    }
1111  }
1112}
1113
1114/***************************************************************/
1115void mlib_v_ImageLookUpSI_U8_S16_4_DstA8D1_D1(const mlib_u8  *src,
1116                                              mlib_s16       *dst,
1117                                              mlib_s32       xsize,
1118                                              const mlib_d64 *table)
1119{
1120  mlib_u32 *sa;                        /* aligned pointer to source data */
1121  mlib_u8 *sp;                         /* pointer to source data */
1122  mlib_u32 s0;                         /* source data */
1123  mlib_d64 *dp;                        /* aligned pointer to destination */
1124  mlib_d64 acc0, acc1;                 /* destination data */
1125  mlib_d64 acc2, acc3;                 /* destination data */
1126  mlib_s32 i;                          /* loop variable */
1127  mlib_u32 s00, s01, s02, s03;
1128
1129  sa = (mlib_u32 *) src;
1130  dp = (mlib_d64 *) dst;
1131
1132  i = 0;
1133
1134  if (xsize >= 4) {
1135
1136    s0 = *sa++;
1137    s00 = (s0 >> 21) & 0x7F8;
1138    s01 = (s0 >> 13) & 0x7F8;
1139
1140#pragma pipeloop(0)
1141    for (i = 0; i <= xsize - 8; i += 4, dp += 4) {
1142      s02 = (s0 >> 5) & 0x7F8;
1143      s03 = (s0 << 3) & 0x7F8;
1144      acc0 = *(mlib_d64 *) ((mlib_u8 *) table + s00);
1145      acc1 = *(mlib_d64 *) ((mlib_u8 *) table + s01);
1146      acc2 = *(mlib_d64 *) ((mlib_u8 *) table + s02);
1147      acc3 = *(mlib_d64 *) ((mlib_u8 *) table + s03);
1148      s0 = *sa++;
1149      s00 = (s0 >> 21) & 0x7F8;
1150      s01 = (s0 >> 13) & 0x7F8;
1151      dp[0] = acc0;
1152      dp[1] = acc1;
1153      dp[2] = acc2;
1154      dp[3] = acc3;
1155    }
1156
1157    s02 = (s0 >> 5) & 0x7F8;
1158    s03 = (s0 << 3) & 0x7F8;
1159    acc0 = *(mlib_d64 *) ((mlib_u8 *) table + s00);
1160    acc1 = *(mlib_d64 *) ((mlib_u8 *) table + s01);
1161    acc2 = *(mlib_d64 *) ((mlib_u8 *) table + s02);
1162    acc3 = *(mlib_d64 *) ((mlib_u8 *) table + s03);
1163    dp[0] = acc0;
1164    dp[1] = acc1;
1165    dp[2] = acc2;
1166    dp[3] = acc3;
1167    dp += 4;
1168    i += 4;
1169  }
1170
1171  sp = (mlib_u8 *) sa;
1172
1173  if (i <= xsize - 2) {
1174    *dp++ = table[sp[0]];
1175    *dp++ = table[sp[1]];
1176    i += 2;
1177    sp += 2;
1178  }
1179
1180  if (i < xsize)
1181    *dp++ = table[sp[0]];
1182}
1183
1184/***************************************************************/
1185void mlib_v_ImageLookUpSI_U8_S16_4_DstNonAl_D1(const mlib_u8  *src,
1186                                               mlib_s16       *dst,
1187                                               mlib_s32       xsize,
1188                                               const mlib_d64 *table)
1189{
1190  mlib_u32 *sa;                        /* aligned pointer to source data */
1191  mlib_u8 *sp;                         /* pointer to source data */
1192  mlib_u32 s0;                         /* source data */
1193  mlib_s16 *dl;                        /* pointer to start of destination */
1194  mlib_d64 *dp;                        /* aligned pointer to destination */
1195  mlib_d64 acc0, acc1;                 /* destination data */
1196  mlib_d64 acc2, acc3, acc4;           /* destination data */
1197  mlib_s32 i;                          /* loop variable */
1198  mlib_s16 *dend;                      /* pointer to end of destination */
1199  mlib_s32 emask;                      /* edge mask */
1200  mlib_s32 off;
1201  mlib_u32 s00, s01, s02, s03;
1202
1203  sp = (void *)src;
1204  dl = dst;
1205  dend = dl + (xsize << 2) - 1;
1206  dp = (mlib_d64 *) ((mlib_addr) dl & (~7));
1207  off = (mlib_addr) dp - (mlib_addr) dl;
1208  vis_alignaddr(dp, off);
1209
1210  emask = vis_edge16(dl, dend);
1211  acc0 = table[sp[0]];
1212  vis_pst_16(vis_faligndata(acc0, acc0), dp++, emask);
1213  sp++;
1214
1215  sa = (mlib_u32 *) sp;
1216
1217  xsize--;
1218
1219  i = 0;
1220
1221  if (xsize >= 4) {
1222
1223    s0 = *sa++;
1224    s00 = (s0 >> 21) & 0x7F8;
1225    s01 = (s0 >> 13) & 0x7F8;
1226
1227#pragma pipeloop(0)
1228    for (i = 0; i <= xsize - 8; i += 4, dp += 4) {
1229      s02 = (s0 >> 5) & 0x7F8;
1230      s03 = (s0 << 3) & 0x7F8;
1231      acc1 = *(mlib_d64 *) ((mlib_u8 *) table + s00);
1232      acc2 = *(mlib_d64 *) ((mlib_u8 *) table + s01);
1233      acc3 = *(mlib_d64 *) ((mlib_u8 *) table + s02);
1234      acc4 = *(mlib_d64 *) ((mlib_u8 *) table + s03);
1235      s0 = *sa++;
1236      s00 = (s0 >> 21) & 0x7F8;
1237      s01 = (s0 >> 13) & 0x7F8;
1238      dp[0] = vis_faligndata(acc0, acc1);
1239      dp[1] = vis_faligndata(acc1, acc2);
1240      dp[2] = vis_faligndata(acc2, acc3);
1241      dp[3] = vis_faligndata(acc3, acc4);
1242      acc0 = acc4;
1243    }
1244
1245    s02 = (s0 >> 5) & 0x7F8;
1246    s03 = (s0 << 3) & 0x7F8;
1247    acc1 = *(mlib_d64 *) ((mlib_u8 *) table + s00);
1248    acc2 = *(mlib_d64 *) ((mlib_u8 *) table + s01);
1249    acc3 = *(mlib_d64 *) ((mlib_u8 *) table + s02);
1250    acc4 = *(mlib_d64 *) ((mlib_u8 *) table + s03);
1251    dp[0] = vis_faligndata(acc0, acc1);
1252    dp[1] = vis_faligndata(acc1, acc2);
1253    dp[2] = vis_faligndata(acc2, acc3);
1254    dp[3] = vis_faligndata(acc3, acc4);
1255    acc0 = acc4;
1256    dp += 4;
1257    i += 4;
1258  }
1259
1260  sp = (mlib_u8 *) sa;
1261
1262  if (i <= xsize - 2) {
1263    acc1 = table[sp[0]];
1264    acc2 = table[sp[1]];
1265    *dp++ = vis_faligndata(acc0, acc1);
1266    *dp++ = vis_faligndata(acc1, acc2);
1267    i += 2;
1268    sp += 2;
1269    acc0 = acc2;
1270  }
1271
1272  if (i < xsize) {
1273    acc1 = table[sp[0]];
1274    *dp++ = vis_faligndata(acc0, acc1);
1275    acc0 = acc1;
1276  }
1277
1278  emask = vis_edge16(dp, dend);
1279  vis_pst_16(vis_faligndata(acc0, acc0), dp++, emask);
1280}
1281
1282/***************************************************************/
1283void mlib_v_ImageLookUpSI_U8_S16_4_DstOff0_D1_SMALL(const mlib_u8  *src,
1284                                                    mlib_s16       *dst,
1285                                                    mlib_s32       xsize,
1286                                                    const mlib_s16 **table)
1287{
1288  mlib_u8 *sp;                         /* pointer to source data */
1289  mlib_u32 s0;                         /* source data */
1290  mlib_s16 *dl;                        /* pointer to start of destination */
1291  mlib_d64 *dp;                        /* aligned pointer to destination */
1292  mlib_d64 t0, t1, t2, t3;             /* destination data */
1293  mlib_d64 acc;                        /* destination data */
1294  mlib_s32 i;                          /* loop variable */
1295  const mlib_s16 *tab0 = table[0];
1296  const mlib_s16 *tab1 = table[1];
1297  const mlib_s16 *tab2 = table[2];
1298  const mlib_s16 *tab3 = table[3];
1299
1300  sp = (void *)src;
1301  dl = dst;
1302  dp = (mlib_d64 *) dl;
1303
1304  vis_alignaddr((void *)0, 6);
1305
1306  if (xsize >= 1) {
1307
1308    s0 = (*sp++) << 1;
1309
1310#pragma pipeloop(0)
1311    for (i = 0; i <= xsize - 2; i++) {
1312      t3 = VIS_LD_U16_I(tab3, s0);
1313      t2 = VIS_LD_U16_I(tab2, s0);
1314      t1 = VIS_LD_U16_I(tab1, s0);
1315      t0 = VIS_LD_U16_I(tab0, s0);
1316      acc = vis_faligndata(t3, acc);
1317      acc = vis_faligndata(t2, acc);
1318      acc = vis_faligndata(t1, acc);
1319      acc = vis_faligndata(t0, acc);
1320      s0 = (*sp++) << 1;
1321      *dp++ = acc;
1322    }
1323
1324    t3 = VIS_LD_U16_I(tab3, s0);
1325    t2 = VIS_LD_U16_I(tab2, s0);
1326    t1 = VIS_LD_U16_I(tab1, s0);
1327    t0 = VIS_LD_U16_I(tab0, s0);
1328    acc = vis_faligndata(t3, acc);
1329    acc = vis_faligndata(t2, acc);
1330    acc = vis_faligndata(t1, acc);
1331    acc = vis_faligndata(t0, acc);
1332    *dp++ = acc;
1333  }
1334}
1335
1336/***************************************************************/
1337void mlib_v_ImageLookUpSI_U8_S16_4_DstOff1_D1_SMALL(const mlib_u8  *src,
1338                                                    mlib_s16       *dst,
1339                                                    mlib_s32       xsize,
1340                                                    const mlib_s16 **table)
1341{
1342  mlib_u8 *sp;                         /* pointer to source data */
1343  mlib_u32 s0, s1;                     /* source data */
1344  mlib_s16 *dl;                        /* pointer to start of destination */
1345  mlib_d64 *dp;                        /* aligned pointer to destination */
1346  mlib_d64 t0, t1, t2, t3;             /* destination data */
1347  mlib_d64 acc;                        /* destination data */
1348  mlib_s32 i;                          /* loop variable */
1349  const mlib_s16 *tab0 = table[0];
1350  const mlib_s16 *tab1 = table[1];
1351  const mlib_s16 *tab2 = table[2];
1352  const mlib_s16 *tab3 = table[3];
1353
1354  sp = (void *)src;
1355  dl = dst;
1356  dp = (mlib_d64 *) dl;
1357
1358  vis_alignaddr((void *)0, 6);
1359
1360  s0 = (*sp++) << 1;
1361
1362  if (xsize >= 1) {
1363
1364    s1 = (*sp++) << 1;
1365
1366#pragma pipeloop(0)
1367    for (i = 0; i <= xsize - 2; i++) {
1368      t3 = VIS_LD_U16_I(tab0, s1);
1369      t2 = VIS_LD_U16_I(tab3, s0);
1370      t1 = VIS_LD_U16_I(tab2, s0);
1371      t0 = VIS_LD_U16_I(tab1, s0);
1372      acc = vis_faligndata(t3, acc);
1373      acc = vis_faligndata(t2, acc);
1374      acc = vis_faligndata(t1, acc);
1375      acc = vis_faligndata(t0, acc);
1376      s0 = s1;
1377      s1 = (*sp++) << 1;
1378      *dp++ = acc;
1379    }
1380
1381    t3 = VIS_LD_U16_I(tab0, s1);
1382    t2 = VIS_LD_U16_I(tab3, s0);
1383    t1 = VIS_LD_U16_I(tab2, s0);
1384    t0 = VIS_LD_U16_I(tab1, s0);
1385    acc = vis_faligndata(t3, acc);
1386    acc = vis_faligndata(t2, acc);
1387    acc = vis_faligndata(t1, acc);
1388    acc = vis_faligndata(t0, acc);
1389    s0 = s1;
1390    *dp++ = acc;
1391  }
1392
1393  dl = (mlib_s16 *) dp;
1394  s0 >>= 1;
1395
1396  dl[0] = tab1[s0];
1397  dl[1] = tab2[s0];
1398  dl[2] = tab3[s0];
1399}
1400
1401/***************************************************************/
1402void mlib_v_ImageLookUpSI_U8_S16_4_DstOff2_D1_SMALL(const mlib_u8  *src,
1403                                                    mlib_s16       *dst,
1404                                                    mlib_s32       xsize,
1405                                                    const mlib_s16 **table)
1406{
1407  mlib_u8 *sp;                         /* pointer to source data */
1408  mlib_u32 s0, s1;                     /* source data */
1409  mlib_s16 *dl;                        /* pointer to start of destination */
1410  mlib_d64 *dp;                        /* aligned pointer to destination */
1411  mlib_d64 t0, t1, t2, t3;             /* destination data */
1412  mlib_d64 acc;                        /* destination data */
1413  mlib_s32 i;                          /* loop variable */
1414  const mlib_s16 *tab0 = table[0];
1415  const mlib_s16 *tab1 = table[1];
1416  const mlib_s16 *tab2 = table[2];
1417  const mlib_s16 *tab3 = table[3];
1418
1419  sp = (void *)src;
1420  dl = dst;
1421  dp = (mlib_d64 *) dl;
1422
1423  vis_alignaddr((void *)0, 6);
1424
1425  s0 = (*sp++) << 1;
1426
1427  if (xsize >= 1) {
1428
1429    s1 = (*sp++) << 1;
1430
1431#pragma pipeloop(0)
1432    for (i = 0; i <= xsize - 2; i++) {
1433      t3 = VIS_LD_U16_I(tab1, s1);
1434      t2 = VIS_LD_U16_I(tab0, s1);
1435      t1 = VIS_LD_U16_I(tab3, s0);
1436      t0 = VIS_LD_U16_I(tab2, s0);
1437      acc = vis_faligndata(t3, acc);
1438      acc = vis_faligndata(t2, acc);
1439      acc = vis_faligndata(t1, acc);
1440      acc = vis_faligndata(t0, acc);
1441      s0 = s1;
1442      s1 = (*sp++) << 1;
1443      *dp++ = acc;
1444    }
1445
1446    t3 = VIS_LD_U16_I(tab1, s1);
1447    t2 = VIS_LD_U16_I(tab0, s1);
1448    t1 = VIS_LD_U16_I(tab3, s0);
1449    t0 = VIS_LD_U16_I(tab2, s0);
1450    acc = vis_faligndata(t3, acc);
1451    acc = vis_faligndata(t2, acc);
1452    acc = vis_faligndata(t1, acc);
1453    acc = vis_faligndata(t0, acc);
1454    s0 = s1;
1455    *dp++ = acc;
1456  }
1457
1458  dl = (mlib_s16 *) dp;
1459  s0 >>= 1;
1460
1461  dl[0] = tab2[s0];
1462  dl[1] = tab3[s0];
1463}
1464
1465/***************************************************************/
1466void mlib_v_ImageLookUpSI_U8_S16_4_DstOff3_D1_SMALL(const mlib_u8  *src,
1467                                                    mlib_s16       *dst,
1468                                                    mlib_s32       xsize,
1469                                                    const mlib_s16 **table)
1470{
1471  mlib_u8 *sp;                         /* pointer to source data */
1472  mlib_u32 s0, s1;                     /* source data */
1473  mlib_s16 *dl;                        /* pointer to start of destination */
1474  mlib_d64 *dp;                        /* aligned pointer to destination */
1475  mlib_d64 t0, t1, t2, t3;             /* destination data */
1476  mlib_d64 acc;                        /* destination data */
1477  mlib_s32 i;                          /* loop variable */
1478  const mlib_s16 *tab0 = table[0];
1479  const mlib_s16 *tab1 = table[1];
1480  const mlib_s16 *tab2 = table[2];
1481  const mlib_s16 *tab3 = table[3];
1482
1483  sp = (void *)src;
1484  dl = dst;
1485  dp = (mlib_d64 *) dl;
1486
1487  vis_alignaddr((void *)0, 6);
1488
1489  s0 = (*sp++) << 1;
1490
1491  if (xsize >= 1) {
1492
1493    s1 = (*sp++) << 1;
1494
1495#pragma pipeloop(0)
1496    for (i = 0; i <= xsize - 2; i++) {
1497      t3 = VIS_LD_U16_I(tab2, s1);
1498      t2 = VIS_LD_U16_I(tab1, s1);
1499      t1 = VIS_LD_U16_I(tab0, s1);
1500      t0 = VIS_LD_U16_I(tab3, s0);
1501      acc = vis_faligndata(t3, acc);
1502      acc = vis_faligndata(t2, acc);
1503      acc = vis_faligndata(t1, acc);
1504      acc = vis_faligndata(t0, acc);
1505      s0 = s1;
1506      s1 = (*sp++) << 1;
1507      *dp++ = acc;
1508    }
1509
1510    t3 = VIS_LD_U16_I(tab2, s1);
1511    t2 = VIS_LD_U16_I(tab1, s1);
1512    t1 = VIS_LD_U16_I(tab0, s1);
1513    t0 = VIS_LD_U16_I(tab3, s0);
1514    acc = vis_faligndata(t3, acc);
1515    acc = vis_faligndata(t2, acc);
1516    acc = vis_faligndata(t1, acc);
1517    acc = vis_faligndata(t0, acc);
1518    s0 = s1;
1519    *dp++ = acc;
1520  }
1521
1522  dl = (mlib_s16 *) dp;
1523  s0 >>= 1;
1524
1525  dl[0] = tab3[s0];
1526}
1527
1528/***************************************************************/
1529void mlib_v_ImageLookUpSI_U8_S16_4(const mlib_u8  *src,
1530                                   mlib_s32       slb,
1531                                   mlib_s16       *dst,
1532                                   mlib_s32       dlb,
1533                                   mlib_s32       xsize,
1534                                   mlib_s32       ysize,
1535                                   const mlib_s16 **table)
1536{
1537  if ((xsize * ysize) < 550) {
1538    mlib_u8 *sl;
1539    mlib_s16 *dl;
1540    mlib_s32 j;
1541    const mlib_s16 *tab0 = table[0];
1542    const mlib_s16 *tab1 = table[1];
1543    const mlib_s16 *tab2 = table[2];
1544
1545    sl = (void *)src;
1546    dl = dst;
1547
1548    /* row loop */
1549    for (j = 0; j < ysize; j++) {
1550      mlib_u8 *sp = sl;
1551      mlib_s16 *dp = dl;
1552      mlib_s32 off, s0, size = xsize;
1553
1554      if (size > 0) {
1555        off = ((8 - ((mlib_addr) dp & 7)) & 7) >> 1;
1556
1557        if (off == 0) {
1558          mlib_v_ImageLookUpSI_U8_S16_4_DstOff0_D1_SMALL(sp, dp, size, table);
1559        }
1560        else if (off == 1) {
1561          s0 = *sp;
1562          *dp++ = tab0[s0];
1563          size--;
1564          mlib_v_ImageLookUpSI_U8_S16_4_DstOff1_D1_SMALL(sp, dp, size, table);
1565        }
1566        else if (off == 2) {
1567          s0 = *sp;
1568          *dp++ = tab0[s0];
1569          *dp++ = tab1[s0];
1570          size--;
1571          mlib_v_ImageLookUpSI_U8_S16_4_DstOff2_D1_SMALL(sp, dp, size, table);
1572        }
1573        else if (off == 3) {
1574          s0 = *sp;
1575          *dp++ = tab0[s0];
1576          *dp++ = tab1[s0];
1577          *dp++ = tab2[s0];
1578          size--;
1579          mlib_v_ImageLookUpSI_U8_S16_4_DstOff3_D1_SMALL(sp, dp, size, table);
1580        }
1581      }
1582
1583      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1584      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1585    }
1586  }
1587  else {
1588    mlib_u8 *sl;
1589    mlib_s16 *dl;
1590    mlib_u32 tab[512];
1591    mlib_u16 *tab0 = (mlib_u16 *) table[0];
1592    mlib_u16 *tab1 = (mlib_u16 *) table[1];
1593    mlib_u16 *tab2 = (mlib_u16 *) table[2];
1594    mlib_u16 *tab3 = (mlib_u16 *) table[3];
1595    mlib_s32 i, j;
1596    mlib_u32 s0, s1, s2, s3, s4, s5;
1597
1598    s0 = tab0[0];
1599    s1 = tab1[0];
1600    s2 = tab2[0];
1601    s3 = tab3[0];
1602    for (i = 1; i < 256; i++) {
1603      s4 = (s0 << 16) + s1;
1604      s5 = (s2 << 16) + s3;
1605      s0 = tab0[i];
1606      s1 = tab1[i];
1607      s2 = tab2[i];
1608      s3 = tab3[i];
1609      tab[2 * i - 2] = s4;
1610      tab[2 * i - 1] = s5;
1611    }
1612
1613    s4 = (s0 << 16) + s1;
1614    s5 = (s2 << 16) + s3;
1615    tab[510] = s4;
1616    tab[511] = s5;
1617
1618    sl = (void *)src;
1619    dl = dst;
1620
1621    /* row loop */
1622    for (j = 0; j < ysize; j++) {
1623      mlib_u8 *sp = sl;
1624      mlib_s16 *dp = dl;
1625      mlib_s32 off, s0, size = xsize;
1626      mlib_s16 *ptr;
1627
1628      if (((mlib_addr) dp & 7) == 0) {
1629
1630        off = ((4 - (mlib_addr) sp & 3) & 3);
1631        off = (off < size) ? off : size;
1632
1633#pragma pipeloop(0)
1634        for (i = 0; i < off; i++) {
1635          s0 = (*sp++);
1636          *(mlib_u32 *) dp = tab[2 * s0];
1637          *(mlib_u32 *) (dp + 2) = tab[2 * s0 + 1];
1638          dp += 4;
1639        }
1640
1641        size -= off;
1642
1643        if (size > 0) {
1644          mlib_v_ImageLookUpSI_U8_S16_4_DstA8D1_D1(sp, dp, size,
1645                                                   (mlib_d64 *) tab);
1646        }
1647      }
1648      else {
1649
1650        off = (3 - ((mlib_addr) sp & 3));
1651        off = (off < size) ? off : size;
1652
1653        for (i = 0; i < off; i++) {
1654          ptr = (mlib_s16 *) (tab + 2 * sp[i]);
1655          dp[0] = ptr[0];
1656          dp[1] = ptr[1];
1657          dp[2] = ptr[2];
1658          dp[3] = ptr[3];
1659          dp += 4;
1660        }
1661
1662        sp += off;
1663        size -= off;
1664
1665        if (size > 0) {
1666          mlib_v_ImageLookUpSI_U8_S16_4_DstNonAl_D1(sp, dp, size,
1667                                                    (mlib_d64 *) tab);
1668        }
1669      }
1670
1671      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
1672      dl = (mlib_s16 *) ((mlib_u8 *) dl + dlb);
1673    }
1674  }
1675}
1676
1677/***************************************************************/
1678