1/*
2 * Copyright (c) 2000, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
/*
 * FUNCTION
 *      Internal functions for mlib_ImageConv* on images of U8 type,
 *      used with the MLIB_EDGE_DST_NO_WRITE edge mask.
 */
33
34#include "vis_proto.h"
35#include "mlib_image.h"
36#include "mlib_ImageCheck.h"
37#include "mlib_ImageCopy.h"
38#include "mlib_ImageConv.h"
39#include "mlib_v_ImageConv.h"
40
41/***************************************************************/
/* Element type of the image data processed in this file (8-bit unsigned). */
#define DTYPE mlib_u8
43
44/***************************************************************/
/*
 * Channel count used in address arithmetic.  Expands to the local variable
 * `nchan`, which DEF_EXTRA_VARS declares inside the function body.
 */
#define NCHAN  nchan
46
47/***************************************************************/
/*
 * Declares the locals shared by the convolution routine.  The expansion
 * refers to `src`, `dst`, `kern` and `scale`, so those names must be in
 * scope at the point of use.  The macro has no trailing semicolon; the
 * caller supplies it.
 *
 *   sl, sp, dl        source line, scratch source and destination line
 *                     pointers
 *   hgt, wid          height and width of `src`
 *   sll, dll          line strides of `src` / `dst`, in DTYPE elements
 *   adr_src, adr_dst  start of the source / destination data
 *   ssize ... emask   sizes and the edge mask, filled in by the function
 *   buff_ind          index of the current slot in the row-buffer ring,
 *                     starts at 0
 *   pbuff, dp         heap scratch area and the current output pointer
 *   karr              `kern` viewed as an array of mlib_f32 (the form
 *                     passed to vis_fmul8x16au)
 *   gsr_scale         (31 - scale) << 3; the function adds a 0..7 align
 *                     offset in the low three bits before writing GSR
 *   drnd              mlib_round_8[31 - scale] duplicated into both halves
 *                     of a double; used as the initial accumulator value
 *   i, j, l           loop counters
 *
 * NOTE(review): mlib_round_8 has 16 entries and is indexed with
 * 31 - scale without a range check, so this assumes 16 <= scale <= 31.
 */
#define DEF_VARS                                                \
  DTYPE    *sl, *sp, *dl;                                       \
  mlib_s32 hgt = mlib_ImageGetHeight(src);                      \
  mlib_s32 wid = mlib_ImageGetWidth(src);                       \
  mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(DTYPE);      \
  mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(DTYPE);      \
  DTYPE    *adr_src = (DTYPE *)mlib_ImageGetData(src);          \
  DTYPE    *adr_dst = (DTYPE *)mlib_ImageGetData(dst);          \
  mlib_s32 ssize, xsize, dsize, esize, emask, buff_ind = 0;     \
  mlib_d64 *pbuff, *dp;                                         \
  mlib_f32 *karr = (mlib_f32 *)kern;                            \
  mlib_s32 gsr_scale = (31 - scale) << 3;                       \
  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);  \
  mlib_s32 i, j, l
62
63/***************************************************************/
/*
 * Declares `nchan`, the channel count of `dst`, which the NCHAN macro
 * expands to.  No trailing semicolon; the caller supplies it.
 */
#define DEF_EXTRA_VARS                                          \
  mlib_s32 nchan = mlib_ImageGetChannels(dst)
66
67/***************************************************************/
68static const mlib_s32 mlib_round_8[16] = {
69  0x00400040, 0x00200020, 0x00100010, 0x00080008,
70  0x00040004, 0x00020002, 0x00010001, 0x00000000,
71  0x00000000, 0x00000000, 0x00000000, 0x00000000,
72  0x00000000, 0x00000000, 0x00000000, 0x00000000
73};
74
75/***************************************************************/
/*
 * Largest kernel height for which mlib_convMxN_8nw_f keeps its table of
 * row-buffer pointers on the stack (3 * (MAX_N + 1) entries).  Taller
 * kernels make the function allocate that table from the heap instead.
 */
#define MAX_N   11
77
78mlib_status mlib_convMxN_8nw_f(mlib_image       *dst,
79                               const mlib_image *src,
80                               mlib_s32         m,
81                               mlib_s32         n,
82                               mlib_s32         dm,
83                               mlib_s32         dn,
84                               const mlib_s32   *kern,
85                               mlib_s32         scale)
86{
87  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
88  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
89  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
90  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
91  mlib_d64 dd, d0, d1;
92  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
93  DEF_VARS;
94  DEF_EXTRA_VARS;
95
96  if (n > MAX_N) {
97    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));
98
99    if (buffs == NULL)
100      return MLIB_FAILURE;
101  }
102
103  buff = buffs + 2 * (n + 1);
104
105  sl = adr_src;
106  dl = adr_dst + dn * dll + dm * NCHAN;
107
108  ssize = NCHAN * wid;
109  dsize = (ssize + 7) / 8;
110  esize = dsize + 4;
111  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));
112
113  if (pbuff == NULL) {
114    if (buffs != buffs_local)
115      mlib_free(buffs);
116    return MLIB_FAILURE;
117  }
118
119  for (i = 0; i < (n + 1); i++)
120    buffs[i] = pbuff + i * esize;
121  for (i = 0; i < (n + 1); i++)
122    buffs[(n + 1) + i] = buffs[i];
123  buffd = buffs[n] + esize;
124  buffe = buffd + 2 * esize;
125
126  wid -= (m - 1);
127  hgt -= (n - 1);
128  xsize = ssize - NCHAN * (m - 1);
129  emask = (0xFF00 >> (xsize & 7)) & 0xFF;
130
131  vis_write_gsr(gsr_scale + 7);
132
133  for (l = 0; l < n; l++) {
134    mlib_d64 *buffn = buffs[l];
135    sp = sl + l * sll;
136
137    if ((mlib_addr) sp & 7)
138      mlib_ImageCopy_na((void *)sp, (void *)buffn, ssize);
139  }
140
141  /* init buffer */
142#pragma pipeloop(0)
143  for (i = 0; i < (xsize + 7) / 8; i++) {
144    buffd[2 * i] = drnd;
145    buffd[2 * i + 1] = drnd;
146  }
147
148  for (j = 0; j < hgt; j++) {
149    mlib_d64 **buffc = buffs + buff_ind;
150    mlib_f32 *pk = karr, k0, k1, k2, k3;
151    sp = sl + n * sll;
152
153    for (l = 0; l < n; l++) {
154      buff[l] = buffc[l];
155    }
156
157    buffn = buffc[n];
158
159    for (l = 0; l < n; l++) {
160      if ((((mlib_addr) (sl + l * sll)) & 7) == 0)
161        buff[l] = (mlib_d64 *) (sl + l * sll);
162    }
163
164    if ((mlib_addr) sp & 7)
165      mlib_ImageCopy_na((void *)sp, (void *)buffn, ssize);
166
167    ik_last = (m - 1);
168
169    for (jk = 0; jk < n; jk += jk_size) {
170      jk_size = n - jk;
171
172      if (jk_size >= 6)
173        jk_size = 4;
174
175      if (jk_size == 5)
176        jk_size = 3;
177
178      coff = 0;
179
180      if (jk_size == 1) {
181
182        for (ik = 0; ik < m; ik++, coff += NCHAN) {
183          if (!jk && ik == ik_last)
184            continue;
185
186          k0 = pk[ik];
187
188          doff = coff / 8;
189          buff0 = buff[jk] + doff;
190
191          off = coff & 7;
192          vis_write_gsr(gsr_scale + off);
193
194          s01 = buff0[0];
195#pragma pipeloop(0)
196          for (i = 0; i < (xsize + 7) / 8; i++) {
197            s00 = s01;
198            s01 = buff0[i + 1];
199            s0 = vis_faligndata(s00, s01);
200
201            d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
202            d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
203
204            d0 = buffd[2 * i];
205            d1 = buffd[2 * i + 1];
206            d0 = vis_fpadd16(d00, d0);
207            d1 = vis_fpadd16(d01, d1);
208            buffd[2 * i] = d0;
209            buffd[2 * i + 1] = d1;
210          }
211        }
212
213        pk += m;
214      }
215      else if (jk_size == 2) {
216
217        for (ik = 0; ik < m; ik++, coff += NCHAN) {
218          if (!jk && ik == ik_last)
219            continue;
220
221          k0 = pk[ik];
222          k1 = pk[ik + m];
223
224          doff = coff / 8;
225          buff0 = buff[jk] + doff;
226          buff1 = buff[jk + 1] + doff;
227
228          off = coff & 7;
229          vis_write_gsr(gsr_scale + off);
230
231          s01 = buff0[0];
232          s11 = buff1[0];
233#pragma pipeloop(0)
234          for (i = 0; i < (xsize + 7) / 8; i++) {
235            s00 = s01;
236            s10 = s11;
237            s01 = buff0[i + 1];
238            s11 = buff1[i + 1];
239            s0 = vis_faligndata(s00, s01);
240            s1 = vis_faligndata(s10, s11);
241
242            d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
243            d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
244            d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
245            d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
246
247            d0 = buffd[2 * i];
248            d1 = buffd[2 * i + 1];
249            d0 = vis_fpadd16(d00, d0);
250            d0 = vis_fpadd16(d10, d0);
251            d1 = vis_fpadd16(d01, d1);
252            d1 = vis_fpadd16(d11, d1);
253            buffd[2 * i] = d0;
254            buffd[2 * i + 1] = d1;
255          }
256        }
257
258        pk += 2 * m;
259      }
260      else if (jk_size == 3) {
261
262        for (ik = 0; ik < m; ik++, coff += NCHAN) {
263          if (!jk && ik == ik_last)
264            continue;
265
266          k0 = pk[ik];
267          k1 = pk[ik + m];
268          k2 = pk[ik + 2 * m];
269
270          doff = coff / 8;
271          buff0 = buff[jk] + doff;
272          buff1 = buff[jk + 1] + doff;
273          buff2 = buff[jk + 2] + doff;
274
275          off = coff & 7;
276          vis_write_gsr(gsr_scale + off);
277
278          if (off == 0) {
279#pragma pipeloop(0)
280            for (i = 0; i < (xsize + 7) / 8; i++) {
281              d0 = buffd[2 * i];
282              d1 = buffd[2 * i + 1];
283
284              s0 = buff0[i];
285              s1 = buff1[i];
286              s2 = buff2[i];
287
288              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
289              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
290              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
291              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
292              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
293              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
294
295              d00 = vis_fpadd16(d00, d10);
296              d0 = vis_fpadd16(d20, d0);
297              d0 = vis_fpadd16(d00, d0);
298              d01 = vis_fpadd16(d01, d11);
299              d1 = vis_fpadd16(d21, d1);
300              d1 = vis_fpadd16(d01, d1);
301              buffd[2 * i] = d0;
302              buffd[2 * i + 1] = d1;
303            }
304          }
305          else if (off == 4) {
306            s01 = buff0[0];
307            s11 = buff1[0];
308            s21 = buff2[0];
309#pragma pipeloop(0)
310            for (i = 0; i < (xsize + 7) / 8; i++) {
311              d0 = buffd[2 * i];
312              d1 = buffd[2 * i + 1];
313
314              s00 = s01;
315              s10 = s11;
316              s20 = s21;
317              s01 = buff0[i + 1];
318              s11 = buff1[i + 1];
319              s21 = buff2[i + 1];
320
321              d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
322              d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
323              d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
324              d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
325              d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
326              d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
327
328              d00 = vis_fpadd16(d00, d10);
329              d0 = vis_fpadd16(d20, d0);
330              d0 = vis_fpadd16(d00, d0);
331              d01 = vis_fpadd16(d01, d11);
332              d1 = vis_fpadd16(d21, d1);
333              d1 = vis_fpadd16(d01, d1);
334              buffd[2 * i] = d0;
335              buffd[2 * i + 1] = d1;
336            }
337          }
338          else {
339            s01 = buff0[0];
340            s11 = buff1[0];
341            s21 = buff2[0];
342#pragma pipeloop(0)
343            for (i = 0; i < (xsize + 7) / 8; i++) {
344              d0 = buffd[2 * i];
345              d1 = buffd[2 * i + 1];
346
347              s00 = s01;
348              s10 = s11;
349              s20 = s21;
350              s01 = buff0[i + 1];
351              s11 = buff1[i + 1];
352              s21 = buff2[i + 1];
353              s0 = vis_faligndata(s00, s01);
354              s1 = vis_faligndata(s10, s11);
355              s2 = vis_faligndata(s20, s21);
356
357              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
358              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
359              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
360              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
361              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
362              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
363
364              d00 = vis_fpadd16(d00, d10);
365              d0 = vis_fpadd16(d20, d0);
366              d0 = vis_fpadd16(d00, d0);
367              d01 = vis_fpadd16(d01, d11);
368              d1 = vis_fpadd16(d21, d1);
369              d1 = vis_fpadd16(d01, d1);
370              buffd[2 * i] = d0;
371              buffd[2 * i + 1] = d1;
372            }
373          }
374        }
375
376        pk += 3 * m;
377      }
378      else {                                /* jk_size == 4 */
379
380        for (ik = 0; ik < m; ik++, coff += NCHAN) {
381          if (!jk && ik == ik_last)
382            continue;
383
384          k0 = pk[ik];
385          k1 = pk[ik + m];
386          k2 = pk[ik + 2 * m];
387          k3 = pk[ik + 3 * m];
388
389          doff = coff / 8;
390          buff0 = buff[jk] + doff;
391          buff1 = buff[jk + 1] + doff;
392          buff2 = buff[jk + 2] + doff;
393          buff3 = buff[jk + 3] + doff;
394
395          off = coff & 7;
396          vis_write_gsr(gsr_scale + off);
397
398          if (off == 0) {
399
400#pragma pipeloop(0)
401            for (i = 0; i < (xsize + 7) / 8; i++) {
402              d0 = buffd[2 * i];
403              d1 = buffd[2 * i + 1];
404
405              s0 = buff0[i];
406              s1 = buff1[i];
407              s2 = buff2[i];
408              s3 = buff3[i];
409
410              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
411              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
412              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
413              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
414              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
415              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
416              d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
417              d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
418
419              d00 = vis_fpadd16(d00, d10);
420              d20 = vis_fpadd16(d20, d30);
421              d0 = vis_fpadd16(d0, d00);
422              d0 = vis_fpadd16(d0, d20);
423              d01 = vis_fpadd16(d01, d11);
424              d21 = vis_fpadd16(d21, d31);
425              d1 = vis_fpadd16(d1, d01);
426              d1 = vis_fpadd16(d1, d21);
427              buffd[2 * i] = d0;
428              buffd[2 * i + 1] = d1;
429            }
430          }
431          else if (off == 4) {
432
433            s01 = buff0[0];
434            s11 = buff1[0];
435            s21 = buff2[0];
436            s31 = buff3[0];
437#pragma pipeloop(0)
438            for (i = 0; i < (xsize + 7) / 8; i++) {
439              d0 = buffd[2 * i];
440              d1 = buffd[2 * i + 1];
441
442              s00 = s01;
443              s10 = s11;
444              s20 = s21;
445              s30 = s31;
446              s01 = buff0[i + 1];
447              s11 = buff1[i + 1];
448              s21 = buff2[i + 1];
449              s31 = buff3[i + 1];
450
451              d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
452              d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
453              d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
454              d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
455              d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
456              d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
457              d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
458              d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
459
460              d00 = vis_fpadd16(d00, d10);
461              d20 = vis_fpadd16(d20, d30);
462              d0 = vis_fpadd16(d0, d00);
463              d0 = vis_fpadd16(d0, d20);
464              d01 = vis_fpadd16(d01, d11);
465              d21 = vis_fpadd16(d21, d31);
466              d1 = vis_fpadd16(d1, d01);
467              d1 = vis_fpadd16(d1, d21);
468              buffd[2 * i] = d0;
469              buffd[2 * i + 1] = d1;
470            }
471          }
472          else {
473
474            s01 = buff0[0];
475            s11 = buff1[0];
476            s21 = buff2[0];
477            s31 = buff3[0];
478#pragma pipeloop(0)
479            for (i = 0; i < (xsize + 7) / 8; i++) {
480              d0 = buffd[2 * i];
481              d1 = buffd[2 * i + 1];
482
483              s00 = s01;
484              s10 = s11;
485              s20 = s21;
486              s30 = s31;
487              s01 = buff0[i + 1];
488              s11 = buff1[i + 1];
489              s21 = buff2[i + 1];
490              s31 = buff3[i + 1];
491              s0 = vis_faligndata(s00, s01);
492              s1 = vis_faligndata(s10, s11);
493              s2 = vis_faligndata(s20, s21);
494              s3 = vis_faligndata(s30, s31);
495
496              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
497              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
498              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
499              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
500              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
501              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
502              d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
503              d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
504
505              d00 = vis_fpadd16(d00, d10);
506              d20 = vis_fpadd16(d20, d30);
507              d0 = vis_fpadd16(d0, d00);
508              d0 = vis_fpadd16(d0, d20);
509              d01 = vis_fpadd16(d01, d11);
510              d21 = vis_fpadd16(d21, d31);
511              d1 = vis_fpadd16(d1, d01);
512              d1 = vis_fpadd16(d1, d21);
513              buffd[2 * i] = d0;
514              buffd[2 * i + 1] = d1;
515            }
516          }
517        }
518
519        pk += 4 * m;
520      }
521    }
522
523    /*****************************************
524     *****************************************
525     **          Final iteration            **
526     *****************************************
527     *****************************************/
528
529    jk_size = n;
530
531    if (jk_size >= 6)
532      jk_size = 4;
533
534    if (jk_size == 5)
535      jk_size = 3;
536
537    k0 = karr[ik_last];
538    k1 = karr[ik_last + m];
539    k2 = karr[ik_last + 2 * m];
540    k3 = karr[ik_last + 3 * m];
541
542    off = ik_last * NCHAN;
543    doff = off / 8;
544    off &= 7;
545    buff0 = buff[0] + doff;
546    buff1 = buff[1] + doff;
547    buff2 = buff[2] + doff;
548    buff3 = buff[3] + doff;
549    vis_write_gsr(gsr_scale + off);
550
551    if (jk_size == 1) {
552      dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl;
553
554      s01 = buff0[0];
555#pragma pipeloop(0)
556      for (i = 0; i < xsize / 8; i++) {
557        s00 = s01;
558        s01 = buff0[i + 1];
559        s0 = vis_faligndata(s00, s01);
560
561        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
562        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
563
564        d0 = buffd[2 * i];
565        d1 = buffd[2 * i + 1];
566        d0 = vis_fpadd16(d0, d00);
567        d1 = vis_fpadd16(d1, d01);
568
569        dd = vis_fpack16_pair(d0, d1);
570        dp[i] = dd;
571
572        buffd[2 * i] = drnd;
573        buffd[2 * i + 1] = drnd;
574      }
575
576      if (emask) {
577        s00 = s01;
578        s01 = buff0[i + 1];
579        s0 = vis_faligndata(s00, s01);
580
581        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
582        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
583
584        d0 = buffd[2 * i];
585        d1 = buffd[2 * i + 1];
586        d0 = vis_fpadd16(d0, d00);
587        d1 = vis_fpadd16(d1, d01);
588
589        dd = vis_fpack16_pair(d0, d1);
590        vis_pst_8(dd, dp + i, emask);
591
592        buffd[2 * i] = drnd;
593        buffd[2 * i + 1] = drnd;
594      }
595
596      if ((mlib_u8 *) dp != dl)
597        mlib_ImageCopy_na((void *)buffe, dl, xsize);
598    }
599    else if (jk_size == 2) {
600      dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl;
601
602      s01 = buff0[0];
603      s11 = buff1[0];
604#pragma pipeloop(0)
605      for (i = 0; i < xsize / 8; i++) {
606        s00 = s01;
607        s10 = s11;
608        s01 = buff0[i + 1];
609        s11 = buff1[i + 1];
610        s0 = vis_faligndata(s00, s01);
611        s1 = vis_faligndata(s10, s11);
612
613        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
614        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
615        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
616        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
617
618        d0 = buffd[2 * i];
619        d1 = buffd[2 * i + 1];
620        d0 = vis_fpadd16(d0, d00);
621        d0 = vis_fpadd16(d0, d10);
622        d1 = vis_fpadd16(d1, d01);
623        d1 = vis_fpadd16(d1, d11);
624
625        dd = vis_fpack16_pair(d0, d1);
626        dp[i] = dd;
627
628        buffd[2 * i] = drnd;
629        buffd[2 * i + 1] = drnd;
630      }
631
632      if (emask) {
633        s00 = s01;
634        s10 = s11;
635        s01 = buff0[i + 1];
636        s11 = buff1[i + 1];
637        s0 = vis_faligndata(s00, s01);
638        s1 = vis_faligndata(s10, s11);
639
640        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
641        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
642        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
643        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
644
645        d0 = buffd[2 * i];
646        d1 = buffd[2 * i + 1];
647        d0 = vis_fpadd16(d0, d00);
648        d0 = vis_fpadd16(d0, d10);
649        d1 = vis_fpadd16(d1, d01);
650        d1 = vis_fpadd16(d1, d11);
651
652        dd = vis_fpack16_pair(d0, d1);
653        vis_pst_8(dd, dp + i, emask);
654
655        buffd[2 * i] = drnd;
656        buffd[2 * i + 1] = drnd;
657      }
658
659      if ((mlib_u8 *) dp != dl)
660        mlib_ImageCopy_na((void *)buffe, dl, xsize);
661    }
662    else if (jk_size == 3) {
663
664      dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl;
665
666      s01 = buff0[0];
667      s11 = buff1[0];
668      s21 = buff2[0];
669#pragma pipeloop(0)
670      for (i = 0; i < xsize / 8; i++) {
671        s00 = s01;
672        s10 = s11;
673        s20 = s21;
674        s01 = buff0[i + 1];
675        s11 = buff1[i + 1];
676        s21 = buff2[i + 1];
677        s0 = vis_faligndata(s00, s01);
678        s1 = vis_faligndata(s10, s11);
679        s2 = vis_faligndata(s20, s21);
680
681        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
682        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
683        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
684        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
685        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
686        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
687
688        d0 = buffd[2 * i];
689        d1 = buffd[2 * i + 1];
690        d0 = vis_fpadd16(d0, d00);
691        d0 = vis_fpadd16(d0, d10);
692        d0 = vis_fpadd16(d0, d20);
693        d1 = vis_fpadd16(d1, d01);
694        d1 = vis_fpadd16(d1, d11);
695        d1 = vis_fpadd16(d1, d21);
696
697        dd = vis_fpack16_pair(d0, d1);
698        dp[i] = dd;
699
700        buffd[2 * i] = drnd;
701        buffd[2 * i + 1] = drnd;
702      }
703
704      if (emask) {
705        s00 = s01;
706        s10 = s11;
707        s20 = s21;
708        s01 = buff0[i + 1];
709        s11 = buff1[i + 1];
710        s21 = buff2[i + 1];
711        s0 = vis_faligndata(s00, s01);
712        s1 = vis_faligndata(s10, s11);
713        s2 = vis_faligndata(s20, s21);
714
715        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
716        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
717        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
718        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
719        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
720        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
721
722        d0 = buffd[2 * i];
723        d1 = buffd[2 * i + 1];
724        d0 = vis_fpadd16(d0, d00);
725        d0 = vis_fpadd16(d0, d10);
726        d0 = vis_fpadd16(d0, d20);
727        d1 = vis_fpadd16(d1, d01);
728        d1 = vis_fpadd16(d1, d11);
729        d1 = vis_fpadd16(d1, d21);
730
731        dd = vis_fpack16_pair(d0, d1);
732        vis_pst_8(dd, dp + i, emask);
733
734        buffd[2 * i] = drnd;
735        buffd[2 * i + 1] = drnd;
736      }
737
738      if ((mlib_u8 *) dp != dl)
739        mlib_ImageCopy_na((void *)buffe, dl, xsize);
740    }
741    else {                                  /* if (jk_size == 4) */
742
743      dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl;
744
745      s01 = buff0[0];
746      s11 = buff1[0];
747      s21 = buff2[0];
748      s31 = buff3[0];
749#pragma pipeloop(0)
750      for (i = 0; i < xsize / 8; i++) {
751        s00 = s01;
752        s10 = s11;
753        s20 = s21;
754        s30 = s31;
755        s01 = buff0[i + 1];
756        s11 = buff1[i + 1];
757        s21 = buff2[i + 1];
758        s31 = buff3[i + 1];
759        s0 = vis_faligndata(s00, s01);
760        s1 = vis_faligndata(s10, s11);
761        s2 = vis_faligndata(s20, s21);
762        s3 = vis_faligndata(s30, s31);
763
764        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
765        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
766        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
767        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
768        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
769        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
770        d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
771        d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
772
773        d0 = buffd[2 * i];
774        d1 = buffd[2 * i + 1];
775        d0 = vis_fpadd16(d0, d00);
776        d0 = vis_fpadd16(d0, d10);
777        d0 = vis_fpadd16(d0, d20);
778        d0 = vis_fpadd16(d0, d30);
779        d1 = vis_fpadd16(d1, d01);
780        d1 = vis_fpadd16(d1, d11);
781        d1 = vis_fpadd16(d1, d21);
782        d1 = vis_fpadd16(d1, d31);
783
784        dd = vis_fpack16_pair(d0, d1);
785        dp[i] = dd;
786
787        buffd[2 * i] = drnd;
788        buffd[2 * i + 1] = drnd;
789      }
790
791      if (emask) {
792        s00 = s01;
793        s10 = s11;
794        s20 = s21;
795        s30 = s31;
796        s01 = buff0[i + 1];
797        s11 = buff1[i + 1];
798        s21 = buff2[i + 1];
799        s31 = buff3[i + 1];
800        s0 = vis_faligndata(s00, s01);
801        s1 = vis_faligndata(s10, s11);
802        s2 = vis_faligndata(s20, s21);
803        s3 = vis_faligndata(s30, s31);
804
805        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
806        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
807        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
808        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
809        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
810        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
811        d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
812        d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
813
814        d0 = buffd[2 * i];
815        d1 = buffd[2 * i + 1];
816        d0 = vis_fpadd16(d0, d00);
817        d0 = vis_fpadd16(d0, d10);
818        d0 = vis_fpadd16(d0, d20);
819        d0 = vis_fpadd16(d0, d30);
820        d1 = vis_fpadd16(d1, d01);
821        d1 = vis_fpadd16(d1, d11);
822        d1 = vis_fpadd16(d1, d21);
823        d1 = vis_fpadd16(d1, d31);
824
825        dd = vis_fpack16_pair(d0, d1);
826        vis_pst_8(dd, dp + i, emask);
827
828        buffd[2 * i] = drnd;
829        buffd[2 * i + 1] = drnd;
830      }
831
832      if ((mlib_u8 *) dp != dl)
833        mlib_ImageCopy_na((void *)buffe, dl, xsize);
834    }
835
836    sl += sll;
837    dl += dll;
838
839    buff_ind++;
840
841    if (buff_ind >= (n + 1))
842      buff_ind = 0;
843  }
844
845  mlib_free(pbuff);
846
847  if (buffs != buffs_local)
848    mlib_free(buffs);
849
850  return MLIB_SUCCESS;
851}
852
853/***************************************************************/
854