1/*
2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28/*
29 * FUNCTION
30 *      mlib_convMxNext_u8 - convolve an 8-bit image, MxN kernel,
31 *                           edge = src extended
32 *
33 * SYNOPSIS
34 *      mlib_status mlib_convMxNext_u8(mlib_image       *dst,
35 *                                     const mlib_image *src,
36 *                                     const mlib_s32   *kernel,
37 *                                     mlib_s32         kwid,
38 *                                     mlib_s32         khgt,
39 *                                     mlib_s32         dx_l,
40 *                                     mlib_s32         dx_r,
41 *                                     mlib_s32         dy_t,
42 *                                     mlib_s32         dy_b,
43 *                                     mlib_s32         discardbits,
44 *                                     mlib_s32         cmask)
45 *
46 * ARGUMENT
47 *      src       Ptr to source image structure
48 *      dst       Ptr to destination image structure
49 *      khgt         Kernel height (# of rows)
50 *      kwid         Kernel width (# of cols)
51 *      kernel       Ptr to convolution kernel
52 *      discardbits  The number of LSBits of the 32-bit accumulator that
53 *                   are discarded when the 32-bit accumulator is converted
54 *                   to 16-bit output data; discardbits must be 1-15 (it
55 *                   cannot be zero). Same as exponent N for scalefac=2**N.
56 *      cmask        Channel mask to indicate the channels to be convolved.
57 *                   Each bit of which represents a channel in the image. The
58 *                   channels corresponded to 1 bits are those to be processed.
59 *
60 * DESCRIPTION
61 *      A 2-D convolution (MxN kernel) for 8-bit images.
62 *
63 */
64
65#include "vis_proto.h"
66#include "mlib_image.h"
67#include "mlib_ImageCopy.h"
68#include "mlib_ImageConv.h"
69#include "mlib_c_ImageConv.h"
70#include "mlib_v_ImageChannelExtract.h"
71#include "mlib_v_ImageChannelInsert.h"
72
73/***************************************************************/
74static mlib_status mlib_convMxN_8ext_f(mlib_image       *dst,
75                                       const mlib_image *src,
76                                       mlib_s32         m,
77                                       mlib_s32         n,
78                                       mlib_s32         dx_l,
79                                       mlib_s32         dx_r,
80                                       mlib_s32         dy_t,
81                                       mlib_s32         dy_b,
82                                       const mlib_s32   *kern,
83                                       mlib_s32         scale);
84
85static mlib_status mlib_convMxN_8ext_mask(mlib_image       *dst,
86                                          const mlib_image *src,
87                                          mlib_s32         m,
88                                          mlib_s32         n,
89                                          mlib_s32         dx_l,
90                                          mlib_s32         dx_r,
91                                          mlib_s32         dy_t,
92                                          mlib_s32         dy_b,
93                                          const mlib_s32   *kern,
94                                          mlib_s32         scale,
95                                          mlib_s32         cmask);
96
97/***************************************************************/
98static mlib_s32 mlib_round_8[16] = {
99  0x00400040, 0x00200020, 0x00100010, 0x00080008,
100  0x00040004, 0x00020002, 0x00010001, 0x00000000,
101  0x00000000, 0x00000000, 0x00000000, 0x00000000,
102  0x00000000, 0x00000000, 0x00000000, 0x00000000
103};
104
105/***************************************************************/
106mlib_status mlib_convMxNext_u8(mlib_image       *dst,
107                               const mlib_image *src,
108                               const mlib_s32   *kernel,
109                               mlib_s32         kwid,
110                               mlib_s32         khgt,
111                               mlib_s32         dx_l,
112                               mlib_s32         dx_r,
113                               mlib_s32         dy_t,
114                               mlib_s32         dy_b,
115                               mlib_s32         discardbits,
116                               mlib_s32         cmask)
117{
118  mlib_s32 nchannel, amask;
119
120  if (mlib_ImageConvVersion(kwid, khgt, discardbits, MLIB_BYTE) == 0)
121    return mlib_c_convMxNext_u8(dst, src, kernel, kwid, khgt,
122                                dx_l, dx_r, dy_t, dy_b, discardbits, cmask);
123
124  nchannel = mlib_ImageGetChannels(src);
125
126  if (nchannel == 1)
127    cmask = 1;
128  amask = (1 << nchannel) - 1;
129
130  if ((cmask & amask) == amask) {
131    return mlib_convMxN_8ext_f(dst, src, kwid, khgt, dx_l, dx_r, dy_t, dy_b, kernel,
132                               discardbits);
133  }
134  else {
135    return mlib_convMxN_8ext_mask(dst, src, kwid, khgt, dx_l, dx_r, dy_t, dy_b, kernel,
136                                  discardbits, cmask);
137  }
138}
139
140#define MAX_N   11
141
142/***************************************************************/
143mlib_status mlib_convMxN_8ext_f(mlib_image       *dst,
144                                const mlib_image *src,
145                                mlib_s32         m,
146                                mlib_s32         n,
147                                mlib_s32         dx_l,
148                                mlib_s32         dx_r,
149                                mlib_s32         dy_t,
150                                mlib_s32         dy_b,
151                                const mlib_s32   *kern,
152                                mlib_s32         scale)
153{
154  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
155  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
156  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
157  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
158  mlib_d64 dd, d0, d1;
159  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
160  mlib_u8 *sl, *dl;
161  mlib_s32 hgt = mlib_ImageGetHeight(src);
162  mlib_s32 wid = mlib_ImageGetWidth(src);
163  mlib_s32 sll = mlib_ImageGetStride(src);
164  mlib_s32 dll = mlib_ImageGetStride(dst);
165  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
166  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
167  mlib_s32 ssize, xsize, dsize, esize, buff_ind = 0;
168  mlib_d64 *pbuff, *dp;
169  mlib_f32 *karr = (mlib_f32 *) kern;
170  mlib_s32 gsr_scale = (31 - scale) << 3;
171  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
172  mlib_s32 i, j, l, ii;
173  mlib_s32 nchan = mlib_ImageGetChannels(dst);
174
175  if (n > MAX_N) {
176    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));
177
178    if (buffs == NULL)
179      return MLIB_FAILURE;
180  }
181
182  buff = buffs + 2 * (n + 1);
183
184  sl = adr_src;
185  dl = adr_dst;
186
187  ssize = nchan * (wid + (m - 1));
188  dsize = (ssize + 7) / 8;
189  esize = dsize + 4;
190  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));
191
192  if (pbuff == NULL) {
193    if (buffs != buffs_local)
194      mlib_free(buffs);
195    return MLIB_FAILURE;
196  }
197
198  for (i = 0; i < (n + 1); i++)
199    buffs[i] = pbuff + i * esize;
200  for (i = 0; i < (n + 1); i++)
201    buffs[(n + 1) + i] = buffs[i];
202  buffd = buffs[n] + esize;
203  buffe = buffd + 2 * esize;
204
205  xsize = ssize - nchan * (m - 1);
206  ssize -= nchan * (dx_l + dx_r);
207
208  vis_write_gsr(gsr_scale + 7);
209
210  for (l = 0; l < n; l++) {
211    mlib_d64 *buffn = buffs[l];
212
213    mlib_ImageCopy_na((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l * nchan, ssize);
214
215    for (i = 0; i < nchan; i++) {
216      for (ii = 0; ii < dx_l; ii++) {
217        *((mlib_u8 *) buffn + i + nchan * ii) = *((mlib_u8 *) buffn + i + nchan * dx_l);
218      }
219    }
220
221    for (i = 0; i < nchan; i++) {
222      for (ii = 0; ii < dx_r; ii++) {
223        *((mlib_u8 *) buffn + i + nchan * ii + ssize + dx_l * nchan) =
224          *((mlib_u8 *) buffn + i + nchan * (dx_l - 1) + ssize);
225      }
226    }
227
228    if ((l >= dy_t) && (l < hgt + n - dy_b - 2))
229      sl += sll;
230  }
231
232  /* init buffer */
233#pragma pipeloop(0)
234  for (i = 0; i < (xsize + 7) / 8; i++) {
235    buffd[2 * i] = drnd;
236    buffd[2 * i + 1] = drnd;
237  }
238
239  for (j = 0; j < hgt; j++) {
240    mlib_d64 **buffc = buffs + buff_ind;
241    mlib_f32 *pk = karr, k0, k1, k2, k3;
242
243    for (l = 0; l < n; l++) {
244      buff[l] = buffc[l];
245    }
246
247    buffn = buffc[n];
248
249    mlib_ImageCopy_na((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l * nchan, ssize);
250
251    for (i = 0; i < nchan; i++) {
252      for (ii = 0; ii < dx_l; ii++) {
253        *((mlib_u8 *) buffn + i + nchan * ii) = *((mlib_u8 *) buffn + i + nchan * dx_l);
254      }
255    }
256
257    for (i = 0; i < nchan; i++) {
258      for (ii = 0; ii < dx_r; ii++) {
259        *((mlib_u8 *) buffn + i + nchan * ii + ssize + dx_l * nchan) =
260          *((mlib_u8 *) buffn + i + nchan * (dx_l - 1) + ssize);
261      }
262    }
263
264    ik_last = (m - 1);
265
266    for (jk = 0; jk < n; jk += jk_size) {
267      jk_size = n - jk;
268
269      if (jk_size >= 6)
270        jk_size = 4;
271      if (jk_size == 5)
272        jk_size = 3;
273
274      coff = 0;
275
276      if (jk_size == 1) {
277
278        for (ik = 0; ik < m; ik++, coff += nchan) {
279          if (!jk && ik == ik_last)
280            continue;
281
282          k0 = pk[ik];
283
284          doff = coff / 8;
285          buff0 = buff[jk] + doff;
286
287          off = coff & 7;
288          vis_write_gsr(gsr_scale + off);
289
290          s01 = buff0[0];
291#pragma pipeloop(0)
292          for (i = 0; i < (xsize + 7) / 8; i++) {
293            s00 = s01;
294            s01 = buff0[i + 1];
295            s0 = vis_faligndata(s00, s01);
296
297            d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
298            d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
299
300            d0 = buffd[2 * i];
301            d1 = buffd[2 * i + 1];
302            d0 = vis_fpadd16(d00, d0);
303            d1 = vis_fpadd16(d01, d1);
304            buffd[2 * i] = d0;
305            buffd[2 * i + 1] = d1;
306          }
307        }
308
309        pk += m;
310
311      }
312      else if (jk_size == 2) {
313
314        for (ik = 0; ik < m; ik++, coff += nchan) {
315          if (!jk && ik == ik_last)
316            continue;
317
318          k0 = pk[ik];
319          k1 = pk[ik + m];
320
321          doff = coff / 8;
322          buff0 = buff[jk] + doff;
323          buff1 = buff[jk + 1] + doff;
324
325          off = coff & 7;
326          vis_write_gsr(gsr_scale + off);
327
328          s01 = buff0[0];
329          s11 = buff1[0];
330#pragma pipeloop(0)
331          for (i = 0; i < (xsize + 7) / 8; i++) {
332            s00 = s01;
333            s10 = s11;
334            s01 = buff0[i + 1];
335            s11 = buff1[i + 1];
336            s0 = vis_faligndata(s00, s01);
337            s1 = vis_faligndata(s10, s11);
338
339            d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
340            d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
341            d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
342            d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
343
344            d0 = buffd[2 * i];
345            d1 = buffd[2 * i + 1];
346            d0 = vis_fpadd16(d00, d0);
347            d0 = vis_fpadd16(d10, d0);
348            d1 = vis_fpadd16(d01, d1);
349            d1 = vis_fpadd16(d11, d1);
350            buffd[2 * i] = d0;
351            buffd[2 * i + 1] = d1;
352          }
353        }
354
355        pk += 2 * m;
356
357      }
358      else if (jk_size == 3) {
359
360        for (ik = 0; ik < m; ik++, coff += nchan) {
361          if (!jk && ik == ik_last)
362            continue;
363
364          k0 = pk[ik];
365          k1 = pk[ik + m];
366          k2 = pk[ik + 2 * m];
367
368          doff = coff / 8;
369          buff0 = buff[jk] + doff;
370          buff1 = buff[jk + 1] + doff;
371          buff2 = buff[jk + 2] + doff;
372
373          off = coff & 7;
374          vis_write_gsr(gsr_scale + off);
375
376          if (off == 0) {
377#pragma pipeloop(0)
378            for (i = 0; i < (xsize + 7) / 8; i++) {
379              d0 = buffd[2 * i];
380              d1 = buffd[2 * i + 1];
381
382              s0 = buff0[i];
383              s1 = buff1[i];
384              s2 = buff2[i];
385
386              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
387              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
388              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
389              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
390              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
391              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
392
393              d00 = vis_fpadd16(d00, d10);
394              d0 = vis_fpadd16(d20, d0);
395              d0 = vis_fpadd16(d00, d0);
396              d01 = vis_fpadd16(d01, d11);
397              d1 = vis_fpadd16(d21, d1);
398              d1 = vis_fpadd16(d01, d1);
399              buffd[2 * i] = d0;
400              buffd[2 * i + 1] = d1;
401            }
402
403          }
404          else if (off == 4) {
405            s01 = buff0[0];
406            s11 = buff1[0];
407            s21 = buff2[0];
408#pragma pipeloop(0)
409            for (i = 0; i < (xsize + 7) / 8; i++) {
410              d0 = buffd[2 * i];
411              d1 = buffd[2 * i + 1];
412
413              s00 = s01;
414              s10 = s11;
415              s20 = s21;
416              s01 = buff0[i + 1];
417              s11 = buff1[i + 1];
418              s21 = buff2[i + 1];
419
420              d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
421              d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
422              d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
423              d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
424              d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
425              d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
426
427              d00 = vis_fpadd16(d00, d10);
428              d0 = vis_fpadd16(d20, d0);
429              d0 = vis_fpadd16(d00, d0);
430              d01 = vis_fpadd16(d01, d11);
431              d1 = vis_fpadd16(d21, d1);
432              d1 = vis_fpadd16(d01, d1);
433              buffd[2 * i] = d0;
434              buffd[2 * i + 1] = d1;
435            }
436
437          }
438          else {
439            s01 = buff0[0];
440            s11 = buff1[0];
441            s21 = buff2[0];
442#pragma pipeloop(0)
443            for (i = 0; i < (xsize + 7) / 8; i++) {
444              d0 = buffd[2 * i];
445              d1 = buffd[2 * i + 1];
446
447              s00 = s01;
448              s10 = s11;
449              s20 = s21;
450              s01 = buff0[i + 1];
451              s11 = buff1[i + 1];
452              s21 = buff2[i + 1];
453              s0 = vis_faligndata(s00, s01);
454              s1 = vis_faligndata(s10, s11);
455              s2 = vis_faligndata(s20, s21);
456
457              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
458              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
459              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
460              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
461              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
462              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
463
464              d00 = vis_fpadd16(d00, d10);
465              d0 = vis_fpadd16(d20, d0);
466              d0 = vis_fpadd16(d00, d0);
467              d01 = vis_fpadd16(d01, d11);
468              d1 = vis_fpadd16(d21, d1);
469              d1 = vis_fpadd16(d01, d1);
470              buffd[2 * i] = d0;
471              buffd[2 * i + 1] = d1;
472            }
473          }
474        }
475
476        pk += 3 * m;
477
478      }
479      else {                                /* jk_size == 4 */
480
481        for (ik = 0; ik < m; ik++, coff += nchan) {
482          if (!jk && ik == ik_last)
483            continue;
484
485          k0 = pk[ik];
486          k1 = pk[ik + m];
487          k2 = pk[ik + 2 * m];
488          k3 = pk[ik + 3 * m];
489
490          doff = coff / 8;
491          buff0 = buff[jk] + doff;
492          buff1 = buff[jk + 1] + doff;
493          buff2 = buff[jk + 2] + doff;
494          buff3 = buff[jk + 3] + doff;
495
496          off = coff & 7;
497          vis_write_gsr(gsr_scale + off);
498
499          if (off == 0) {
500
501#pragma pipeloop(0)
502            for (i = 0; i < (xsize + 7) / 8; i++) {
503              d0 = buffd[2 * i];
504              d1 = buffd[2 * i + 1];
505
506              s0 = buff0[i];
507              s1 = buff1[i];
508              s2 = buff2[i];
509              s3 = buff3[i];
510
511              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
512              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
513              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
514              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
515              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
516              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
517              d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
518              d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
519
520              d00 = vis_fpadd16(d00, d10);
521              d20 = vis_fpadd16(d20, d30);
522              d0 = vis_fpadd16(d0, d00);
523              d0 = vis_fpadd16(d0, d20);
524              d01 = vis_fpadd16(d01, d11);
525              d21 = vis_fpadd16(d21, d31);
526              d1 = vis_fpadd16(d1, d01);
527              d1 = vis_fpadd16(d1, d21);
528              buffd[2 * i] = d0;
529              buffd[2 * i + 1] = d1;
530            }
531
532          }
533          else if (off == 4) {
534
535            s01 = buff0[0];
536            s11 = buff1[0];
537            s21 = buff2[0];
538            s31 = buff3[0];
539#pragma pipeloop(0)
540            for (i = 0; i < (xsize + 7) / 8; i++) {
541              d0 = buffd[2 * i];
542              d1 = buffd[2 * i + 1];
543
544              s00 = s01;
545              s10 = s11;
546              s20 = s21;
547              s30 = s31;
548              s01 = buff0[i + 1];
549              s11 = buff1[i + 1];
550              s21 = buff2[i + 1];
551              s31 = buff3[i + 1];
552
553              d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
554              d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
555              d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
556              d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
557              d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
558              d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
559              d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
560              d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
561
562              d00 = vis_fpadd16(d00, d10);
563              d20 = vis_fpadd16(d20, d30);
564              d0 = vis_fpadd16(d0, d00);
565              d0 = vis_fpadd16(d0, d20);
566              d01 = vis_fpadd16(d01, d11);
567              d21 = vis_fpadd16(d21, d31);
568              d1 = vis_fpadd16(d1, d01);
569              d1 = vis_fpadd16(d1, d21);
570              buffd[2 * i] = d0;
571              buffd[2 * i + 1] = d1;
572            }
573
574          }
575          else {
576
577            s01 = buff0[0];
578            s11 = buff1[0];
579            s21 = buff2[0];
580            s31 = buff3[0];
581#pragma pipeloop(0)
582            for (i = 0; i < (xsize + 7) / 8; i++) {
583              d0 = buffd[2 * i];
584              d1 = buffd[2 * i + 1];
585
586              s00 = s01;
587              s10 = s11;
588              s20 = s21;
589              s30 = s31;
590              s01 = buff0[i + 1];
591              s11 = buff1[i + 1];
592              s21 = buff2[i + 1];
593              s31 = buff3[i + 1];
594              s0 = vis_faligndata(s00, s01);
595              s1 = vis_faligndata(s10, s11);
596              s2 = vis_faligndata(s20, s21);
597              s3 = vis_faligndata(s30, s31);
598
599              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
600              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
601              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
602              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
603              d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
604              d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
605              d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
606              d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
607
608              d00 = vis_fpadd16(d00, d10);
609              d20 = vis_fpadd16(d20, d30);
610              d0 = vis_fpadd16(d0, d00);
611              d0 = vis_fpadd16(d0, d20);
612              d01 = vis_fpadd16(d01, d11);
613              d21 = vis_fpadd16(d21, d31);
614              d1 = vis_fpadd16(d1, d01);
615              d1 = vis_fpadd16(d1, d21);
616              buffd[2 * i] = d0;
617              buffd[2 * i + 1] = d1;
618            }
619          }
620        }
621
622        pk += 4 * m;
623      }
624    }
625
626    /*****************************************
627     *****************************************
628     **          Final iteration            **
629     *****************************************
630     *****************************************/
631
632    jk_size = n;
633
634    if (jk_size >= 6)
635      jk_size = 4;
636    if (jk_size == 5)
637      jk_size = 3;
638
639    k0 = karr[ik_last];
640    k1 = karr[ik_last + m];
641    k2 = karr[ik_last + 2 * m];
642    k3 = karr[ik_last + 3 * m];
643
644    off = ik_last * nchan;
645    doff = off / 8;
646    off &= 7;
647    buff0 = buff[0] + doff;
648    buff1 = buff[1] + doff;
649    buff2 = buff[2] + doff;
650    buff3 = buff[3] + doff;
651    vis_write_gsr(gsr_scale + off);
652
653    if (jk_size == 1) {
654      dp = buffe;
655
656      s01 = buff0[0];
657#pragma pipeloop(0)
658      for (i = 0; i < (xsize + 7) / 8; i++) {
659        s00 = s01;
660        s01 = buff0[i + 1];
661        s0 = vis_faligndata(s00, s01);
662
663        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
664        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
665
666        d0 = buffd[2 * i];
667        d1 = buffd[2 * i + 1];
668        d0 = vis_fpadd16(d0, d00);
669        d1 = vis_fpadd16(d1, d01);
670
671        dd = vis_fpack16_pair(d0, d1);
672        dp[i] = dd;
673
674        buffd[2 * i] = drnd;
675        buffd[2 * i + 1] = drnd;
676      }
677
678    }
679    else if (jk_size == 2) {
680      dp = buffe;
681
682      s01 = buff0[0];
683      s11 = buff1[0];
684#pragma pipeloop(0)
685      for (i = 0; i < (xsize + 7) / 8; i++) {
686        s00 = s01;
687        s10 = s11;
688        s01 = buff0[i + 1];
689        s11 = buff1[i + 1];
690        s0 = vis_faligndata(s00, s01);
691        s1 = vis_faligndata(s10, s11);
692
693        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
694        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
695        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
696        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
697
698        d0 = buffd[2 * i];
699        d1 = buffd[2 * i + 1];
700        d0 = vis_fpadd16(d0, d00);
701        d0 = vis_fpadd16(d0, d10);
702        d1 = vis_fpadd16(d1, d01);
703        d1 = vis_fpadd16(d1, d11);
704
705        dd = vis_fpack16_pair(d0, d1);
706        dp[i] = dd;
707
708        buffd[2 * i] = drnd;
709        buffd[2 * i + 1] = drnd;
710      }
711
712    }
713    else if (jk_size == 3) {
714
715      dp = buffe;
716
717      s01 = buff0[0];
718      s11 = buff1[0];
719      s21 = buff2[0];
720#pragma pipeloop(0)
721      for (i = 0; i < (xsize + 7) / 8; i++) {
722        s00 = s01;
723        s10 = s11;
724        s20 = s21;
725        s01 = buff0[i + 1];
726        s11 = buff1[i + 1];
727        s21 = buff2[i + 1];
728        s0 = vis_faligndata(s00, s01);
729        s1 = vis_faligndata(s10, s11);
730        s2 = vis_faligndata(s20, s21);
731
732        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
733        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
734        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
735        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
736        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
737        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
738
739        d0 = buffd[2 * i];
740        d1 = buffd[2 * i + 1];
741        d0 = vis_fpadd16(d0, d00);
742        d0 = vis_fpadd16(d0, d10);
743        d0 = vis_fpadd16(d0, d20);
744        d1 = vis_fpadd16(d1, d01);
745        d1 = vis_fpadd16(d1, d11);
746        d1 = vis_fpadd16(d1, d21);
747
748        dd = vis_fpack16_pair(d0, d1);
749        dp[i] = dd;
750
751        buffd[2 * i] = drnd;
752        buffd[2 * i + 1] = drnd;
753      }
754
755    }
756    else {                                  /* if (jk_size == 4) */
757
758      dp = buffe;
759
760      s01 = buff0[0];
761      s11 = buff1[0];
762      s21 = buff2[0];
763      s31 = buff3[0];
764#pragma pipeloop(0)
765      for (i = 0; i < (xsize + 7) / 8; i++) {
766        s00 = s01;
767        s10 = s11;
768        s20 = s21;
769        s30 = s31;
770        s01 = buff0[i + 1];
771        s11 = buff1[i + 1];
772        s21 = buff2[i + 1];
773        s31 = buff3[i + 1];
774        s0 = vis_faligndata(s00, s01);
775        s1 = vis_faligndata(s10, s11);
776        s2 = vis_faligndata(s20, s21);
777        s3 = vis_faligndata(s30, s31);
778
779        d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
780        d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
781        d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
782        d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
783        d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
784        d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
785        d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
786        d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
787
788        d0 = buffd[2 * i];
789        d1 = buffd[2 * i + 1];
790        d0 = vis_fpadd16(d0, d00);
791        d0 = vis_fpadd16(d0, d10);
792        d0 = vis_fpadd16(d0, d20);
793        d0 = vis_fpadd16(d0, d30);
794        d1 = vis_fpadd16(d1, d01);
795        d1 = vis_fpadd16(d1, d11);
796        d1 = vis_fpadd16(d1, d21);
797        d1 = vis_fpadd16(d1, d31);
798
799        dd = vis_fpack16_pair(d0, d1);
800        dp[i] = dd;
801
802        buffd[2 * i] = drnd;
803        buffd[2 * i + 1] = drnd;
804      }
805    }
806
807    mlib_ImageCopy_na((mlib_u8 *) buffe, dl, xsize);
808
809    if (j < hgt - dy_b - 2)
810      sl += sll;
811    dl += dll;
812
813    buff_ind++;
814
815    if (buff_ind >= (n + 1))
816      buff_ind = 0;
817  }
818
819  mlib_free(pbuff);
820
821  if (buffs != buffs_local)
822    mlib_free(buffs);
823
824  return MLIB_SUCCESS;
825}
826
827/***************************************************************/
828mlib_status mlib_convMxN_8ext_mask(mlib_image       *dst,
829                                   const mlib_image *src,
830                                   mlib_s32         m,
831                                   mlib_s32         n,
832                                   mlib_s32         dx_l,
833                                   mlib_s32         dx_r,
834                                   mlib_s32         dy_t,
835                                   mlib_s32         dy_b,
836                                   const mlib_s32   *kern,
837                                   mlib_s32         scale,
838                                   mlib_s32         cmask)
839{
840  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
841  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
842  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
843  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
844  mlib_d64 dd, d0, d1;
845  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
846  mlib_u8 *sl, *dl;
847  mlib_s32 hgt = mlib_ImageGetHeight(src);
848  mlib_s32 wid = mlib_ImageGetWidth(src);
849  mlib_s32 sll = mlib_ImageGetStride(src);
850  mlib_s32 dll = mlib_ImageGetStride(dst);
851  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
852  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
853  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
854  mlib_d64 *pbuff, *dp;
855  mlib_f32 *karr = (mlib_f32 *) kern;
856  mlib_s32 gsr_scale = (31 - scale) << 3;
857  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
858  mlib_s32 i, j, l, chan, testchan;
859  mlib_s32 nchan = mlib_ImageGetChannels(dst);
860  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
861  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
862
863  if (n > MAX_N) {
864    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));
865
866    if (buffs == NULL)
867      return MLIB_FAILURE;
868  }
869
870  buff = buffs + 2 * (n + 1);
871
872  ssize = (wid + (m - 1));
873  dsize = (ssize + 7) / 8;
874  esize = dsize + 4;
875  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));
876
877  if (pbuff == NULL) {
878    if (buffs != buffs_local)
879      mlib_free(buffs);
880    return MLIB_FAILURE;
881  }
882
883  for (i = 0; i < (n + 1); i++)
884    buffs[i] = pbuff + i * esize;
885  for (i = 0; i < (n + 1); i++)
886    buffs[(n + 1) + i] = buffs[i];
887  buffd = buffs[n] + esize;
888  buffe = buffd + 2 * esize;
889
890  xsize = wid;
891  ssize -= (dx_l + dx_r);
892
893  vis_write_gsr(gsr_scale + 7);
894
895  if (nchan == 2) {
896    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
897    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
898  }
899  else if (nchan == 3) {
900    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
901    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
902  }
903  else {
904    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
905    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
906  }
907
908  testchan = 1;
909  for (chan = 0; chan < nchan; chan++) {
910    buff_ind = 0;
911    sl = adr_src;
912    dl = adr_dst;
913
914    if ((cmask & testchan) == 0) {
915      testchan <<= 1;
916      continue;
917    }
918
919    for (l = 0; l < n; l++) {
920      mlib_d64 *buffn = buffs[l];
921
922      (*p_proc_load) ((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l, ssize, testchan);
923
924      for (i = 0; i < dx_l; i++) {
925        *((mlib_u8 *) buffn + i) = *((mlib_u8 *) buffn + dx_l);
926      }
927
928      for (i = 0; i < dx_r; i++) {
929        *((mlib_u8 *) buffn + i + ssize + dx_l) =
930          *((mlib_u8 *) buffn + (dx_l - 1) + ssize);
931      }
932
933      if ((l >= dy_t) && (l < hgt + n - dy_b - 2))
934        sl += sll;
935    }
936
937    /* init buffer */
938#pragma pipeloop(0)
939    for (i = 0; i < (xsize + 7) / 8; i++) {
940      buffd[2 * i] = drnd;
941      buffd[2 * i + 1] = drnd;
942    }
943
944    for (j = 0; j < hgt; j++) {
945      mlib_d64 **buffc = buffs + buff_ind;
946      mlib_f32 *pk = karr, k0, k1, k2, k3;
947
948      for (l = 0; l < n; l++) {
949        buff[l] = buffc[l];
950      }
951
952      buffn = buffc[n];
953
954      (*p_proc_load) ((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l, ssize, testchan);
955
956      for (i = 0; i < dx_l; i++) {
957        *((mlib_u8 *) buffn + i) = *((mlib_u8 *) buffn + dx_l);
958      }
959
960      for (i = 0; i < dx_r; i++) {
961        *((mlib_u8 *) buffn + i + ssize + dx_l) =
962          *((mlib_u8 *) buffn + (dx_l - 1) + ssize);
963      }
964
965      ik_last = (m - 1);
966
967      for (jk = 0; jk < n; jk += jk_size) {
968        jk_size = n - jk;
969
970        if (jk_size >= 6)
971          jk_size = 4;
972        if (jk_size == 5)
973          jk_size = 3;
974
975        coff = 0;
976
977        if (jk_size == 1) {
978
979          for (ik = 0; ik < m; ik++, coff++) {
980            if (!jk && ik == ik_last)
981              continue;
982
983            k0 = pk[ik];
984
985            doff = coff / 8;
986            buff0 = buff[jk] + doff;
987
988            off = coff & 7;
989            vis_write_gsr(gsr_scale + off);
990
991            s01 = buff0[0];
992#pragma pipeloop(0)
993            for (i = 0; i < (xsize + 7) / 8; i++) {
994              s00 = s01;
995              s01 = buff0[i + 1];
996              s0 = vis_faligndata(s00, s01);
997
998              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
999              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1000
1001              d0 = buffd[2 * i];
1002              d1 = buffd[2 * i + 1];
1003              d0 = vis_fpadd16(d00, d0);
1004              d1 = vis_fpadd16(d01, d1);
1005              buffd[2 * i] = d0;
1006              buffd[2 * i + 1] = d1;
1007            }
1008          }
1009
1010          pk += m;
1011
1012        }
1013        else if (jk_size == 2) {
1014
1015          for (ik = 0; ik < m; ik++, coff++) {
1016            if (!jk && ik == ik_last)
1017              continue;
1018
1019            k0 = pk[ik];
1020            k1 = pk[ik + m];
1021
1022            doff = coff / 8;
1023            buff0 = buff[jk] + doff;
1024            buff1 = buff[jk + 1] + doff;
1025
1026            off = coff & 7;
1027            vis_write_gsr(gsr_scale + off);
1028
1029            s01 = buff0[0];
1030            s11 = buff1[0];
1031#pragma pipeloop(0)
1032            for (i = 0; i < (xsize + 7) / 8; i++) {
1033              s00 = s01;
1034              s10 = s11;
1035              s01 = buff0[i + 1];
1036              s11 = buff1[i + 1];
1037              s0 = vis_faligndata(s00, s01);
1038              s1 = vis_faligndata(s10, s11);
1039
1040              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1041              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1042              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1043              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1044
1045              d0 = buffd[2 * i];
1046              d1 = buffd[2 * i + 1];
1047              d0 = vis_fpadd16(d00, d0);
1048              d0 = vis_fpadd16(d10, d0);
1049              d1 = vis_fpadd16(d01, d1);
1050              d1 = vis_fpadd16(d11, d1);
1051              buffd[2 * i] = d0;
1052              buffd[2 * i + 1] = d1;
1053            }
1054          }
1055
1056          pk += 2 * m;
1057
1058        }
1059        else if (jk_size == 3) {
1060
1061          for (ik = 0; ik < m; ik++, coff++) {
1062            if (!jk && ik == ik_last)
1063              continue;
1064
1065            k0 = pk[ik];
1066            k1 = pk[ik + m];
1067            k2 = pk[ik + 2 * m];
1068
1069            doff = coff / 8;
1070            buff0 = buff[jk] + doff;
1071            buff1 = buff[jk + 1] + doff;
1072            buff2 = buff[jk + 2] + doff;
1073
1074            off = coff & 7;
1075            vis_write_gsr(gsr_scale + off);
1076
1077            if (off == 0) {
1078#pragma pipeloop(0)
1079              for (i = 0; i < (xsize + 7) / 8; i++) {
1080                d0 = buffd[2 * i];
1081                d1 = buffd[2 * i + 1];
1082
1083                s0 = buff0[i];
1084                s1 = buff1[i];
1085                s2 = buff2[i];
1086
1087                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1088                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1089                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1090                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1091                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1092                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1093
1094                d00 = vis_fpadd16(d00, d10);
1095                d0 = vis_fpadd16(d20, d0);
1096                d0 = vis_fpadd16(d00, d0);
1097                d01 = vis_fpadd16(d01, d11);
1098                d1 = vis_fpadd16(d21, d1);
1099                d1 = vis_fpadd16(d01, d1);
1100                buffd[2 * i] = d0;
1101                buffd[2 * i + 1] = d1;
1102              }
1103
1104            }
1105            else if (off == 4) {
1106              s01 = buff0[0];
1107              s11 = buff1[0];
1108              s21 = buff2[0];
1109#pragma pipeloop(0)
1110              for (i = 0; i < (xsize + 7) / 8; i++) {
1111                d0 = buffd[2 * i];
1112                d1 = buffd[2 * i + 1];
1113
1114                s00 = s01;
1115                s10 = s11;
1116                s20 = s21;
1117                s01 = buff0[i + 1];
1118                s11 = buff1[i + 1];
1119                s21 = buff2[i + 1];
1120
1121                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
1122                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
1123                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
1124                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
1125                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
1126                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
1127
1128                d00 = vis_fpadd16(d00, d10);
1129                d0 = vis_fpadd16(d20, d0);
1130                d0 = vis_fpadd16(d00, d0);
1131                d01 = vis_fpadd16(d01, d11);
1132                d1 = vis_fpadd16(d21, d1);
1133                d1 = vis_fpadd16(d01, d1);
1134                buffd[2 * i] = d0;
1135                buffd[2 * i + 1] = d1;
1136              }
1137
1138            }
1139            else {
1140              s01 = buff0[0];
1141              s11 = buff1[0];
1142              s21 = buff2[0];
1143#pragma pipeloop(0)
1144              for (i = 0; i < (xsize + 7) / 8; i++) {
1145                d0 = buffd[2 * i];
1146                d1 = buffd[2 * i + 1];
1147
1148                s00 = s01;
1149                s10 = s11;
1150                s20 = s21;
1151                s01 = buff0[i + 1];
1152                s11 = buff1[i + 1];
1153                s21 = buff2[i + 1];
1154                s0 = vis_faligndata(s00, s01);
1155                s1 = vis_faligndata(s10, s11);
1156                s2 = vis_faligndata(s20, s21);
1157
1158                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1159                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1160                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1161                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1162                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1163                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1164
1165                d00 = vis_fpadd16(d00, d10);
1166                d0 = vis_fpadd16(d20, d0);
1167                d0 = vis_fpadd16(d00, d0);
1168                d01 = vis_fpadd16(d01, d11);
1169                d1 = vis_fpadd16(d21, d1);
1170                d1 = vis_fpadd16(d01, d1);
1171                buffd[2 * i] = d0;
1172                buffd[2 * i + 1] = d1;
1173              }
1174            }
1175          }
1176
1177          pk += 3 * m;
1178
1179        }
1180        else {                              /* jk_size == 4 */
1181
1182          for (ik = 0; ik < m; ik++, coff++) {
1183            if (!jk && ik == ik_last)
1184              continue;
1185
1186            k0 = pk[ik];
1187            k1 = pk[ik + m];
1188            k2 = pk[ik + 2 * m];
1189            k3 = pk[ik + 3 * m];
1190
1191            doff = coff / 8;
1192            buff0 = buff[jk] + doff;
1193            buff1 = buff[jk + 1] + doff;
1194            buff2 = buff[jk + 2] + doff;
1195            buff3 = buff[jk + 3] + doff;
1196
1197            off = coff & 7;
1198            vis_write_gsr(gsr_scale + off);
1199
1200            if (off == 0) {
1201
1202#pragma pipeloop(0)
1203              for (i = 0; i < (xsize + 7) / 8; i++) {
1204                d0 = buffd[2 * i];
1205                d1 = buffd[2 * i + 1];
1206
1207                s0 = buff0[i];
1208                s1 = buff1[i];
1209                s2 = buff2[i];
1210                s3 = buff3[i];
1211
1212                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1213                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1214                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1215                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1216                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1217                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1218                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1219                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1220
1221                d00 = vis_fpadd16(d00, d10);
1222                d20 = vis_fpadd16(d20, d30);
1223                d0 = vis_fpadd16(d0, d00);
1224                d0 = vis_fpadd16(d0, d20);
1225                d01 = vis_fpadd16(d01, d11);
1226                d21 = vis_fpadd16(d21, d31);
1227                d1 = vis_fpadd16(d1, d01);
1228                d1 = vis_fpadd16(d1, d21);
1229                buffd[2 * i] = d0;
1230                buffd[2 * i + 1] = d1;
1231              }
1232
1233            }
1234            else if (off == 4) {
1235
1236              s01 = buff0[0];
1237              s11 = buff1[0];
1238              s21 = buff2[0];
1239              s31 = buff3[0];
1240#pragma pipeloop(0)
1241              for (i = 0; i < (xsize + 7) / 8; i++) {
1242                d0 = buffd[2 * i];
1243                d1 = buffd[2 * i + 1];
1244
1245                s00 = s01;
1246                s10 = s11;
1247                s20 = s21;
1248                s30 = s31;
1249                s01 = buff0[i + 1];
1250                s11 = buff1[i + 1];
1251                s21 = buff2[i + 1];
1252                s31 = buff3[i + 1];
1253
1254                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
1255                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
1256                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
1257                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
1258                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
1259                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
1260                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
1261                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
1262
1263                d00 = vis_fpadd16(d00, d10);
1264                d20 = vis_fpadd16(d20, d30);
1265                d0 = vis_fpadd16(d0, d00);
1266                d0 = vis_fpadd16(d0, d20);
1267                d01 = vis_fpadd16(d01, d11);
1268                d21 = vis_fpadd16(d21, d31);
1269                d1 = vis_fpadd16(d1, d01);
1270                d1 = vis_fpadd16(d1, d21);
1271                buffd[2 * i] = d0;
1272                buffd[2 * i + 1] = d1;
1273              }
1274
1275            }
1276            else {
1277
1278              s01 = buff0[0];
1279              s11 = buff1[0];
1280              s21 = buff2[0];
1281              s31 = buff3[0];
1282#pragma pipeloop(0)
1283              for (i = 0; i < (xsize + 7) / 8; i++) {
1284                d0 = buffd[2 * i];
1285                d1 = buffd[2 * i + 1];
1286
1287                s00 = s01;
1288                s10 = s11;
1289                s20 = s21;
1290                s30 = s31;
1291                s01 = buff0[i + 1];
1292                s11 = buff1[i + 1];
1293                s21 = buff2[i + 1];
1294                s31 = buff3[i + 1];
1295                s0 = vis_faligndata(s00, s01);
1296                s1 = vis_faligndata(s10, s11);
1297                s2 = vis_faligndata(s20, s21);
1298                s3 = vis_faligndata(s30, s31);
1299
1300                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1301                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1302                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1303                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1304                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1305                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1306                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1307                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1308
1309                d00 = vis_fpadd16(d00, d10);
1310                d20 = vis_fpadd16(d20, d30);
1311                d0 = vis_fpadd16(d0, d00);
1312                d0 = vis_fpadd16(d0, d20);
1313                d01 = vis_fpadd16(d01, d11);
1314                d21 = vis_fpadd16(d21, d31);
1315                d1 = vis_fpadd16(d1, d01);
1316                d1 = vis_fpadd16(d1, d21);
1317                buffd[2 * i] = d0;
1318                buffd[2 * i + 1] = d1;
1319              }
1320            }
1321          }
1322
1323          pk += 4 * m;
1324        }
1325      }
1326
1327      /*****************************************
1328       *****************************************
1329       **          Final iteration            **
1330       *****************************************
1331       *****************************************/
1332
1333      jk_size = n;
1334
1335      if (jk_size >= 6)
1336        jk_size = 4;
1337      if (jk_size == 5)
1338        jk_size = 3;
1339
1340      k0 = karr[ik_last];
1341      k1 = karr[ik_last + m];
1342      k2 = karr[ik_last + 2 * m];
1343      k3 = karr[ik_last + 3 * m];
1344
1345      off = ik_last;
1346      doff = off / 8;
1347      off &= 7;
1348      buff0 = buff[0] + doff;
1349      buff1 = buff[1] + doff;
1350      buff2 = buff[2] + doff;
1351      buff3 = buff[3] + doff;
1352      vis_write_gsr(gsr_scale + off);
1353
1354      if (jk_size == 1) {
1355        dp = buffe;
1356
1357        s01 = buff0[0];
1358#pragma pipeloop(0)
1359        for (i = 0; i < (xsize + 7) / 8; i++) {
1360          s00 = s01;
1361          s01 = buff0[i + 1];
1362          s0 = vis_faligndata(s00, s01);
1363
1364          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1365          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1366
1367          d0 = buffd[2 * i];
1368          d1 = buffd[2 * i + 1];
1369          d0 = vis_fpadd16(d0, d00);
1370          d1 = vis_fpadd16(d1, d01);
1371
1372          dd = vis_fpack16_pair(d0, d1);
1373          dp[i] = dd;
1374
1375          buffd[2 * i] = drnd;
1376          buffd[2 * i + 1] = drnd;
1377        }
1378
1379      }
1380      else if (jk_size == 2) {
1381        dp = buffe;
1382
1383        s01 = buff0[0];
1384        s11 = buff1[0];
1385#pragma pipeloop(0)
1386        for (i = 0; i < (xsize + 7) / 8; i++) {
1387          s00 = s01;
1388          s10 = s11;
1389          s01 = buff0[i + 1];
1390          s11 = buff1[i + 1];
1391          s0 = vis_faligndata(s00, s01);
1392          s1 = vis_faligndata(s10, s11);
1393
1394          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1395          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1396          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1397          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1398
1399          d0 = buffd[2 * i];
1400          d1 = buffd[2 * i + 1];
1401          d0 = vis_fpadd16(d0, d00);
1402          d0 = vis_fpadd16(d0, d10);
1403          d1 = vis_fpadd16(d1, d01);
1404          d1 = vis_fpadd16(d1, d11);
1405
1406          dd = vis_fpack16_pair(d0, d1);
1407          dp[i] = dd;
1408
1409          buffd[2 * i] = drnd;
1410          buffd[2 * i + 1] = drnd;
1411        }
1412
1413      }
1414      else if (jk_size == 3) {
1415
1416        dp = buffe;
1417
1418        s01 = buff0[0];
1419        s11 = buff1[0];
1420        s21 = buff2[0];
1421#pragma pipeloop(0)
1422        for (i = 0; i < (xsize + 7) / 8; i++) {
1423          s00 = s01;
1424          s10 = s11;
1425          s20 = s21;
1426          s01 = buff0[i + 1];
1427          s11 = buff1[i + 1];
1428          s21 = buff2[i + 1];
1429          s0 = vis_faligndata(s00, s01);
1430          s1 = vis_faligndata(s10, s11);
1431          s2 = vis_faligndata(s20, s21);
1432
1433          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1434          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1435          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1436          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1437          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1438          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1439
1440          d0 = buffd[2 * i];
1441          d1 = buffd[2 * i + 1];
1442          d0 = vis_fpadd16(d0, d00);
1443          d0 = vis_fpadd16(d0, d10);
1444          d0 = vis_fpadd16(d0, d20);
1445          d1 = vis_fpadd16(d1, d01);
1446          d1 = vis_fpadd16(d1, d11);
1447          d1 = vis_fpadd16(d1, d21);
1448
1449          dd = vis_fpack16_pair(d0, d1);
1450          dp[i] = dd;
1451
1452          buffd[2 * i] = drnd;
1453          buffd[2 * i + 1] = drnd;
1454        }
1455
1456      }
1457      else {                                /* if (jk_size == 4) */
1458
1459        dp = buffe;
1460
1461        s01 = buff0[0];
1462        s11 = buff1[0];
1463        s21 = buff2[0];
1464        s31 = buff3[0];
1465#pragma pipeloop(0)
1466        for (i = 0; i < (xsize + 7) / 8; i++) {
1467          s00 = s01;
1468          s10 = s11;
1469          s20 = s21;
1470          s30 = s31;
1471          s01 = buff0[i + 1];
1472          s11 = buff1[i + 1];
1473          s21 = buff2[i + 1];
1474          s31 = buff3[i + 1];
1475          s0 = vis_faligndata(s00, s01);
1476          s1 = vis_faligndata(s10, s11);
1477          s2 = vis_faligndata(s20, s21);
1478          s3 = vis_faligndata(s30, s31);
1479
1480          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
1481          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
1482          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
1483          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
1484          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
1485          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
1486          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
1487          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
1488
1489          d0 = buffd[2 * i];
1490          d1 = buffd[2 * i + 1];
1491          d0 = vis_fpadd16(d0, d00);
1492          d0 = vis_fpadd16(d0, d10);
1493          d0 = vis_fpadd16(d0, d20);
1494          d0 = vis_fpadd16(d0, d30);
1495          d1 = vis_fpadd16(d1, d01);
1496          d1 = vis_fpadd16(d1, d11);
1497          d1 = vis_fpadd16(d1, d21);
1498          d1 = vis_fpadd16(d1, d31);
1499
1500          dd = vis_fpack16_pair(d0, d1);
1501          dp[i] = dd;
1502
1503          buffd[2 * i] = drnd;
1504          buffd[2 * i + 1] = drnd;
1505        }
1506      }
1507
1508      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);
1509
1510      if (j < hgt - dy_b - 2)
1511        sl += sll;
1512      dl += dll;
1513
1514      buff_ind++;
1515
1516      if (buff_ind >= (n + 1))
1517        buff_ind = 0;
1518    }
1519
1520    testchan <<= 1;
1521  }
1522
1523  mlib_free(pbuff);
1524
1525  if (buffs != buffs_local)
1526    mlib_free(buffs);
1527
1528  return MLIB_SUCCESS;
1529}
1530
1531/***************************************************************/
1532