1/*
2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28/*
29 * FUNCTION
 *      mlib_convMxNnw_u8 - convolve an 8-bit image, MxN kernel,
 *                          edge = no write
32 *
33 * SYNOPSIS
 *      mlib_status mlib_convMxNnw_u8(mlib_image       *dst,
 *                                    const mlib_image *src,
 *                                    const mlib_s32   *kernel,
 *                                    mlib_s32         kwid,
 *                                    mlib_s32         khgt,
 *                                    mlib_s32         khw,
 *                                    mlib_s32         khh,
 *                                    mlib_s32         discardbits,
 *                                    mlib_s32         cmask)
43 *
44 * ARGUMENT
45 *      src       Ptr to source image structure
46 *      dst       Ptr to destination image structure
47 *      khgt         Kernel height (# of rows)
48 *      kwid         Kernel width (# of cols)
49 *      skernel      Ptr to convolution kernel
50 *      discardbits  The number of LSBits of the 32-bit accumulator that
51 *                   are discarded when the 32-bit accumulator is converted
52 *                   to 16-bit output data; discardbits must be 1-15 (it
53 *                   cannot be zero). Same as exponent N for scalefac=2**N.
54 *      cmask        Channel mask to indicate the channels to be convolved.
55 *                   Each bit of which represents a channel in the image. The
56 *                   channels corresponded to 1 bits are those to be processed.
57 *
58 * DESCRIPTION
59 *      A 2-D convolution (MxN kernel) for 8-bit images.
60 *
61 */
62
63#include "vis_proto.h"
64#include "mlib_image.h"
65#include "mlib_ImageConv.h"
66#include "mlib_c_ImageConv.h"
67#include "mlib_v_ImageConv.h"
68#include "mlib_v_ImageChannelExtract.h"
69#include "mlib_v_ImageChannelInsert.h"
70
71/***************************************************************/
/*
 * Forward declaration of the file-local worker that convolves only the
 * channels selected by cmask. It is called by mlib_convMxNnw_u8() when the
 * mask does not cover every channel of the image.
 *   m, n    kernel width and height
 *   dm, dn  column and row offset applied to the destination start address
 *   kern    kernel coefficients
 *   scale   scale factor (passed through from discardbits)
 *   cmask   channel mask
 */
static mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
                                         const mlib_image *src,
                                         mlib_s32         m,
                                         mlib_s32         n,
                                         mlib_s32         dm,
                                         mlib_s32         dn,
                                         const mlib_s32   *kern,
                                         mlib_s32         scale,
                                         mlib_s32         cmask);
81
82/***************************************************************/
/*
 * Initial accumulator values, indexed by (31 - scale).
 * Each 32-bit entry holds the same 16-bit value in both halves: entry i is
 * 0x0040 >> i for i = 0..6 and zero for i = 7..15. The selected entry is
 * duplicated into a 64-bit value (vis_to_double_dup) and used to seed the
 * 16-bit partial sums before the kernel products are added.
 * NOTE(review): presumably this is the half-LSB rounding term for the final
 * fpack16 - confirm against the VIS pack semantics.
 */
static const mlib_s32 mlib_round_8[16] = {
  0x00400040, 0x00200020, 0x00100010, 0x00080008,
  0x00040004, 0x00020002, 0x00010001, 0x00000000,
  0x00000000, 0x00000000, 0x00000000, 0x00000000,
  0x00000000, 0x00000000, 0x00000000, 0x00000000
};
89
90/***************************************************************/
91mlib_status mlib_convMxNnw_u8(mlib_image       *dst,
92                              const mlib_image *src,
93                              const mlib_s32   *kernel,
94                              mlib_s32         kwid,
95                              mlib_s32         khgt,
96                              mlib_s32         khw,
97                              mlib_s32         khh,
98                              mlib_s32         discardbits,
99                              mlib_s32         cmask)
100{
101  mlib_s32 nchannel, amask;
102
103  if (mlib_ImageConvVersion(kwid, khgt, discardbits, MLIB_BYTE) == 0)
104    return mlib_c_convMxNnw_u8(dst, src, kernel, kwid, khgt, khw, khh,
105                               discardbits, cmask);
106
107  nchannel = mlib_ImageGetChannels(src);
108
109  if (nchannel == 1)
110    cmask = 1;
111  amask = (1 << nchannel) - 1;
112
113  if ((cmask & amask) == amask) {
114    return mlib_convMxN_8nw_f(dst, src, kwid, khgt, khw, khh, kernel, discardbits);
115  }
116  else {
117    return mlib_convMxN_8nw_mask(dst, src, kwid, khgt, khw, khh, kernel,
118                                 discardbits, cmask);
119  }
120}
121
/*
 * Largest kernel height for which mlib_convMxN_8nw_mask() keeps its table of
 * row-buffer pointers in a local array; taller kernels allocate the table
 * with mlib_malloc().
 */
#define MAX_N   11
123
124/***************************************************************/
125mlib_status mlib_convMxN_8nw_mask(mlib_image       *dst,
126                                  const mlib_image *src,
127                                  mlib_s32         m,
128                                  mlib_s32         n,
129                                  mlib_s32         dm,
130                                  mlib_s32         dn,
131                                  const mlib_s32   *kern,
132                                  mlib_s32         scale,
133                                  mlib_s32         cmask)
134{
135  mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff;
136  mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe;
137  mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3;
138  mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31;
139  mlib_d64 dd, d0, d1;
140  mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff;
141  mlib_u8 *sl, *sp, *dl;
142  mlib_s32 hgt = mlib_ImageGetHeight(src);
143  mlib_s32 wid = mlib_ImageGetWidth(src);
144  mlib_s32 sll = mlib_ImageGetStride(src);
145  mlib_s32 dll = mlib_ImageGetStride(dst);
146  mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src);
147  mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst);
148  mlib_s32 ssize, xsize, dsize, esize, buff_ind;
149  mlib_d64 *pbuff, *dp;
150  mlib_f32 *karr = (mlib_f32 *) kern;
151  mlib_s32 gsr_scale = (31 - scale) << 3;
152  mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]);
153  mlib_s32 i, j, l, chan, testchan;
154  mlib_s32 nchan = mlib_ImageGetChannels(dst);
155  void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
156  void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32);
157
158  if (n > MAX_N) {
159    buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *));
160
161    if (buffs == NULL)
162      return MLIB_FAILURE;
163  }
164
165  buff = buffs + 2 * (n + 1);
166
167  adr_dst += dn * dll + dm * nchan;
168
169  ssize = wid;
170  dsize = (ssize + 7) / 8;
171  esize = dsize + 4;
172  pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64));
173
174  if (pbuff == NULL) {
175    if (buffs != buffs_local)
176      mlib_free(buffs);
177    return MLIB_FAILURE;
178  }
179
180  for (i = 0; i < (n + 1); i++)
181    buffs[i] = pbuff + i * esize;
182  for (i = 0; i < (n + 1); i++)
183    buffs[(n + 1) + i] = buffs[i];
184  buffd = buffs[n] + esize;
185  buffe = buffd + 2 * esize;
186
187  hgt -= (n - 1);
188  xsize = ssize - (m - 1);
189
190  vis_write_gsr(gsr_scale + 7);
191
192  if (nchan == 2) {
193    p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1;
194    p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1;
195  }
196  else if (nchan == 3) {
197    p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1;
198    p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1;
199  }
200  else {
201    p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1;
202    p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1;
203  }
204
205  testchan = 1;
206  for (chan = 0; chan < nchan; chan++) {
207    buff_ind = 0;
208    sl = adr_src;
209    dl = adr_dst;
210
211    if ((cmask & testchan) == 0) {
212      testchan <<= 1;
213      continue;
214    }
215
216    for (l = 0; l < n; l++) {
217      mlib_d64 *buffn = buffs[l];
218      sp = sl + l * sll;
219
220      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
221    }
222
223    /* init buffer */
224#pragma pipeloop(0)
225    for (i = 0; i < (xsize + 7) / 8; i++) {
226      buffd[2 * i] = drnd;
227      buffd[2 * i + 1] = drnd;
228    }
229
230    for (j = 0; j < hgt; j++) {
231      mlib_d64 **buffc = buffs + buff_ind;
232      mlib_f32 *pk = karr, k0, k1, k2, k3;
233      sp = sl + n * sll;
234
235      for (l = 0; l < n; l++) {
236        buff[l] = buffc[l];
237      }
238
239      buffn = buffc[n];
240
241      (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan);
242
243      ik_last = (m - 1);
244
245      for (jk = 0; jk < n; jk += jk_size) {
246        jk_size = n - jk;
247
248        if (jk_size >= 6)
249          jk_size = 4;
250
251        if (jk_size == 5)
252          jk_size = 3;
253
254        coff = 0;
255
256        if (jk_size == 1) {
257
258          for (ik = 0; ik < m; ik++, coff++) {
259            if (!jk && ik == ik_last)
260              continue;
261
262            k0 = pk[ik];
263
264            doff = coff / 8;
265            buff0 = buff[jk] + doff;
266
267            off = coff & 7;
268            vis_write_gsr(gsr_scale + off);
269
270            s01 = buff0[0];
271#pragma pipeloop(0)
272            for (i = 0; i < (xsize + 7) / 8; i++) {
273              s00 = s01;
274              s01 = buff0[i + 1];
275              s0 = vis_faligndata(s00, s01);
276
277              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
278              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
279
280              d0 = buffd[2 * i];
281              d1 = buffd[2 * i + 1];
282              d0 = vis_fpadd16(d00, d0);
283              d1 = vis_fpadd16(d01, d1);
284              buffd[2 * i] = d0;
285              buffd[2 * i + 1] = d1;
286            }
287          }
288
289          pk += m;
290        }
291        else if (jk_size == 2) {
292
293          for (ik = 0; ik < m; ik++, coff++) {
294            if (!jk && ik == ik_last)
295              continue;
296
297            k0 = pk[ik];
298            k1 = pk[ik + m];
299
300            doff = coff / 8;
301            buff0 = buff[jk] + doff;
302            buff1 = buff[jk + 1] + doff;
303
304            off = coff & 7;
305            vis_write_gsr(gsr_scale + off);
306
307            s01 = buff0[0];
308            s11 = buff1[0];
309#pragma pipeloop(0)
310            for (i = 0; i < (xsize + 7) / 8; i++) {
311              s00 = s01;
312              s10 = s11;
313              s01 = buff0[i + 1];
314              s11 = buff1[i + 1];
315              s0 = vis_faligndata(s00, s01);
316              s1 = vis_faligndata(s10, s11);
317
318              d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
319              d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
320              d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
321              d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
322
323              d0 = buffd[2 * i];
324              d1 = buffd[2 * i + 1];
325              d0 = vis_fpadd16(d00, d0);
326              d0 = vis_fpadd16(d10, d0);
327              d1 = vis_fpadd16(d01, d1);
328              d1 = vis_fpadd16(d11, d1);
329              buffd[2 * i] = d0;
330              buffd[2 * i + 1] = d1;
331            }
332          }
333
334          pk += 2 * m;
335        }
336        else if (jk_size == 3) {
337
338          for (ik = 0; ik < m; ik++, coff++) {
339            if (!jk && ik == ik_last)
340              continue;
341
342            k0 = pk[ik];
343            k1 = pk[ik + m];
344            k2 = pk[ik + 2 * m];
345
346            doff = coff / 8;
347            buff0 = buff[jk] + doff;
348            buff1 = buff[jk + 1] + doff;
349            buff2 = buff[jk + 2] + doff;
350
351            off = coff & 7;
352            vis_write_gsr(gsr_scale + off);
353
354            if (off == 0) {
355#pragma pipeloop(0)
356              for (i = 0; i < (xsize + 7) / 8; i++) {
357                d0 = buffd[2 * i];
358                d1 = buffd[2 * i + 1];
359
360                s0 = buff0[i];
361                s1 = buff1[i];
362                s2 = buff2[i];
363
364                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
365                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
366                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
367                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
368                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
369                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
370
371                d00 = vis_fpadd16(d00, d10);
372                d0 = vis_fpadd16(d20, d0);
373                d0 = vis_fpadd16(d00, d0);
374                d01 = vis_fpadd16(d01, d11);
375                d1 = vis_fpadd16(d21, d1);
376                d1 = vis_fpadd16(d01, d1);
377                buffd[2 * i] = d0;
378                buffd[2 * i + 1] = d1;
379              }
380            }
381            else if (off == 4) {
382              s01 = buff0[0];
383              s11 = buff1[0];
384              s21 = buff2[0];
385#pragma pipeloop(0)
386              for (i = 0; i < (xsize + 7) / 8; i++) {
387                d0 = buffd[2 * i];
388                d1 = buffd[2 * i + 1];
389
390                s00 = s01;
391                s10 = s11;
392                s20 = s21;
393                s01 = buff0[i + 1];
394                s11 = buff1[i + 1];
395                s21 = buff2[i + 1];
396
397                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
398                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
399                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
400                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
401                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
402                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
403
404                d00 = vis_fpadd16(d00, d10);
405                d0 = vis_fpadd16(d20, d0);
406                d0 = vis_fpadd16(d00, d0);
407                d01 = vis_fpadd16(d01, d11);
408                d1 = vis_fpadd16(d21, d1);
409                d1 = vis_fpadd16(d01, d1);
410                buffd[2 * i] = d0;
411                buffd[2 * i + 1] = d1;
412              }
413            }
414            else {
415              s01 = buff0[0];
416              s11 = buff1[0];
417              s21 = buff2[0];
418#pragma pipeloop(0)
419              for (i = 0; i < (xsize + 7) / 8; i++) {
420                d0 = buffd[2 * i];
421                d1 = buffd[2 * i + 1];
422
423                s00 = s01;
424                s10 = s11;
425                s20 = s21;
426                s01 = buff0[i + 1];
427                s11 = buff1[i + 1];
428                s21 = buff2[i + 1];
429                s0 = vis_faligndata(s00, s01);
430                s1 = vis_faligndata(s10, s11);
431                s2 = vis_faligndata(s20, s21);
432
433                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
434                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
435                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
436                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
437                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
438                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
439
440                d00 = vis_fpadd16(d00, d10);
441                d0 = vis_fpadd16(d20, d0);
442                d0 = vis_fpadd16(d00, d0);
443                d01 = vis_fpadd16(d01, d11);
444                d1 = vis_fpadd16(d21, d1);
445                d1 = vis_fpadd16(d01, d1);
446                buffd[2 * i] = d0;
447                buffd[2 * i + 1] = d1;
448              }
449            }
450          }
451
452          pk += 3 * m;
453        }
454        else {                              /* jk_size == 4 */
455
456          for (ik = 0; ik < m; ik++, coff++) {
457            if (!jk && ik == ik_last)
458              continue;
459
460            k0 = pk[ik];
461            k1 = pk[ik + m];
462            k2 = pk[ik + 2 * m];
463            k3 = pk[ik + 3 * m];
464
465            doff = coff / 8;
466            buff0 = buff[jk] + doff;
467            buff1 = buff[jk + 1] + doff;
468            buff2 = buff[jk + 2] + doff;
469            buff3 = buff[jk + 3] + doff;
470
471            off = coff & 7;
472            vis_write_gsr(gsr_scale + off);
473
474            if (off == 0) {
475
476#pragma pipeloop(0)
477              for (i = 0; i < (xsize + 7) / 8; i++) {
478                d0 = buffd[2 * i];
479                d1 = buffd[2 * i + 1];
480
481                s0 = buff0[i];
482                s1 = buff1[i];
483                s2 = buff2[i];
484                s3 = buff3[i];
485
486                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
487                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
488                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
489                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
490                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
491                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
492                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
493                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
494
495                d00 = vis_fpadd16(d00, d10);
496                d20 = vis_fpadd16(d20, d30);
497                d0 = vis_fpadd16(d0, d00);
498                d0 = vis_fpadd16(d0, d20);
499                d01 = vis_fpadd16(d01, d11);
500                d21 = vis_fpadd16(d21, d31);
501                d1 = vis_fpadd16(d1, d01);
502                d1 = vis_fpadd16(d1, d21);
503                buffd[2 * i] = d0;
504                buffd[2 * i + 1] = d1;
505              }
506            }
507            else if (off == 4) {
508
509              s01 = buff0[0];
510              s11 = buff1[0];
511              s21 = buff2[0];
512              s31 = buff3[0];
513#pragma pipeloop(0)
514              for (i = 0; i < (xsize + 7) / 8; i++) {
515                d0 = buffd[2 * i];
516                d1 = buffd[2 * i + 1];
517
518                s00 = s01;
519                s10 = s11;
520                s20 = s21;
521                s30 = s31;
522                s01 = buff0[i + 1];
523                s11 = buff1[i + 1];
524                s21 = buff2[i + 1];
525                s31 = buff3[i + 1];
526
527                d00 = vis_fmul8x16au(vis_read_lo(s00), k0);
528                d01 = vis_fmul8x16au(vis_read_hi(s01), k0);
529                d10 = vis_fmul8x16au(vis_read_lo(s10), k1);
530                d11 = vis_fmul8x16au(vis_read_hi(s11), k1);
531                d20 = vis_fmul8x16au(vis_read_lo(s20), k2);
532                d21 = vis_fmul8x16au(vis_read_hi(s21), k2);
533                d30 = vis_fmul8x16au(vis_read_lo(s30), k3);
534                d31 = vis_fmul8x16au(vis_read_hi(s31), k3);
535
536                d00 = vis_fpadd16(d00, d10);
537                d20 = vis_fpadd16(d20, d30);
538                d0 = vis_fpadd16(d0, d00);
539                d0 = vis_fpadd16(d0, d20);
540                d01 = vis_fpadd16(d01, d11);
541                d21 = vis_fpadd16(d21, d31);
542                d1 = vis_fpadd16(d1, d01);
543                d1 = vis_fpadd16(d1, d21);
544                buffd[2 * i] = d0;
545                buffd[2 * i + 1] = d1;
546              }
547            }
548            else {
549
550              s01 = buff0[0];
551              s11 = buff1[0];
552              s21 = buff2[0];
553              s31 = buff3[0];
554#pragma pipeloop(0)
555              for (i = 0; i < (xsize + 7) / 8; i++) {
556                d0 = buffd[2 * i];
557                d1 = buffd[2 * i + 1];
558
559                s00 = s01;
560                s10 = s11;
561                s20 = s21;
562                s30 = s31;
563                s01 = buff0[i + 1];
564                s11 = buff1[i + 1];
565                s21 = buff2[i + 1];
566                s31 = buff3[i + 1];
567                s0 = vis_faligndata(s00, s01);
568                s1 = vis_faligndata(s10, s11);
569                s2 = vis_faligndata(s20, s21);
570                s3 = vis_faligndata(s30, s31);
571
572                d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
573                d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
574                d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
575                d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
576                d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
577                d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
578                d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
579                d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
580
581                d00 = vis_fpadd16(d00, d10);
582                d20 = vis_fpadd16(d20, d30);
583                d0 = vis_fpadd16(d0, d00);
584                d0 = vis_fpadd16(d0, d20);
585                d01 = vis_fpadd16(d01, d11);
586                d21 = vis_fpadd16(d21, d31);
587                d1 = vis_fpadd16(d1, d01);
588                d1 = vis_fpadd16(d1, d21);
589                buffd[2 * i] = d0;
590                buffd[2 * i + 1] = d1;
591              }
592            }
593          }
594
595          pk += 4 * m;
596        }
597      }
598
599      /*****************************************
600       *****************************************
601       **          Final iteration            **
602       *****************************************
603       *****************************************/
604
605      jk_size = n;
606
607      if (jk_size >= 6)
608        jk_size = 4;
609
610      if (jk_size == 5)
611        jk_size = 3;
612
613      k0 = karr[ik_last];
614      k1 = karr[ik_last + m];
615      k2 = karr[ik_last + 2 * m];
616      k3 = karr[ik_last + 3 * m];
617
618      off = ik_last;
619      doff = off / 8;
620      off &= 7;
621      buff0 = buff[0] + doff;
622      buff1 = buff[1] + doff;
623      buff2 = buff[2] + doff;
624      buff3 = buff[3] + doff;
625      vis_write_gsr(gsr_scale + off);
626
627      if (jk_size == 1) {
628        dp = buffe;
629
630        s01 = buff0[0];
631#pragma pipeloop(0)
632        for (i = 0; i < (xsize + 7) / 8; i++) {
633          s00 = s01;
634          s01 = buff0[i + 1];
635          s0 = vis_faligndata(s00, s01);
636
637          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
638          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
639
640          d0 = buffd[2 * i];
641          d1 = buffd[2 * i + 1];
642          d0 = vis_fpadd16(d0, d00);
643          d1 = vis_fpadd16(d1, d01);
644
645          dd = vis_fpack16_pair(d0, d1);
646          dp[i] = dd;
647
648          buffd[2 * i] = drnd;
649          buffd[2 * i + 1] = drnd;
650        }
651      }
652      else if (jk_size == 2) {
653        dp = buffe;
654
655        s01 = buff0[0];
656        s11 = buff1[0];
657#pragma pipeloop(0)
658        for (i = 0; i < (xsize + 7) / 8; i++) {
659          s00 = s01;
660          s10 = s11;
661          s01 = buff0[i + 1];
662          s11 = buff1[i + 1];
663          s0 = vis_faligndata(s00, s01);
664          s1 = vis_faligndata(s10, s11);
665
666          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
667          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
668          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
669          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
670
671          d0 = buffd[2 * i];
672          d1 = buffd[2 * i + 1];
673          d0 = vis_fpadd16(d0, d00);
674          d0 = vis_fpadd16(d0, d10);
675          d1 = vis_fpadd16(d1, d01);
676          d1 = vis_fpadd16(d1, d11);
677
678          dd = vis_fpack16_pair(d0, d1);
679          dp[i] = dd;
680
681          buffd[2 * i] = drnd;
682          buffd[2 * i + 1] = drnd;
683        }
684      }
685      else if (jk_size == 3) {
686
687        dp = buffe;
688
689        s01 = buff0[0];
690        s11 = buff1[0];
691        s21 = buff2[0];
692#pragma pipeloop(0)
693        for (i = 0; i < (xsize + 7) / 8; i++) {
694          s00 = s01;
695          s10 = s11;
696          s20 = s21;
697          s01 = buff0[i + 1];
698          s11 = buff1[i + 1];
699          s21 = buff2[i + 1];
700          s0 = vis_faligndata(s00, s01);
701          s1 = vis_faligndata(s10, s11);
702          s2 = vis_faligndata(s20, s21);
703
704          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
705          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
706          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
707          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
708          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
709          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
710
711          d0 = buffd[2 * i];
712          d1 = buffd[2 * i + 1];
713          d0 = vis_fpadd16(d0, d00);
714          d0 = vis_fpadd16(d0, d10);
715          d0 = vis_fpadd16(d0, d20);
716          d1 = vis_fpadd16(d1, d01);
717          d1 = vis_fpadd16(d1, d11);
718          d1 = vis_fpadd16(d1, d21);
719
720          dd = vis_fpack16_pair(d0, d1);
721          dp[i] = dd;
722
723          buffd[2 * i] = drnd;
724          buffd[2 * i + 1] = drnd;
725        }
726      }
727      else {                                /* if (jk_size == 4) */
728
729        dp = buffe;
730
731        s01 = buff0[0];
732        s11 = buff1[0];
733        s21 = buff2[0];
734        s31 = buff3[0];
735#pragma pipeloop(0)
736        for (i = 0; i < (xsize + 7) / 8; i++) {
737          s00 = s01;
738          s10 = s11;
739          s20 = s21;
740          s30 = s31;
741          s01 = buff0[i + 1];
742          s11 = buff1[i + 1];
743          s21 = buff2[i + 1];
744          s31 = buff3[i + 1];
745          s0 = vis_faligndata(s00, s01);
746          s1 = vis_faligndata(s10, s11);
747          s2 = vis_faligndata(s20, s21);
748          s3 = vis_faligndata(s30, s31);
749
750          d00 = vis_fmul8x16au(vis_read_hi(s0), k0);
751          d01 = vis_fmul8x16au(vis_read_lo(s0), k0);
752          d10 = vis_fmul8x16au(vis_read_hi(s1), k1);
753          d11 = vis_fmul8x16au(vis_read_lo(s1), k1);
754          d20 = vis_fmul8x16au(vis_read_hi(s2), k2);
755          d21 = vis_fmul8x16au(vis_read_lo(s2), k2);
756          d30 = vis_fmul8x16au(vis_read_hi(s3), k3);
757          d31 = vis_fmul8x16au(vis_read_lo(s3), k3);
758
759          d0 = buffd[2 * i];
760          d1 = buffd[2 * i + 1];
761          d0 = vis_fpadd16(d0, d00);
762          d0 = vis_fpadd16(d0, d10);
763          d0 = vis_fpadd16(d0, d20);
764          d0 = vis_fpadd16(d0, d30);
765          d1 = vis_fpadd16(d1, d01);
766          d1 = vis_fpadd16(d1, d11);
767          d1 = vis_fpadd16(d1, d21);
768          d1 = vis_fpadd16(d1, d31);
769
770          dd = vis_fpack16_pair(d0, d1);
771          dp[i] = dd;
772
773          buffd[2 * i] = drnd;
774          buffd[2 * i + 1] = drnd;
775        }
776      }
777
778      (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan);
779
780      sl += sll;
781      dl += dll;
782
783      buff_ind++;
784
785      if (buff_ind >= (n + 1))
786        buff_ind = 0;
787    }
788
789    testchan <<= 1;
790  }
791
792  mlib_free(pbuff);
793
794  if (buffs != buffs_local)
795    mlib_free(buffs);
796
797  return MLIB_SUCCESS;
798}
799
800/***************************************************************/
801