1/*
2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27/*
28 * FUNCTION
29 *   Internal functions for mlib_ImageConv* on U8/S16/U16 types and
30 *   MLIB_EDGE_DST_NO_WRITE mask
31 */
32
33#include "mlib_image.h"
34#include "mlib_ImageConv.h"
35#include "mlib_c_ImageConv.h"
36
/*
  This define switches between functions of different data types.

  IMG_TYPE selects the pixel type this translation unit is built for:
    1 - mlib_u8, 2 - mlib_s16, 3 - mlib_u16.

  Each branch below defines the same set of names:
    DTYPE          pixel type of the source and destination images
    CONV_FUNC(K)   name of the floating-point convolution entry point
    CONV_FUNC_I(K) name of the integer convolution entry point
    DSCALE         factor the kernel is multiplied by before 2^scale is
                   divided out (see CONV_FUNC(MxN))
    FROM_S32(x)    converts a clamped 32-bit result into a DTYPE value by
                   keeping its upper bits; the XOR in the unsigned variants
                   undoes the 2^31 bias introduced by SAT_OFF
    S64TOS32(x)    masks a 64-bit value to its low 32 bits where the pixel
                   type is signed (used by LOAD_BUFF)
    SAT_OFF        text appended to the argument of D2I; for the unsigned
                   types it subtracts 2^31, for mlib_s16 it is empty
*/
#define IMG_TYPE 1

/***************************************************************/
#if IMG_TYPE == 1

#define DTYPE             mlib_u8
#define CONV_FUNC(KERN)   mlib_c_conv##KERN##nw_u8
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8
#define DSCALE            (1 << 24)
#define FROM_S32(x)       (((x) >> 24) ^ 128)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#elif IMG_TYPE == 2

#define DTYPE             mlib_s16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_s16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16
#define DSCALE            65536.0
#define FROM_S32(x)       ((x) >> 16)
#define S64TOS32(x)       ((x) & 0xffffffff)
#define SAT_OFF

#elif IMG_TYPE == 3

#define DTYPE             mlib_u16
#define CONV_FUNC(KERN)   mlib_conv##KERN##nw_u16
#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16
#define DSCALE            65536.0
#define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
#define S64TOS32(x)       (x)
#define SAT_OFF           -(1u << 31)

#endif /* IMG_TYPE == 1 */
74
/***************************************************************/
/* Number of elements in the on-stack scratch buffers of the convolution
   functions; larger requirements are satisfied with mlib_malloc. */
#define BUFF_SIZE   1600

/* Byte budget mlib_ImageConv1xN divides by the source stride to decide how
   many rows it processes per pass. */
#define CACHE_SIZE  (64*1024)

/***************************************************************/
/* Floating-point type used for kernel coefficients and accumulators. */
#define FTYPE mlib_d64

/*
 * CLAMP_S32(x) converts a floating-point value to mlib_s32.
 * Unless MLIB_USE_FTOI_CLAMPING is defined, the value is saturated to
 * [MLIB_S32_MIN, MLIB_S32_MAX] explicitly; otherwise a plain cast is used.
 * Note that x is evaluated more than once in the saturating variant.
 */
#ifndef MLIB_USE_FTOI_CLAMPING

#define CLAMP_S32(x)                                            \
  (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))

#else

#define CLAMP_S32(x) ((mlib_s32)(x))

#endif /* MLIB_USE_FTOI_CLAMPING */

/***************************************************************/
/* D2I(x): apply the per-type offset SAT_OFF to x (SAT_OFF is pasted right
   after the parenthesised argument) and convert the result with CLAMP_S32. */
#define D2I(x) CLAMP_S32((x) SAT_OFF)
96
/***************************************************************/
/*
 * STORE2(res0, res1) writes two results to dp[0] and dp[chan1].
 * On little-endian builds the two arguments are swapped.
 * Expects `dp` and `chan1` to be in scope at the point of use.
 */
#ifdef _LITTLE_ENDIAN

#define STORE2(res0, res1)                                      \
  dp[0    ] = res1;                                             \
  dp[chan1] = res0

#else

#define STORE2(res0, res1)                                      \
  dp[0    ] = res0;                                             \
  dp[chan1] = res1

#endif /* _LITTLE_ENDIAN */

/***************************************************************/
/*
 * LOAD_BUFF(buff) copies the two source pixels sp[0] and sp[chan1] into the
 * 32-bit slots buff[i] and buff[i + 1].
 * With _NO_LONGLONG defined this is done with two separate stores; otherwise
 * both values are packed into one 64-bit word (ordered according to the
 * endianness) and written with a single mlib_s64 store through a cast of
 * (buff + i).
 * Expects `i`, `sp` and `chan1` to be in scope at the point of use.
 */
#ifdef _NO_LONGLONG

#define LOAD_BUFF(buff)                                         \
  buff[i    ] = sp[0];                                          \
  buff[i + 1] = sp[chan1]

#else /* _NO_LONGLONG */

#ifdef _LITTLE_ENDIAN

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])

#else /* _LITTLE_ENDIAN */

#define LOAD_BUFF(buff)                                         \
  *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])

#endif /* _LITTLE_ENDIAN */
#endif /* _NO_LONGLONG */
133
/***************************************************************/
/*
 * Overlay of one 64-bit double with pairs of 32-bit words.
 * CONV_FUNC(MxN) loads 8 bytes through the d64 member and then reads the
 * two 32-bit integers written by LOAD_BUFF through i32s.i0 / i32s.i1.
 */
typedef union {
  mlib_d64 d64;       /* the 8 bytes viewed as a double */
  struct {
    mlib_s32 i0;      /* first 32-bit word in memory order */
    mlib_s32 i1;      /* second 32-bit word in memory order */
  } i32s;
  struct {
    mlib_s32 f0;
    mlib_s32 f1;
  } f32s;
} d64_2x32;
146
/***************************************************************/
/*
 * DEF_VARS(type) declares the locals shared by the convolution functions:
 *   adr_src/adr_dst  start of the source / destination pixel data
 *   sl/dl, sp/dp     current line and current pixel pointers
 *   pbuff            scratch buffer pointer, initialised to the local array
 *                    `buff`, which must already be declared
 *   wid, hgt         image width and height
 *   sll, dll         source / destination line strides
 *   nchannel, chan1  channel count and per-pixel step
 *   i, j, c          loop counters
 */
#define DEF_VARS(type)                                          \
  type     *adr_src, *sl, *sp = NULL;                           \
  type     *adr_dst, *dl, *dp = NULL;                           \
  FTYPE    *pbuff = buff;                                       \
  mlib_s32 wid, hgt, sll, dll;                                  \
  mlib_s32 nchannel, chan1;                                     \
  mlib_s32 i, j, c

/***************************************************************/
/*
 * GET_SRC_DST_PARAMETERS(type) fills the variables declared by DEF_VARS
 * from the images `src` and `dst`. Width, height and channel count are
 * taken from src only; the strides returned by mlib_ImageGetStride are
 * divided by sizeof(type), so sll and dll are counted in elements.
 */
#define GET_SRC_DST_PARAMETERS(type)                            \
  hgt = mlib_ImageGetHeight(src);                               \
  wid = mlib_ImageGetWidth(src);                                \
  nchannel = mlib_ImageGetChannels(src);                        \
  sll = mlib_ImageGetStride(src) / sizeof(type);                \
  dll = mlib_ImageGetStride(dst) / sizeof(type);                \
  adr_src = (type *)mlib_ImageGetData(src);                     \
  adr_dst = (type *)mlib_ImageGetData(dst)
165
/***************************************************************/
/*
 * CLAMP_STORE(dst, val) saturates the integer val to the range of the
 * current pixel type and stores it in dst. It is only defined when __sparc
 * is not defined. The macro expands to a bare if/else statement and
 * evaluates val more than once, so it must be used as a full statement with
 * a side-effect-free argument.
 */
#ifndef __sparc

#if IMG_TYPE == 1

/* Test for the presence of any "1" bit in bits
   8 to 31 of val. If present, then val is either
   negative or >255. If over/underflows of 8 bits
   are uncommon, then this technique can be a win,
   since only a single test, rather than two, is
   necessary to determine if clamping is needed.
   On the other hand, if over/underflows are common,
   it adds an extra test.
*/
#define CLAMP_STORE(dst, val)                                   \
  if (val & 0xffffff00) {                                       \
    if (val < MLIB_U8_MIN)                                      \
      dst = MLIB_U8_MIN;                                        \
    else                                                        \
      dst = MLIB_U8_MAX;                                        \
  } else {                                                      \
    dst = (mlib_u8)val;                                         \
  }

#elif IMG_TYPE == 2

/* Two-sided comparison against the mlib_s16 limits. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_S16_MAX)                                      \
    dst = MLIB_S16_MAX;                                         \
  else if (val <= MLIB_S16_MIN)                                 \
    dst = MLIB_S16_MIN;                                         \
  else                                                          \
    dst = (mlib_s16)val

#elif IMG_TYPE == 3

/* Two-sided comparison against the mlib_u16 limits. */
#define CLAMP_STORE(dst, val)                                   \
  if (val >= MLIB_U16_MAX)                                      \
    dst = MLIB_U16_MAX;                                         \
  else if (val <= MLIB_U16_MIN)                                 \
    dst = MLIB_U16_MIN;                                         \
  else                                                          \
    dst = (mlib_u16)val

#endif /* IMG_TYPE == 1 */
#endif /* __sparc */
212
213/***************************************************************/
/* MAX_KER: widest run of kernel columns handled by one unrolled inner loop
   of the MxN functions; wider kernel rows are split into several runs. */
#define MAX_KER   7
/* MAX_N: largest kernel height for which the on-stack row-pointer table of
   CONV_FUNC(MxN), sized 2*(MAX_N + 1), is sufficient. */
#define MAX_N    15
216
217static mlib_status mlib_ImageConv1xN(mlib_image       *dst,
218                                     const mlib_image *src,
219                                     const mlib_d64   *k,
220                                     mlib_s32         n,
221                                     mlib_s32         dn,
222                                     mlib_s32         cmask)
223{
224  FTYPE    buff[BUFF_SIZE];
225  mlib_s32 off, kh;
226  mlib_s32 d0, d1;
227  const FTYPE    *pk;
228  FTYPE    k0, k1, k2, k3;
229  FTYPE    p0, p1, p2, p3, p4;
230  DEF_VARS(DTYPE);
231  DTYPE    *sl_c, *dl_c, *sl0;
232  mlib_s32 l, hsize, max_hsize;
233  GET_SRC_DST_PARAMETERS(DTYPE);
234
235  hgt -= (n - 1);
236  adr_dst += dn*dll;
237
238  max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll;
239
240  if (!max_hsize) max_hsize = 1;
241
242  if (max_hsize > BUFF_SIZE) {
243    pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize);
244  }
245
246  chan1 = nchannel;
247
248  sl_c = adr_src;
249  dl_c = adr_dst;
250
251  for (l = 0; l < hgt; l += hsize) {
252    hsize = hgt - l;
253
254    if (hsize > max_hsize) hsize = max_hsize;
255
256    for (c = 0; c < nchannel; c++) {
257      if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
258
259      sl = sl_c + c;
260      dl = dl_c + c;
261
262#ifdef __SUNPRO_C
263#pragma pipeloop(0)
264#endif /* __SUNPRO_C */
265      for (j = 0; j < hsize; j++) pbuff[j] = 0.0;
266
267      for (i = 0; i < wid; i++) {
268        sl0 = sl;
269
270        for (off = 0; off < (n - 4); off += 4) {
271          pk = k + off;
272          sp = sl0;
273
274          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
275          p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
276          sp += 3*sll;
277
278#ifdef __SUNPRO_C
279#pragma pipeloop(0)
280#endif /* __SUNPRO_C */
281          for (j = 0; j < hsize; j += 2) {
282            p0 = p2; p1 = p3; p2 = p4;
283            p3 = sp[0];
284            p4 = sp[sll];
285
286            pbuff[j    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
287            pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
288
289            sp += 2*sll;
290          }
291
292          sl0 += 4*sll;
293        }
294
295        pk = k + off;
296        sp = sl0;
297
298        k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
299        p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll];
300
301        dp = dl;
302        kh = n - off;
303
304        if (kh == 4) {
305          sp += 3*sll;
306
307#ifdef __SUNPRO_C
308#pragma pipeloop(0)
309#endif /* __SUNPRO_C */
310          for (j = 0; j <= (hsize - 2); j += 2) {
311            p0 = p2; p1 = p3; p2 = p4;
312            p3 = sp[0];
313            p4 = sp[sll];
314
315            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
316            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]);
317
318            dp[0  ] = FROM_S32(d0);
319            dp[dll] = FROM_S32(d1);
320
321            pbuff[j] = 0;
322            pbuff[j + 1] = 0;
323
324            sp += 2*sll;
325            dp += 2*dll;
326          }
327
328          if (j < hsize) {
329            p0 = p2; p1 = p3; p2 = p4;
330            p3 = sp[0];
331
332            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]);
333
334            pbuff[j] = 0;
335
336            dp[0] = FROM_S32(d0);
337          }
338
339        } else if (kh == 3) {
340          sp += 2*sll;
341
342#ifdef __SUNPRO_C
343#pragma pipeloop(0)
344#endif /* __SUNPRO_C */
345          for (j = 0; j <= (hsize - 2); j += 2) {
346            p0 = p2; p1 = p3;
347            p2 = sp[0];
348            p3 = sp[sll];
349
350            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
351            d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]);
352
353            dp[0  ] = FROM_S32(d0);
354            dp[dll] = FROM_S32(d1);
355
356            pbuff[j] = 0;
357            pbuff[j + 1] = 0;
358
359            sp += 2*sll;
360            dp += 2*dll;
361          }
362
363          if (j < hsize) {
364            p0 = p2; p1 = p3;
365            p2 = sp[0];
366
367            d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]);
368
369            pbuff[j] = 0;
370
371            dp[0] = FROM_S32(d0);
372          }
373
374        } else if (kh == 2) {
375          sp += sll;
376
377#ifdef __SUNPRO_C
378#pragma pipeloop(0)
379#endif /* __SUNPRO_C */
380          for (j = 0; j <= (hsize - 2); j += 2) {
381            p0 = p2;
382            p1 = sp[0];
383            p2 = sp[sll];
384
385            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
386            d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]);
387
388            dp[0  ] = FROM_S32(d0);
389            dp[dll] = FROM_S32(d1);
390
391            pbuff[j] = 0;
392            pbuff[j + 1] = 0;
393
394            sp += 2*sll;
395            dp += 2*dll;
396          }
397
398          if (j < hsize) {
399            p0 = p2;
400            p1 = sp[0];
401
402            d0 = D2I(p0*k0 + p1*k1 + pbuff[j]);
403
404            pbuff[j] = 0;
405
406            dp[0] = FROM_S32(d0);
407          }
408
409        } else /* if (kh == 1) */ {
410#ifdef __SUNPRO_C
411#pragma pipeloop(0)
412#endif /* __SUNPRO_C */
413          for (j = 0; j < hsize; j++) {
414            p0 = sp[0];
415
416            d0 = D2I(p0*k0 + pbuff[j]);
417
418            dp[0] = FROM_S32(d0);
419
420            pbuff[j] = 0;
421
422            sp += sll;
423            dp += dll;
424          }
425        }
426
427        sl += chan1;
428        dl += chan1;
429      }
430    }
431
432    sl_c += max_hsize*sll;
433    dl_c += max_hsize*dll;
434  }
435
436  if (pbuff != buff) mlib_free(pbuff);
437
438  return MLIB_SUCCESS;
439}
440
441/***************************************************************/
442mlib_status CONV_FUNC(MxN)(mlib_image       *dst,
443                           const mlib_image *src,
444                           const mlib_s32   *kernel,
445                           mlib_s32         m,
446                           mlib_s32         n,
447                           mlib_s32         dm,
448                           mlib_s32         dn,
449                           mlib_s32         scale,
450                           mlib_s32         cmask)
451{
452  FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
453  FTYPE    **buffs = buffs_arr, *buffd;
454  FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
455  mlib_s32 mn, l, off, kw, bsize, buff_ind;
456  mlib_s32 d0, d1;
457  FTYPE    k0, k1, k2, k3, k4, k5, k6;
458  FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
459  d64_2x32 dd;
460  DEF_VARS(DTYPE);
461  mlib_s32 chan2;
462  mlib_s32 *buffo, *buffi;
463  mlib_status status = MLIB_SUCCESS;
464
465  GET_SRC_DST_PARAMETERS(DTYPE);
466
467  if (scale > 30) {
468    fscale *= 1.0/(1 << 30);
469    scale -= 30;
470  }
471
472  fscale /= (1 << scale);
473
474  mn = m*n;
475
476  if (mn > 256) {
477    k = mlib_malloc(mn*sizeof(mlib_d64));
478
479    if (k == NULL) return MLIB_FAILURE;
480  }
481
482  for (i = 0; i < mn; i++) {
483    k[i] = kernel[i]*fscale;
484  }
485
486  if (m == 1) {
487    status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
488    FREE_AND_RETURN_STATUS;
489  }
490
491  bsize = (n + 3)*wid;
492
493  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
494    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
495
496    if (pbuff == NULL) {
497      status = MLIB_FAILURE;
498      FREE_AND_RETURN_STATUS;
499    }
500    buffs = (FTYPE   **)(pbuff + bsize);
501  }
502
503  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
504  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
505  buffd = buffs[n] + wid;
506  buffo = (mlib_s32*)(buffd + wid);
507  buffi = buffo + (wid &~ 1);
508
509  chan1 = nchannel;
510  chan2 = chan1 + chan1;
511
512  wid -= (m - 1);
513  hgt -= (n - 1);
514  adr_dst += dn*dll + dm*nchannel;
515
516  for (c = 0; c < nchannel; c++) {
517    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
518
519    sl = adr_src + c;
520    dl = adr_dst + c;
521
522    for (l = 0; l < n; l++) {
523      FTYPE    *buff = buffs[l];
524
525#ifdef __SUNPRO_C
526#pragma pipeloop(0)
527#endif /* __SUNPRO_C */
528      for (i = 0; i < wid + (m - 1); i++) {
529        buff[i] = (FTYPE)sl[i*chan1];
530      }
531
532      sl += sll;
533    }
534
535    buff_ind = 0;
536
537#ifdef __SUNPRO_C
538#pragma pipeloop(0)
539#endif /* __SUNPRO_C */
540    for (i = 0; i < wid; i++) buffd[i] = 0.0;
541
542    for (j = 0; j < hgt; j++) {
543      FTYPE    **buffc = buffs + buff_ind;
544      FTYPE    *buffn = buffc[n];
545      FTYPE    *pk = k;
546
547      for (l = 0; l < n; l++) {
548        FTYPE    *buff_l = buffc[l];
549
550        for (off = 0; off < m;) {
551          FTYPE    *buff = buff_l + off;
552
553          kw = m - off;
554
555          if (kw > 2*MAX_KER) kw = MAX_KER; else
556            if (kw > MAX_KER) kw = kw/2;
557          off += kw;
558
559          sp = sl;
560          dp = dl;
561
562          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
563          p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
564
565          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
566          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
567          pk += kw;
568
569          if (kw == 7) {
570
571            if (l < (n - 1) || off < m) {
572#ifdef __SUNPRO_C
573#pragma pipeloop(0)
574#endif /* __SUNPRO_C */
575              for (i = 0; i <= (wid - 2); i += 2) {
576                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
577
578                p6 = buff[i + 6]; p7 = buff[i + 7];
579
580                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
581                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
582              }
583
584            } else {
585#ifdef __SUNPRO_C
586#pragma pipeloop(0)
587#endif /* __SUNPRO_C */
588              for (i = 0; i <= (wid - 2); i += 2) {
589                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
590
591                p6 = buff[i + 6]; p7 = buff[i + 7];
592
593                LOAD_BUFF(buffi);
594
595                dd.d64 = *(FTYPE   *)(buffi + i);
596                buffn[i    ] = (FTYPE)dd.i32s.i0;
597                buffn[i + 1] = (FTYPE)dd.i32s.i1;
598
599                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
600                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
601
602                dp[0    ] = FROM_S32(d0);
603                dp[chan1] = FROM_S32(d1);
604
605                buffd[i    ] = 0.0;
606                buffd[i + 1] = 0.0;
607
608                sp += chan2;
609                dp += chan2;
610              }
611            }
612
613          } else if (kw == 6) {
614
615            if (l < (n - 1) || off < m) {
616#ifdef __SUNPRO_C
617#pragma pipeloop(0)
618#endif /* __SUNPRO_C */
619              for (i = 0; i <= (wid - 2); i += 2) {
620                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
621
622                p5 = buff[i + 5]; p6 = buff[i + 6];
623
624                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
625                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
626              }
627
628            } else {
629#ifdef __SUNPRO_C
630#pragma pipeloop(0)
631#endif /* __SUNPRO_C */
632              for (i = 0; i <= (wid - 2); i += 2) {
633                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
634
635                p5 = buff[i + 5]; p6 = buff[i + 6];
636
637                buffn[i    ] = (FTYPE)sp[0];
638                buffn[i + 1] = (FTYPE)sp[chan1];
639
640                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
641                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
642
643                dp[0    ] = FROM_S32(d0);
644                dp[chan1] = FROM_S32(d1);
645
646                buffd[i    ] = 0.0;
647                buffd[i + 1] = 0.0;
648
649                sp += chan2;
650                dp += chan2;
651              }
652            }
653
654          } else if (kw == 5) {
655
656            if (l < (n - 1) || off < m) {
657#ifdef __SUNPRO_C
658#pragma pipeloop(0)
659#endif /* __SUNPRO_C */
660              for (i = 0; i <= (wid - 2); i += 2) {
661                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
662
663                p4 = buff[i + 4]; p5 = buff[i + 5];
664
665                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
666                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
667              }
668
669            } else {
670#ifdef __SUNPRO_C
671#pragma pipeloop(0)
672#endif /* __SUNPRO_C */
673              for (i = 0; i <= (wid - 2); i += 2) {
674                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
675
676                p4 = buff[i + 4]; p5 = buff[i + 5];
677
678                buffn[i    ] = (FTYPE)sp[0];
679                buffn[i + 1] = (FTYPE)sp[chan1];
680
681                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
682                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
683
684                dp[0    ] = FROM_S32(d0);
685                dp[chan1] = FROM_S32(d1);
686
687                buffd[i    ] = 0.0;
688                buffd[i + 1] = 0.0;
689
690                sp += chan2;
691                dp += chan2;
692              }
693            }
694
695          } else if (kw == 4) {
696
697            if (l < (n - 1) || off < m) {
698#ifdef __SUNPRO_C
699#pragma pipeloop(0)
700#endif /* __SUNPRO_C */
701              for (i = 0; i <= (wid - 2); i += 2) {
702                p0 = p2; p1 = p3; p2 = p4;
703
704                p3 = buff[i + 3]; p4 = buff[i + 4];
705
706                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
707                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
708              }
709
710            } else {
711#ifdef __SUNPRO_C
712#pragma pipeloop(0)
713#endif /* __SUNPRO_C */
714              for (i = 0; i <= (wid - 2); i += 2) {
715                p0 = p2; p1 = p3; p2 = p4;
716
717                p3 = buff[i + 3]; p4 = buff[i + 4];
718
719                buffn[i    ] = (FTYPE)sp[0];
720                buffn[i + 1] = (FTYPE)sp[chan1];
721
722                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
723                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
724
725                dp[0    ] = FROM_S32(d0);
726                dp[chan1] = FROM_S32(d1);
727
728                buffd[i    ] = 0.0;
729                buffd[i + 1] = 0.0;
730
731                sp += chan2;
732                dp += chan2;
733              }
734            }
735
736          } else if (kw == 3) {
737
738            if (l < (n - 1) || off < m) {
739#ifdef __SUNPRO_C
740#pragma pipeloop(0)
741#endif /* __SUNPRO_C */
742              for (i = 0; i <= (wid - 2); i += 2) {
743                p0 = p2; p1 = p3;
744
745                p2 = buff[i + 2]; p3 = buff[i + 3];
746
747                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
748                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
749              }
750
751            } else {
752#ifdef __SUNPRO_C
753#pragma pipeloop(0)
754#endif /* __SUNPRO_C */
755              for (i = 0; i <= (wid - 2); i += 2) {
756                p0 = p2; p1 = p3;
757
758                p2 = buff[i + 2]; p3 = buff[i + 3];
759
760                buffn[i    ] = (FTYPE)sp[0];
761                buffn[i + 1] = (FTYPE)sp[chan1];
762
763                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
764                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
765
766                dp[0    ] = FROM_S32(d0);
767                dp[chan1] = FROM_S32(d1);
768
769                buffd[i    ] = 0.0;
770                buffd[i + 1] = 0.0;
771
772                sp += chan2;
773                dp += chan2;
774              }
775            }
776
777          } else /*if (kw == 2)*/ {
778
779            if (l < (n - 1) || off < m) {
780#ifdef __SUNPRO_C
781#pragma pipeloop(0)
782#endif /* __SUNPRO_C */
783              for (i = 0; i <= (wid - 2); i += 2) {
784                p0 = p2;
785
786                p1 = buff[i + 1]; p2 = buff[i + 2];
787
788                buffd[i    ] += p0*k0 + p1*k1;
789                buffd[i + 1] += p1*k0 + p2*k1;
790              }
791
792            } else {
793#ifdef __SUNPRO_C
794#pragma pipeloop(0)
795#endif /* __SUNPRO_C */
796              for (i = 0; i <= (wid - 2); i += 2) {
797                p0 = p2;
798
799                p1 = buff[i + 1]; p2 = buff[i + 2];
800
801                buffn[i    ] = (FTYPE)sp[0];
802                buffn[i + 1] = (FTYPE)sp[chan1];
803
804                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
805                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
806
807                dp[0    ] = FROM_S32(d0);
808                dp[chan1] = FROM_S32(d1);
809
810                buffd[i    ] = 0.0;
811                buffd[i + 1] = 0.0;
812
813                sp += chan2;
814                dp += chan2;
815              }
816            }
817          }
818        }
819      }
820
821      /* last pixels */
822      for (; i < wid; i++) {
823        FTYPE    *pk = k, s = 0;
824        mlib_s32 x, d0;
825
826        for (l = 0; l < n; l++) {
827          FTYPE    *buff = buffc[l] + i;
828
829          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
830        }
831
832        d0 = D2I(s);
833        dp[0] = FROM_S32(d0);
834
835        buffn[i] = (FTYPE)sp[0];
836
837        sp += chan1;
838        dp += chan1;
839      }
840
841      for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];
842
843      /* next line */
844      sl += sll;
845      dl += dll;
846
847      buff_ind++;
848
849      if (buff_ind >= n + 1) buff_ind = 0;
850    }
851  }
852
853  FREE_AND_RETURN_STATUS;
854}
855
856/***************************************************************/
857#ifndef __sparc /* for x86, using integer multiplies is faster */
858
/*
 * STORE_RES(res, x): shift the integer sum x right by shift2 (modifying x in
 * place), then saturate it and store it in res via CLAMP_STORE.
 * Expects `shift2` to be in scope; expands to more than one statement.
 */
#define STORE_RES(res, x)                                       \
  x >>= shift2;                                                 \
  CLAMP_STORE(res, x)
862
863mlib_status CONV_FUNC_I(MxN)(mlib_image       *dst,
864                             const mlib_image *src,
865                             const mlib_s32   *kernel,
866                             mlib_s32         m,
867                             mlib_s32         n,
868                             mlib_s32         dm,
869                             mlib_s32         dn,
870                             mlib_s32         scale,
871                             mlib_s32         cmask)
872{
873  mlib_s32 buff[BUFF_SIZE], *buffd = buff;
874  mlib_s32 l, off, kw;
875  mlib_s32 d0, d1, shift1, shift2;
876  mlib_s32 k0, k1, k2, k3, k4, k5, k6;
877  mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
878  DTYPE    *adr_src, *sl, *sp = NULL;
879  DTYPE    *adr_dst, *dl, *dp = NULL;
880  mlib_s32 wid, hgt, sll, dll;
881  mlib_s32 nchannel, chan1;
882  mlib_s32 i, j, c;
883  mlib_s32 chan2;
884  mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
885  GET_SRC_DST_PARAMETERS(DTYPE);
886
887#if IMG_TYPE != 1
888  shift1 = 16;
889#else
890  shift1 = 8;
891#endif /* IMG_TYPE != 1 */
892  shift2 = scale - shift1;
893
894  chan1 = nchannel;
895  chan2 = chan1 + chan1;
896
897  wid -= (m - 1);
898  hgt -= (n - 1);
899  adr_dst += dn*dll + dm*nchannel;
900
901  if (wid > BUFF_SIZE) {
902    buffd = mlib_malloc(sizeof(mlib_s32)*wid);
903
904    if (buffd == NULL) return MLIB_FAILURE;
905  }
906
907  if (m*n > MAX_N*MAX_N) {
908    k = mlib_malloc(sizeof(mlib_s32)*(m*n));
909
910    if (k == NULL) {
911      if (buffd != buff) mlib_free(buffd);
912      return MLIB_FAILURE;
913    }
914  }
915
916  for (i = 0; i < m*n; i++) {
917    k[i] = kernel[i] >> shift1;
918  }
919
920  for (c = 0; c < nchannel; c++) {
921    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
922
923    sl = adr_src + c;
924    dl = adr_dst + c;
925
926#ifdef __SUNPRO_C
927#pragma pipeloop(0)
928#endif /* __SUNPRO_C */
929    for (i = 0; i < wid; i++) buffd[i] = 0;
930
931    for (j = 0; j < hgt; j++) {
932      mlib_s32 *pk = k;
933
934      for (l = 0; l < n; l++) {
935        DTYPE *sp0 = sl + l*sll;
936
937        for (off = 0; off < m;) {
938          sp = sp0 + off*chan1;
939          dp = dl;
940
941          kw = m - off;
942
943          if (kw > 2*MAX_KER) kw = MAX_KER; else
944            if (kw > MAX_KER) kw = kw/2;
945          off += kw;
946
947          p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2];
948          p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];
949
950          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
951          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
952          pk += kw;
953
954          sp += (kw - 1)*chan1;
955
956          if (kw == 7) {
957
958            if (l < (n - 1) || off < m) {
959#ifdef __SUNPRO_C
960#pragma pipeloop(0)
961#endif /* __SUNPRO_C */
962              for (i = 0; i <= (wid - 2); i += 2) {
963                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
964                p6 = sp[0];
965                p7 = sp[chan1];
966
967                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
968                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
969
970                sp += chan2;
971              }
972
973            } else {
974#ifdef __SUNPRO_C
975#pragma pipeloop(0)
976#endif /* __SUNPRO_C */
977              for (i = 0; i <= (wid - 2); i += 2) {
978                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
979                p6 = sp[0];
980                p7 = sp[chan1];
981
982                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
983                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
984
985                STORE_RES(dp[0    ], d0);
986                STORE_RES(dp[chan1], d1);
987
988                buffd[i    ] = 0;
989                buffd[i + 1] = 0;
990
991                sp += chan2;
992                dp += chan2;
993              }
994            }
995
996          } else if (kw == 6) {
997
998            if (l < (n - 1) || off < m) {
999#ifdef __SUNPRO_C
1000#pragma pipeloop(0)
1001#endif /* __SUNPRO_C */
1002              for (i = 0; i <= (wid - 2); i += 2) {
1003                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1004                p5 = sp[0];
1005                p6 = sp[chan1];
1006
1007                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
1008                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
1009
1010                sp += chan2;
1011              }
1012
1013            } else {
1014#ifdef __SUNPRO_C
1015#pragma pipeloop(0)
1016#endif /* __SUNPRO_C */
1017              for (i = 0; i <= (wid - 2); i += 2) {
1018                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
1019                p5 = sp[0];
1020                p6 = sp[chan1];
1021
1022                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
1023                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
1024
1025                STORE_RES(dp[0    ], d0);
1026                STORE_RES(dp[chan1], d1);
1027
1028                buffd[i    ] = 0;
1029                buffd[i + 1] = 0;
1030
1031                sp += chan2;
1032                dp += chan2;
1033              }
1034            }
1035
1036          } else if (kw == 5) {
1037
1038            if (l < (n - 1) || off < m) {
1039#ifdef __SUNPRO_C
1040#pragma pipeloop(0)
1041#endif /* __SUNPRO_C */
1042              for (i = 0; i <= (wid - 2); i += 2) {
1043                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1044                p4 = sp[0];
1045                p5 = sp[chan1];
1046
1047                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
1048                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
1049
1050                sp += chan2;
1051              }
1052
1053            } else {
1054#ifdef __SUNPRO_C
1055#pragma pipeloop(0)
1056#endif /* __SUNPRO_C */
1057              for (i = 0; i <= (wid - 2); i += 2) {
1058                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
1059                p4 = sp[0];
1060                p5 = sp[chan1];
1061
1062                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
1063                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
1064
1065                STORE_RES(dp[0    ], d0);
1066                STORE_RES(dp[chan1], d1);
1067
1068                buffd[i    ] = 0;
1069                buffd[i + 1] = 0;
1070
1071                sp += chan2;
1072                dp += chan2;
1073              }
1074            }
1075
1076          } else if (kw == 4) {
1077
1078            if (l < (n - 1) || off < m) {
1079#ifdef __SUNPRO_C
1080#pragma pipeloop(0)
1081#endif /* __SUNPRO_C */
1082              for (i = 0; i <= (wid - 2); i += 2) {
1083                p0 = p2; p1 = p3; p2 = p4;
1084                p3 = sp[0];
1085                p4 = sp[chan1];
1086
1087                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
1088                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
1089
1090                sp += chan2;
1091              }
1092
1093            } else {
1094#ifdef __SUNPRO_C
1095#pragma pipeloop(0)
1096#endif /* __SUNPRO_C */
1097              for (i = 0; i <= (wid - 2); i += 2) {
1098                p0 = p2; p1 = p3; p2 = p4;
1099                p3 = sp[0];
1100                p4 = sp[chan1];
1101
1102                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
1103                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
1104
1105                STORE_RES(dp[0    ], d0);
1106                STORE_RES(dp[chan1], d1);
1107
1108                buffd[i    ] = 0;
1109                buffd[i + 1] = 0;
1110
1111                sp += chan2;
1112                dp += chan2;
1113              }
1114            }
1115
1116          } else if (kw == 3) {
1117
1118            if (l < (n - 1) || off < m) {
1119#ifdef __SUNPRO_C
1120#pragma pipeloop(0)
1121#endif /* __SUNPRO_C */
1122              for (i = 0; i <= (wid - 2); i += 2) {
1123                p0 = p2; p1 = p3;
1124                p2 = sp[0];
1125                p3 = sp[chan1];
1126
1127                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
1128                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
1129
1130                sp += chan2;
1131              }
1132
1133            } else {
1134#ifdef __SUNPRO_C
1135#pragma pipeloop(0)
1136#endif /* __SUNPRO_C */
1137              for (i = 0; i <= (wid - 2); i += 2) {
1138                p0 = p2; p1 = p3;
1139                p2 = sp[0];
1140                p3 = sp[chan1];
1141
1142                d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
1143                d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
1144
1145                STORE_RES(dp[0    ], d0);
1146                STORE_RES(dp[chan1], d1);
1147
1148                buffd[i    ] = 0;
1149                buffd[i + 1] = 0;
1150
1151                sp += chan2;
1152                dp += chan2;
1153              }
1154            }
1155
1156          } else if (kw == 2) {
1157
1158            if (l < (n - 1) || off < m) {
1159#ifdef __SUNPRO_C
1160#pragma pipeloop(0)
1161#endif /* __SUNPRO_C */
1162              for (i = 0; i <= (wid - 2); i += 2) {
1163                p0 = p2;
1164                p1 = sp[0];
1165                p2 = sp[chan1];
1166
1167                buffd[i    ] += p0*k0 + p1*k1;
1168                buffd[i + 1] += p1*k0 + p2*k1;
1169
1170                sp += chan2;
1171              }
1172
1173            } else {
1174#ifdef __SUNPRO_C
1175#pragma pipeloop(0)
1176#endif /* __SUNPRO_C */
1177              for (i = 0; i <= (wid - 2); i += 2) {
1178                p0 = p2;
1179                p1 = sp[0];
1180                p2 = sp[chan1];
1181
1182                d0 = (p0*k0 + p1*k1 + buffd[i    ]);
1183                d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
1184
1185                STORE_RES(dp[0    ], d0);
1186                STORE_RES(dp[chan1], d1);
1187
1188                buffd[i    ] = 0;
1189                buffd[i + 1] = 0;
1190
1191                sp += chan2;
1192                dp += chan2;
1193              }
1194            }
1195
1196          } else /*if (kw == 1)*/ {
1197
1198            if (l < (n - 1) || off < m) {
1199#ifdef __SUNPRO_C
1200#pragma pipeloop(0)
1201#endif /* __SUNPRO_C */
1202              for (i = 0; i <= (wid - 2); i += 2) {
1203                p0 = sp[0];
1204                p1 = sp[chan1];
1205
1206                buffd[i    ] += p0*k0;
1207                buffd[i + 1] += p1*k0;
1208
1209                sp += chan2;
1210              }
1211
1212            } else {
1213#ifdef __SUNPRO_C
1214#pragma pipeloop(0)
1215#endif /* __SUNPRO_C */
1216              for (i = 0; i <= (wid - 2); i += 2) {
1217                p0 = sp[0];
1218                p1 = sp[chan1];
1219
1220                d0 = (p0*k0 + buffd[i    ]);
1221                d1 = (p1*k0 + buffd[i + 1]);
1222
1223                STORE_RES(dp[0    ], d0);
1224                STORE_RES(dp[chan1], d1);
1225
1226                buffd[i    ] = 0;
1227                buffd[i + 1] = 0;
1228
1229                sp += chan2;
1230                dp += chan2;
1231              }
1232            }
1233          }
1234        }
1235      }
1236
1237      /* last pixels */
1238      for (; i < wid; i++) {
1239        mlib_s32 *pk = k, s = 0;
1240        mlib_s32 x;
1241
1242        for (l = 0; l < n; l++) {
1243          sp = sl + l*sll + i*chan1;
1244
1245          for (x = 0; x < m; x++) {
1246            s += sp[0] * pk[0];
1247            sp += chan1;
1248            pk ++;
1249          }
1250        }
1251
1252        STORE_RES(dp[0], s);
1253
1254        sp += chan1;
1255        dp += chan1;
1256      }
1257
1258      sl += sll;
1259      dl += dll;
1260    }
1261  }
1262
1263  if (buffd != buff) mlib_free(buffd);
1264  if (k != k_locl) mlib_free(k);
1265
1266  return MLIB_SUCCESS;
1267}
1268
1269/***************************************************************/
1270#endif /* __sparc ( for x86, using integer multiplies is faster ) */
1271
1272/***************************************************************/
1273