1/*
2 * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
/*
 *      The functions in this file step along each destination line from
 *      xLeft to xRight and apply bicubic filtering to the source pixels.
 *
 */
33
34#include "vis_proto.h"
35#include "mlib_ImageAffine.h"
36#include "mlib_v_ImageFilters.h"
37
38/***************************************************************/
/* Element type of the images handled in this file (mlib_s16 samples). */
#define DTYPE  mlib_s16

/*
 * NOTE(review): not referenced directly in the visible part of this file;
 * presumably FILTER_SHIFT / FILTER_MASK (used by the macros below) are
 * derived from it in one of the included headers -- confirm.
 */
#define FILTER_BITS  9

/***************************************************************/
/* Short alias used by the macros below for the current source pointer. */
#define sPtr srcPixelPtr
45
46/***************************************************************/
/*
 * Locate the source data for the pixel at the current (X, Y), 1 channel.
 *
 * Reads:  X, Y      - source coordinates; the integer part is obtained by
 *                     shifting right by MLIB_SHIFT.
 *         lineAddr  - table of per-row source pointers, indexed by row.
 * Writes: xSrc, ySrc - integer part of X / Y minus one.
 *         sPtr       - mlib_s16 pointer to column xSrc of row ySrc; the
 *                      LOAD/BC macros read four consecutive rows from here.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement
 * (safe under an unbraced if/else).
 */
#define NEXT_PIXEL_1BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc;                   \
  } while (0)
51
52/***************************************************************/
/*
 * Load everything needed to filter one 1-channel pixel, then advance X/Y.
 *
 * Parameters:
 *   mlib_filters_s16   - table from which the x coefficients are read
 *                        (one mlib_d64 at byte offset filterposx).
 *   mlib_filters_s16_4 - table from which the y coefficients are read
 *                        (four mlib_d64 at byte offset filterposy * 4).
 *
 * For four consecutive source rows starting at sPtr (rows are srcYStride
 * mlib_s16 elements apart) the two aligned 8-byte words around sPtr are
 * loaded and merged with vis_alignaddr/vis_faligndata, producing
 * row0..row3 (the 8 bytes that start at sPtr in each row).
 *
 * Writes: row0..row3, yFilter0..yFilter3, xFilter, filterposx, filterposy,
 *         yPtr, dpSrc, data0, data1.
 * Side effects: sPtr is left on the fourth row; X += dX; Y += dY.
 *
 * FILTER_SHIFT and FILTER_MASK are defined outside this file section.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define LOAD_BC_S16_1CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)      \
  do {                                                                    \
    /* row 0 */                                                           \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    row0 = vis_faligndata(data0, data1);                                  \
    /* row 1 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    row1 = vis_faligndata(data0, data1);                                  \
    /* row 2 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    row2 = vis_faligndata(data0, data1);                                  \
    /* row 3 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    row3 = vis_faligndata(data0, data1);                                  \
    /* y coefficients: four doubles per filter position */                \
    filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
    yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
    yFilter0 = yPtr[0];                                                   \
    yFilter1 = yPtr[1];                                                   \
    yFilter2 = yPtr[2];                                                   \
    yFilter3 = yPtr[3];                                                   \
    /* x coefficients: one double per filter position */                  \
    filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
    xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
    X += dX;                                                              \
    Y += dY;                                                              \
  } while (0)
87
88/***************************************************************/
/*
 * Compute one 1-channel output pixel from the data loaded by
 * LOAD_BC_S16_1CH_1PIXEL (or left behind by BC_S16_1CH).
 *
 * Reads:  row0..row3, yFilter0..yFilter3, xFilter, f_x01000100.
 * Writes: res (packed result, stored by the caller with vis_st_u16),
 *         plus scratch u0..u3, v0..v3, sum, d00, d10, d0, d1, p0.
 *
 * Steps:
 *   1. each row is multiplied by its y coefficient using the
 *      vis_fmul8sux16 + vis_fmul8ulx16 pair and the four rows are summed;
 *   2. the column sum is multiplied by xFilter the same way;
 *   3. the four 16-bit lanes are reduced to one value (vis_fpadd16s,
 *      vis_fmuld8sux16 with 0x01000100, vis_fpadd32s) and packed with
 *      vis_fpackfix_pair.
 *
 * Only the low half of d1 is written before packing (vis_write_lo).
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define RESULT_1BC_S16_1PIXEL()                                            \
  do {                                                                     \
    u0 = vis_fmul8sux16(row0, yFilter0);                                   \
    u1 = vis_fmul8ulx16(row0, yFilter0);                                   \
    u2 = vis_fmul8sux16(row1, yFilter1);                                   \
    v0 = vis_fpadd16(u0, u1);                                              \
    u3 = vis_fmul8ulx16(row1, yFilter1);                                   \
    u0 = vis_fmul8sux16(row2, yFilter2);                                   \
    v1 = vis_fpadd16(u2, u3);                                              \
    u1 = vis_fmul8ulx16(row2, yFilter2);                                   \
    sum = vis_fpadd16(v0, v1);                                             \
    u2 = vis_fmul8sux16(row3, yFilter3);                                   \
    v2 = vis_fpadd16(u0, u1);                                              \
    u3 = vis_fmul8ulx16(row3, yFilter3);                                   \
    sum = vis_fpadd16(sum, v2);                                            \
    v3 = vis_fpadd16(u2, u3);                                              \
    sum = vis_fpadd16(sum, v3);                                            \
    d00 = vis_fmul8sux16(sum, xFilter);                                    \
    d10 = vis_fmul8ulx16(sum, xFilter);                                    \
    d0 = vis_fpadd16(d00, d10);                                            \
    p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));                   \
    d0 = vis_fmuld8sux16(f_x01000100, p0);                                 \
    d1 = vis_write_lo(d1, vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0))); \
    res = vis_fpackfix_pair(d1, d1);                                       \
  } while (0)
112
113/***************************************************************/
/*
 * One stage of the hand-scheduled 1-channel pipeline: filter the pixel
 * whose data is already in registers while loading the next one.
 *
 * Parameters:
 *   ind                - digit pasted onto "d0" to name the output
 *                        accumulator (d00, d01, d02 or d03).
 *   mlib_filters_s16   - table the x coefficients are read from.
 *   mlib_filters_s16_4 - table the y coefficients are read from.
 *
 * On entry: row0..row3 / yFilter0..3 / xFilter describe pixel k,
 *           sPtr addresses the source data of pixel k+1 and X, Y are the
 *           coordinates of pixel k+1.
 * On exit:  d0<ind> holds the four x-weighted partial sums of pixel k
 *           (reduced later by FADD_1BC_S16);
 *           row0..row3 / yFilter0..3 / xFilter describe pixel k+1;
 *           X, Y have been advanced by dX, dY and sPtr addresses the
 *           source data for the new X, Y.
 *
 * The statement order interleaves arithmetic with loads and must be kept:
 * each row register is reloaded only after its old value has been used,
 * and the filter positions are sampled before X and Y are advanced.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define BC_S16_1CH(ind, mlib_filters_s16, mlib_filters_s16_4)             \
  do {                                                                    \
    u0 = vis_fmul8sux16(row0, yFilter0);                                  \
    u1 = vis_fmul8ulx16(row0, yFilter0);                                  \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u2 = vis_fmul8sux16(row1, yFilter1);                                  \
    v0 = vis_fpadd16(u0, u1);                                             \
    data0 = dpSrc[0];                                                     \
    filterposy = (Y >> FILTER_SHIFT);                                     \
    u3 = vis_fmul8ulx16(row1, yFilter1);                                  \
    data1 = dpSrc[1];                                                     \
    row0 = vis_faligndata(data0, data1);                                  \
    filterposx = (X >> FILTER_SHIFT);                                     \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u0 = vis_fmul8sux16(row2, yFilter2);                                  \
    v1 = vis_fpadd16(u2, u3);                                             \
    data0 = dpSrc[0];                                                     \
    u1 = vis_fmul8ulx16(row2, yFilter2);                                  \
    sum = vis_fpadd16(v0, v1);                                            \
    X += dX;                                                              \
    data1 = dpSrc[1];                                                     \
    row1 = vis_faligndata(data0, data1);                                  \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u2 = vis_fmul8sux16(row3, yFilter3);                                  \
    v2 = vis_fpadd16(u0, u1);                                             \
    Y += dY;                                                              \
    xSrc = (X >> MLIB_SHIFT)-1;                                           \
    data0 = dpSrc[0];                                                     \
    u3 = vis_fmul8ulx16(row3, yFilter3);                                  \
    sum = vis_fpadd16(sum, v2);                                           \
    ySrc = (Y >> MLIB_SHIFT)-1;                                           \
    data1 = dpSrc[1];                                                     \
    filterposy &= FILTER_MASK;                                            \
    row2 = vis_faligndata(data0, data1);                                  \
    sPtr += srcYStride;                                                   \
    filterposx &= FILTER_MASK;                                            \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    v3 = vis_fpadd16(u2, u3);                                             \
    data1 = dpSrc[1];                                                     \
    row3 = vis_faligndata(data0, data1);                                  \
    yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
    yFilter0 = yPtr[0];                                                   \
    sum = vis_fpadd16(sum, v3);                                           \
    yFilter1 = yPtr[1];                                                   \
    d0 = vis_fmul8sux16(sum, xFilter);                                    \
    yFilter2 = yPtr[2];                                                   \
    d1 = vis_fmul8ulx16(sum, xFilter);                                    \
    yFilter3 = yPtr[3];                                                   \
    xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
    d0##ind = vis_fpadd16(d0, d1);                                        \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc;                             \
  } while (0)
171
172/***************************************************************/
/*
 * Reduce the four accumulators produced by BC_S16_1CH(0..3) to four
 * packed 16-bit output pixels.
 *
 * Reads:  d00, d01, d02, d03 (one per pixel), f_x01000100.
 * Writes: res (four packed results, in accumulator order d00..d03),
 *         plus scratch p0..p3 and d0..d3.
 *
 * For each accumulator the high and low halves are added (vis_fpadd16s),
 * widened with vis_fmuld8sux16, and the two 32-bit values are added
 * (vis_fpadd32s); the four 32-bit sums are then packed with
 * vis_fpackfix_pair.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define FADD_1BC_S16()                                                  \
  do {                                                                  \
    p0 = vis_fpadd16s(vis_read_hi(d00), vis_read_lo(d00));              \
    p1 = vis_fpadd16s(vis_read_hi(d01), vis_read_lo(d01));              \
    p2 = vis_fpadd16s(vis_read_hi(d02), vis_read_lo(d02));              \
    p3 = vis_fpadd16s(vis_read_hi(d03), vis_read_lo(d03));              \
    d0 = vis_fmuld8sux16(f_x01000100, p0);                              \
    d1 = vis_fmuld8sux16(f_x01000100, p1);                              \
    d2 = vis_fmuld8sux16(f_x01000100, p2);                              \
    d3 = vis_fmuld8sux16(f_x01000100, p3);                              \
    d0 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0)),  \
                       vis_fpadd32s(vis_read_hi(d1), vis_read_lo(d1))); \
    d1 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d2), vis_read_lo(d2)),  \
                       vis_fpadd32s(vis_read_hi(d3), vis_read_lo(d3))); \
    res = vis_fpackfix_pair(d0, d1);                                    \
  } while (0)
187
188/***************************************************************/
189mlib_status mlib_ImageAffine_s16_1ch_bc (mlib_affine_param *param)
190{
191  DECLAREVAR_BC();
192  mlib_s32  filterposx, filterposy;
193  mlib_d64  data0, data1;
194  mlib_d64  sum;
195  mlib_d64  row0, row1, row2, row3;
196  mlib_f32  p0, p1, p2, p3;
197  mlib_d64  xFilter, yFilter0, yFilter1, yFilter2, yFilter3;
198  mlib_d64  v0, v1, v2, v3;
199  mlib_d64  u0, u1, u2, u3;
200  mlib_d64  d0, d1, d2, d3;
201  mlib_d64  d00, d10, d01, d02, d03;
202  mlib_d64 *yPtr;
203  mlib_d64 *dpSrc;
204  mlib_s32  align, cols, i;
205  mlib_d64  res;
206  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
207  const mlib_s16 *mlib_filters_table  ;
208  const mlib_s16 *mlib_filters_table_4;
209
210  if (filter == MLIB_BICUBIC) {
211    mlib_filters_table   = mlib_filters_s16_bc;
212    mlib_filters_table_4 = mlib_filters_s16_bc_4;
213  } else {
214    mlib_filters_table   = mlib_filters_s16_bc2;
215    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
216  }
217
218  srcYStride >>= 1;
219
220  for (j = yStart; j <= yFinish; j++) {
221
222    vis_write_gsr(10 << 3);
223
224    CLIP(1);
225
226    cols = xRight - xLeft + 1;
227    align = (8 - ((mlib_addr)dstPixelPtr) & 7) & 7;
228    align >>= 1;
229    align = (cols < align)? cols : align;
230
231    for (i = 0; i < align; i++) {
232      NEXT_PIXEL_1BC_S16();
233      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
234      RESULT_1BC_S16_1PIXEL();
235      vis_st_u16(res, dstPixelPtr++);
236    }
237
238    if (i <= cols - 10) {
239
240      NEXT_PIXEL_1BC_S16();
241      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
242
243      NEXT_PIXEL_1BC_S16();
244
245      BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
246      BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
247      BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
248      BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
249
250      FADD_1BC_S16();
251
252      BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
253      BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
254      BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
255      BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
256
257#pragma pipeloop(0)
258      for (; i <= cols - 14; i += 4) {
259        *(mlib_d64*)dstPixelPtr = res;
260        FADD_1BC_S16();
261        BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
262        BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
263        BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
264        BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
265        dstPixelPtr += 4;
266      }
267
268      *(mlib_d64*)dstPixelPtr = res;
269      dstPixelPtr += 4;
270      FADD_1BC_S16();
271      *(mlib_d64*)dstPixelPtr = res;
272      dstPixelPtr += 4;
273
274      RESULT_1BC_S16_1PIXEL();
275      vis_st_u16(res, dstPixelPtr++);
276
277      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
278      RESULT_1BC_S16_1PIXEL();
279      vis_st_u16(res, dstPixelPtr++);
280      i += 10;
281    }
282
283    for (; i < cols; i++) {
284      NEXT_PIXEL_1BC_S16();
285      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
286      RESULT_1BC_S16_1PIXEL();
287      vis_st_u16(res, dstPixelPtr++);
288    }
289  }
290
291  return MLIB_SUCCESS;
292}
293
294/***************************************************************/
/*
 * Locate the source data for the pixel at the current (X, Y), 2 channels.
 *
 * Reads:  X, Y      - source coordinates; the integer part is obtained by
 *                     shifting right by MLIB_SHIFT.
 *         lineAddr  - table of per-row source pointers, indexed by row.
 * Writes: xSrc, ySrc - integer part of X / Y minus one.
 *         sPtr       - mlib_s16 pointer to element 2 * xSrc of row ySrc
 *                      (two samples per pixel).
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement
 * (safe under an unbraced if/else).
 */
#define NEXT_PIXEL_2BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1);            \
  } while (0)
299
300/***************************************************************/
/*
 * Load everything needed to filter one 2-channel pixel, then advance X/Y.
 *
 * Parameters:
 *   mlib_filters_s16   - table from which the x coefficients are read
 *                        (one mlib_d64 at byte offset filterposx).
 *   mlib_filters_s16_4 - table from which the y coefficients are read
 *                        (four mlib_d64 at byte offset filterposy * 4).
 *
 * For four consecutive source rows starting at sPtr (rows are srcYStride
 * mlib_s16 elements apart) three aligned 8-byte words are loaded and
 * merged with vis_alignaddr/vis_faligndata into two doubles per row:
 * rowN0 holds the 8 bytes that start at sPtr, rowN1 the following 8 bytes.
 *
 * Writes: row00..row31, yFilter0..yFilter3, xFilter, filterposx,
 *         filterposy, yPtr, dpSrc, data0..data2.
 * Side effects: sPtr is left on the fourth row; X += dX; Y += dY.
 *
 * FILTER_SHIFT and FILTER_MASK are defined outside this file section.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define LOAD_BC_S16_2CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)      \
  do {                                                                    \
    /* row 0 */                                                           \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    data2 = dpSrc[2];                                                     \
    row00 = vis_faligndata(data0, data1);                                 \
    row01 = vis_faligndata(data1, data2);                                 \
    /* row 1 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    data2 = dpSrc[2];                                                     \
    row10 = vis_faligndata(data0, data1);                                 \
    row11 = vis_faligndata(data1, data2);                                 \
    /* row 2 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    data2 = dpSrc[2];                                                     \
    row20 = vis_faligndata(data0, data1);                                 \
    row21 = vis_faligndata(data1, data2);                                 \
    /* row 3 */                                                           \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    data1 = dpSrc[1];                                                     \
    data2 = dpSrc[2];                                                     \
    row30 = vis_faligndata(data0, data1);                                 \
    row31 = vis_faligndata(data1, data2);                                 \
    /* y coefficients: four doubles per filter position */                \
    filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
    yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
    yFilter0 = yPtr[0];                                                   \
    yFilter1 = yPtr[1];                                                   \
    yFilter2 = yPtr[2];                                                   \
    yFilter3 = yPtr[3];                                                   \
    /* x coefficients: one double per filter position */                  \
    filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
    xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
    X += dX;                                                              \
    Y += dY;                                                              \
  } while (0)
343
344/***************************************************************/
/*
 * Compute one 2-channel output pixel from the data loaded by
 * LOAD_BC_S16_2CH_1PIXEL (or left behind by BC_S16_2CH).
 *
 * Reads:  row00..row31, yFilter0..yFilter3, xFilter, f_x01000100.
 * Writes: res, plus scratch u00..u21, v00..v31, sum0, sum1, dr, dr1,
 *         xFilter0, xFilter1, d00, d10, d20, d30, d0, d1, p0.
 *
 * Steps:
 *   1. the vis_fpmerge sequence rearranges the four 16-bit x coefficients
 *      {c0,c1,c2,c3} of xFilter into xFilter0 = {c0,c0,c1,c1} and
 *      xFilter1 = {c2,c2,c3,c3}, i.e. one copy per channel;
 *   2. each row half is multiplied by its y coefficient
 *      (vis_fmul8sux16 + vis_fmul8ulx16) and the four rows are summed
 *      into sum0 (first half) and sum1 (second half);
 *   3. sum0/sum1 are multiplied by xFilter0/xFilter1 and added together;
 *   4. high and low halves are added (vis_fpadd16s), widened with
 *      vis_fmuld8sux16 and packed with vis_fpackfix_pair(d0, d0), so both
 *      halves of res carry the same pair of values.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define RESULT_2BC_S16_1PIXEL()                                   \
  do {                                                            \
    u00 = vis_fmul8sux16(row00, yFilter0);                        \
    dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
    u01 = vis_fmul8ulx16(row00, yFilter0);                        \
    dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));           \
    u10 = vis_fmul8sux16(row01, yFilter0);                        \
    dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));          \
    u11 = vis_fmul8ulx16(row01, yFilter0);                        \
    dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));           \
    u20 = vis_fmul8sux16(row10, yFilter1);                        \
    v00 = vis_fpadd16(u00, u01);                                  \
    u21 = vis_fmul8ulx16(row10, yFilter1);                        \
    v01 = vis_fpadd16(u10, u11);                                  \
    u00 = vis_fmul8sux16(row11, yFilter1);                        \
    xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));    \
    u01 = vis_fmul8ulx16(row11, yFilter1);                        \
    u10 = vis_fmul8sux16(row20, yFilter2);                        \
    u11 = vis_fmul8ulx16(row20, yFilter2);                        \
    v10 = vis_fpadd16(u20, u21);                                  \
    sum0 = vis_fpadd16(v00, v10);                                 \
    u20 = vis_fmul8sux16(row21, yFilter2);                        \
    v11 = vis_fpadd16(u00, u01);                                  \
    u21 = vis_fmul8ulx16(row21, yFilter2);                        \
    xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));    \
    u00 = vis_fmul8sux16(row30, yFilter3);                        \
    v20 = vis_fpadd16(u10, u11);                                  \
    sum1 = vis_fpadd16(v01, v11);                                 \
    u01 = vis_fmul8ulx16(row30, yFilter3);                        \
    sum0 = vis_fpadd16(sum0, v20);                                \
    v21 = vis_fpadd16(u20, u21);                                  \
    u10 = vis_fmul8sux16(row31, yFilter3);                        \
    v30 = vis_fpadd16(u00, u01);                                  \
    sum1 = vis_fpadd16(sum1, v21);                                \
    u11 = vis_fmul8ulx16(row31, yFilter3);                        \
    sum0 = vis_fpadd16(sum0, v30);                                \
    v31 = vis_fpadd16(u10, u11);                                  \
    sum1 = vis_fpadd16(sum1, v31);                                \
    d00 = vis_fmul8sux16(sum0, xFilter0);                         \
    d10 = vis_fmul8ulx16(sum0, xFilter0);                         \
    d20 = vis_fmul8sux16(sum1, xFilter1);                         \
    d30 = vis_fmul8ulx16(sum1, xFilter1);                         \
    d0 = vis_fpadd16(d00, d10);                                   \
    d1 = vis_fpadd16(d20, d30);                                   \
    d0 = vis_fpadd16(d0, d1);                                     \
    p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
    d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
    res = vis_fpackfix_pair(d0, d0);                              \
  } while (0)
392
393/***************************************************************/
/*
 * One stage of the hand-scheduled 2-channel pipeline: filter the pixel
 * whose data is already in registers while loading the next one.
 *
 * Parameters:
 *   ind                - digit pasted onto "d0" and "d1" to name the two
 *                        output accumulators (d00/d10 or d01/d11).
 *   mlib_filters_s16   - table the x coefficients are read from.
 *   mlib_filters_s16_4 - table the y coefficients are read from.
 *
 * On entry: row00..row31 / yFilter0..3 / xFilter describe pixel k,
 *           sPtr addresses the source data of pixel k+1 and X, Y are the
 *           coordinates of pixel k+1.
 * On exit:  d0<ind> and d1<ind> hold the x-weighted partial sums of the
 *           first and second row halves of pixel k (combined later by
 *           FADD_2BC_S16);
 *           row00..row31 / yFilter0..3 / xFilter describe pixel k+1;
 *           X, Y have been advanced by dX, dY and sPtr addresses the
 *           source data for the new X, Y.
 *
 * The vis_fpmerge sequence expands xFilter {c0,c1,c2,c3} into
 * xFilter0 = {c0,c0,c1,c1} and xFilter1 = {c2,c2,c3,c3}.
 *
 * The statement order interleaves arithmetic with loads and must be kept:
 * each row register is reloaded only after its old value has been used,
 * and the filter positions are sampled before X and Y are advanced.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define BC_S16_2CH(ind, mlib_filters_s16, mlib_filters_s16_4)             \
  do {                                                                    \
    u00 = vis_fmul8sux16(row00, yFilter0);                                \
    dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));         \
    u01 = vis_fmul8ulx16(row00, yFilter0);                                \
    dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));                   \
    u10 = vis_fmul8sux16(row01, yFilter0);                                \
    dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));                  \
    u11 = vis_fmul8ulx16(row01, yFilter0);                                \
    dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u20 = vis_fmul8sux16(row10, yFilter1);                                \
    v00 = vis_fpadd16(u00, u01);                                          \
    u21 = vis_fmul8ulx16(row10, yFilter1);                                \
    data0 = dpSrc[0];                                                     \
    filterposy = (Y >> FILTER_SHIFT);                                     \
    v01 = vis_fpadd16(u10, u11);                                          \
    data1 = dpSrc[1];                                                     \
    u00 = vis_fmul8sux16(row11, yFilter1);                                \
    xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));            \
    data2 = dpSrc[2];                                                     \
    u01 = vis_fmul8ulx16(row11, yFilter1);                                \
    row00 = vis_faligndata(data0, data1);                                 \
    u10 = vis_fmul8sux16(row20, yFilter2);                                \
    row01 = vis_faligndata(data1, data2);                                 \
    filterposx = (X >> FILTER_SHIFT);                                     \
    sPtr += srcYStride;                                                   \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u11 = vis_fmul8ulx16(row20, yFilter2);                                \
    v10 = vis_fpadd16(u20, u21);                                          \
    data0 = dpSrc[0];                                                     \
    sum0 = vis_fpadd16(v00, v10);                                         \
    X += dX;                                                              \
    data1 = dpSrc[1];                                                     \
    u20 = vis_fmul8sux16(row21, yFilter2);                                \
    v11 = vis_fpadd16(u00, u01);                                          \
    data2 = dpSrc[2];                                                     \
    row10 = vis_faligndata(data0, data1);                                 \
    u21 = vis_fmul8ulx16(row21, yFilter2);                                \
    row11 = vis_faligndata(data1, data2);                                 \
    sPtr += srcYStride;                                                   \
    xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));            \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    u00 = vis_fmul8sux16(row30, yFilter3);                                \
    v20 = vis_fpadd16(u10, u11);                                          \
    Y += dY;                                                              \
    xSrc = (X >> MLIB_SHIFT)-1;                                           \
    sum1 = vis_fpadd16(v01, v11);                                         \
    data0 = dpSrc[0];                                                     \
    u01 = vis_fmul8ulx16(row30, yFilter3);                                \
    sum0 = vis_fpadd16(sum0, v20);                                        \
    ySrc = (Y >> MLIB_SHIFT)-1;                                           \
    data1 = dpSrc[1];                                                     \
    v21 = vis_fpadd16(u20, u21);                                          \
    u10 = vis_fmul8sux16(row31, yFilter3);                                \
    data2 = dpSrc[2];                                                     \
    v30 = vis_fpadd16(u00, u01);                                          \
    filterposy &= FILTER_MASK;                                            \
    row20 = vis_faligndata(data0, data1);                                 \
    sum1 = vis_fpadd16(sum1, v21);                                        \
    u11 = vis_fmul8ulx16(row31, yFilter3);                                \
    row21 = vis_faligndata(data1, data2);                                 \
    sPtr += srcYStride;                                                   \
    filterposx &= FILTER_MASK;                                            \
    v31 = vis_fpadd16(u10, u11);                                          \
    vis_alignaddr(sPtr, 0);                                               \
    dpSrc = (mlib_d64*)(((mlib_addr)sPtr) & (~7));                        \
    data0 = dpSrc[0];                                                     \
    sum0 = vis_fpadd16(sum0, v30);                                        \
    data1 = dpSrc[1];                                                     \
    sum1 = vis_fpadd16(sum1, v31);                                        \
    data2 = dpSrc[2];                                                     \
    row30 = vis_faligndata(data0, data1);                                 \
    d0 = vis_fmul8sux16(sum0, xFilter0);                                  \
    row31 = vis_faligndata(data1, data2);                                 \
    yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
    d1 = vis_fmul8ulx16(sum0, xFilter0);                                  \
    yFilter0 = yPtr[0];                                                   \
    d2 = vis_fmul8sux16(sum1, xFilter1);                                  \
    yFilter1 = yPtr[1];                                                   \
    d3 = vis_fmul8ulx16(sum1, xFilter1);                                  \
    d0##ind = vis_fpadd16(d0, d1);                                        \
    yFilter2 = yPtr[2];                                                   \
    yFilter3 = yPtr[3];                                                   \
    d1##ind = vis_fpadd16(d2, d3);                                        \
    xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_s16 + filterposx));  \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1);                      \
  } while (0)
483
484/***************************************************************/
/*
 * Reduce the accumulators produced by BC_S16_2CH(0) and BC_S16_2CH(1)
 * to two packed 2-channel output pixels.
 *
 * Reads:  d00, d10 (first pixel), d01, d11 (second pixel), f_x01000100.
 * Writes: res (first pixel's pair followed by the second pixel's pair),
 *         plus scratch d0, d1, d2, p0, p1.
 *
 * For each pixel the two accumulators are added, the high and low halves
 * of the sum are added (vis_fpadd16s), the result is widened with
 * vis_fmuld8sux16, and both pixels are packed with vis_fpackfix_pair.
 * Wrapped in do { } while (0) so the macro acts as a single statement.
 */
#define FADD_2BC_S16()                                            \
  do {                                                            \
    d0 = vis_fpadd16(d00, d10);                                   \
    d2 = vis_fpadd16(d01, d11);                                   \
    p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
    p1 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
    d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
    d1 = vis_fmuld8sux16(f_x01000100, p1);                        \
    res = vis_fpackfix_pair(d0, d1);                              \
  } while (0)
493
494/***************************************************************/
495mlib_status mlib_ImageAffine_s16_2ch_bc (mlib_affine_param *param)
496{
497  DECLAREVAR_BC();
498  DTYPE  *dstLineEnd;
499  mlib_s32  filterposx, filterposy;
500  mlib_d64  data0, data1, data2;
501  mlib_d64  sum0, sum1;
502  mlib_d64  row00, row10, row20, row30;
503  mlib_d64  row01, row11, row21, row31;
504  mlib_f32  p0, p1;
505  mlib_d64  xFilter, xFilter0, xFilter1;
506  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
507  mlib_d64  v00, v01, v10, v11, v20, v21, v30, v31;
508  mlib_d64  u00, u01, u10, u11, u20, u21;
509  mlib_d64  d0, d1, d2, d3;
510  mlib_d64  d00, d10, d20, d30, d01, d11;
511  mlib_d64  *yPtr;
512  mlib_d64  *dp, *dpSrc;
513  mlib_s32  cols, i, mask, emask;
514  mlib_d64  res, res1;
515  mlib_d64  dr, dr1;
516  mlib_f32 f_x01000100 = vis_to_float(0x01000100);
517  const mlib_s16 *mlib_filters_table  ;
518  const mlib_s16 *mlib_filters_table_4;
519
520  if (filter == MLIB_BICUBIC) {
521    mlib_filters_table   = mlib_filters_s16_bc;
522    mlib_filters_table_4 = mlib_filters_s16_bc_4;
523  } else {
524    mlib_filters_table   = mlib_filters_s16_bc2;
525    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
526  }
527
528  srcYStride >>= 1;
529
530  for (j = yStart; j <= yFinish; j++) {
531
532    vis_write_gsr(10 << 3);
533
534    CLIP(2);
535    dstLineEnd  = (DTYPE*)dstData + 2 * xRight;
536
537    cols = xRight - xLeft + 1;
538    dp = vis_alignaddr(dstPixelPtr, 0);
539    dstLineEnd += 1;
540    mask = vis_edge16(dstPixelPtr, dstLineEnd);
541    i = 0;
542
543    if (i <= cols - 6) {
544
545      NEXT_PIXEL_2BC_S16();
546      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
547
548      NEXT_PIXEL_2BC_S16();
549
550      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
551      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
552
553      FADD_2BC_S16();
554
555      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
556      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
557
558#pragma pipeloop(0)
559      for (; i <= cols-8; i += 2) {
560        vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
561        res = vis_faligndata(res, res);
562        vis_pst_16(res, dp++, mask);
563        vis_pst_16(res, dp, ~mask);
564        FADD_2BC_S16();
565        BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
566        BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
567      }
568
569      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
570      res = vis_faligndata(res, res);
571      vis_pst_16(res, dp++, mask);
572      vis_pst_16(res, dp, ~mask);
573
574      FADD_2BC_S16();
575      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
576      res = vis_faligndata(res, res);
577      vis_pst_16(res, dp++, mask);
578      vis_pst_16(res, dp, ~mask);
579
580      RESULT_2BC_S16_1PIXEL();
581      res1 = res;
582
583      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
584      RESULT_2BC_S16_1PIXEL();
585      res = vis_write_hi(res, vis_read_hi(res1));
586      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
587      res = vis_faligndata(res, res);
588      vis_pst_16(res, dp++, mask);
589      vis_pst_16(res, dp, ~mask);
590
591      i += 6;
592    }
593
594    if (i <= cols - 4) {
595      NEXT_PIXEL_2BC_S16();
596      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
597
598      NEXT_PIXEL_2BC_S16();
599
600      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
601      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
602
603      FADD_2BC_S16();
604      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
605      res = vis_faligndata(res, res);
606      vis_pst_16(res, dp++, mask);
607      vis_pst_16(res, dp, ~mask);
608
609      RESULT_2BC_S16_1PIXEL();
610      res1 = res;
611
612      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
613      RESULT_2BC_S16_1PIXEL();
614      res = vis_write_hi(res, vis_read_hi(res1));
615      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
616      res = vis_faligndata(res, res);
617      vis_pst_16(res, dp++, mask);
618      vis_pst_16(res, dp, ~mask);
619
620      i += 4;
621    }
622
623    if (i <= cols - 2) {
624      NEXT_PIXEL_2BC_S16();
625      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
626      RESULT_2BC_S16_1PIXEL();
627      res1 = res;
628
629      NEXT_PIXEL_2BC_S16();
630      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
631      RESULT_2BC_S16_1PIXEL();
632      res = vis_write_hi(res, vis_read_hi(res1));
633      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
634      res = vis_faligndata(res, res);
635      vis_pst_16(res, dp++, mask);
636      vis_pst_16(res, dp, ~mask);
637
638      i += 2;
639    }
640
641    if (i < cols) {
642      NEXT_PIXEL_2BC_S16();
643      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
644      RESULT_2BC_S16_1PIXEL();
645      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
646      res = vis_faligndata(res, res);
647      emask = vis_edge16(dp, dstLineEnd);
648      vis_pst_16(res, dp++, mask & emask);
649
650      if ((mlib_s16*)dp <= dstLineEnd) {
651        mask = vis_edge16(dp, dstLineEnd);
652        vis_pst_16(res, dp, mask);
653      }
654    }
655  }
656
657  return MLIB_SUCCESS;
658}
659
660/***************************************************************/
/*
 * NEXT_PIXEL_3BC_S16: address the first source sample needed for the
 * current (X, Y) position of a 3-channel s16 image.
 *
 *   xSrc, ySrc - (X >> MLIB_SHIFT) - 1 and (Y >> MLIB_SHIFT) - 1, i.e. one
 *                column / one row before the shifted-down coordinate
 *   sPtr       - lineAddr[ySrc] viewed as mlib_s16 and advanced by
 *                3 * xSrc samples (three samples per pixel)
 *
 * X and Y are only read here; LOAD_BC_S16_3CH_1PIXEL and BC_S16_3CH are
 * the macros that advance them.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define NEXT_PIXEL_3BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc * 3);             \
  } while (0)
665
666/***************************************************************/
/*
 * LOAD_BC_S16_3CH_1PIXEL(mlib_filters_s16_3, mlib_filters_s16_4)
 *
 * Non-pipelined load of everything one 3-channel s16 output pixel needs.
 *
 *   mlib_filters_s16_3 - table the X coefficients are read from
 *                        (byte offset filterposx*3, three mlib_d64)
 *   mlib_filters_s16_4 - table the Y coefficients are read from
 *                        (byte offset filterposy*4, four mlib_d64)
 *
 * Starting at sPtr, four source rows are read, srcYStride samples apart.
 * For each row four aligned mlib_d64 words are fetched and
 * vis_faligndata recombines them into three vectors rowN0..rowN2
 * (12 s16 = 4 pixels x 3 channels) starting at the possibly unaligned
 * sPtr.  vis_alignaddr has to precede the vis_faligndata calls of the
 * same row because it provides the alignment they use.
 *
 * Side effects: sPtr ends three rows below its initial value;
 * filterposx/filterposy, xPtr/yPtr, xFilter0..2 and yFilter0..3 are set;
 * X is advanced by dX and Y by dY.
 *
 * NOTE(review): dpSrc[3] is read unconditionally.  When sPtr is already
 * 8-byte aligned the 12 samples fit in dpSrc[0..2], so the fourth word
 * lies beyond the data actually used - confirm the source buffers
 * tolerate that read.
 */
#define LOAD_BC_S16_3CH_1PIXEL(mlib_filters_s16_3, mlib_filters_s16_4)  \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 0 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  row00 = vis_faligndata(data0, data1);                                 \
  row01 = vis_faligndata(data1, data2);                                 \
  row02 = vis_faligndata(data2, data3);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 1 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  row10 = vis_faligndata(data0, data1);                                 \
  row11 = vis_faligndata(data1, data2);                                 \
  row12 = vis_faligndata(data2, data3);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 2 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  row20 = vis_faligndata(data0, data1);                                 \
  row21 = vis_faligndata(data1, data2);                                 \
  row22 = vis_faligndata(data2, data3);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 3 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  row30 = vis_faligndata(data0, data1);                                 \
  row31 = vis_faligndata(data1, data2);                                 \
  row32 = vis_faligndata(data2, data3);                                 \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;  /* Y coefficients */ \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
  yFilter0 = yPtr[0];                                                   \
  yFilter1 = yPtr[1];                                                   \
  yFilter2 = yPtr[2];                                                   \
  yFilter3 = yPtr[3];                                                   \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;  /* X coefficients */ \
  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
  xFilter0 = xPtr[0];                                                   \
  xFilter1 = xPtr[1];                                                   \
  xFilter2 = xPtr[2];                                                   \
  X += dX;  /* move on to the next output pixel */                      \
  Y += dY
716
717/***************************************************************/
/*
 * STORE_BC_S16_3CH_1PIXEL: write one 3-channel pixel to the destination.
 *
 * Copies the first three mlib_s16 lanes of the f0 union (f0.d is filled
 * by FADD_3BC_S16 / RESULT_3BC_S16_1PIXEL) to dstPixelPtr[0..2]; the
 * fourth lane is not stored.  dstPixelPtr is then advanced by three
 * samples so it addresses the next destination pixel.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define STORE_BC_S16_3CH_1PIXEL()                               \
  do {                                                          \
    dstPixelPtr[0] = f0.t[0];                                   \
    dstPixelPtr[1] = f0.t[1];                                   \
    dstPixelPtr[2] = f0.t[2];                                   \
    dstPixelPtr += 3;                                           \
  } while (0)
723
724/***************************************************************/
/*
 * RESULT_3BC_S16_1PIXEL: non-pipelined filter step for one 3-channel
 * s16 pixel.
 *
 * Inputs : row00..row32, yFilter0..3 and xFilter0..2, as left by
 *          LOAD_BC_S16_3CH_1PIXEL or by the load half of BC_S16_3CH.
 * Output : f0.d (lanes f0.t[0..2] are what STORE_BC_S16_3CH_1PIXEL
 *          writes out).
 *
 * The arithmetic is the same as BC_S16_3CH followed by FADD_3BC_S16,
 * but nothing is fetched for a following pixel and the row/filter
 * registers are left untouched:
 *
 *   1. every rowNM is multiplied by yFilterN (vis_fmul8sux16 and
 *      vis_fmul8ulx16 partial products summed with vis_fpadd16) and the
 *      four rows are accumulated into sum0..sum2;
 *   2. sum0..sum2 are multiplied the same way by xFilter0..2 -> d0..d2;
 *   3. d0..d2 are folded with byte-shifted copies (vis_alignaddr 6 and 2
 *      + vis_faligndata) so the totals land in d0;
 *   4. both halves of d0 are widened with vis_fmuld8sux16 against
 *      f_x01000100 and packed into f0.d by vis_fpackfix_pair.
 *
 * Clobbers u00..u21, v00..v32, sum0..sum2, d0..d4 and the alignment set
 * by vis_alignaddr (left at 2).
 */
#define RESULT_3BC_S16_1PIXEL()                                 \
  u00 = vis_fmul8sux16(row00, yFilter0);                        \
  u01 = vis_fmul8ulx16(row00, yFilter0);                        \
  u10 = vis_fmul8sux16(row01, yFilter0);                        \
  u11 = vis_fmul8ulx16(row01, yFilter0);                        \
  v00 = vis_fpadd16(u00, u01);                                  \
  u20 = vis_fmul8sux16(row02, yFilter0);                        \
  v01 = vis_fpadd16(u10, u11);                                  \
  u21 = vis_fmul8ulx16(row02, yFilter0);                        \
  u00 = vis_fmul8sux16(row10, yFilter1);                        \
  u01 = vis_fmul8ulx16(row10, yFilter1);                        \
  v02 = vis_fpadd16(u20, u21);                                  \
  u10 = vis_fmul8sux16(row11, yFilter1);                        \
  u11 = vis_fmul8ulx16(row11, yFilter1);                        \
  v10 = vis_fpadd16(u00, u01);                                  \
  u20 = vis_fmul8sux16(row12, yFilter1);                        \
  u21 = vis_fmul8ulx16(row12, yFilter1);                        \
  u00 = vis_fmul8sux16(row20, yFilter2);                        \
  v11 = vis_fpadd16(u10, u11);                                  \
  u01 = vis_fmul8ulx16(row20, yFilter2);                        \
  v12 = vis_fpadd16(u20, u21);                                  \
  u10 = vis_fmul8sux16(row21, yFilter2);                        \
  u11 = vis_fmul8ulx16(row21, yFilter2);                        \
  v20 = vis_fpadd16(u00, u01);                                  \
  u20 = vis_fmul8sux16(row22, yFilter2);                        \
  sum0 = vis_fpadd16(v00, v10);                                 \
  u21 = vis_fmul8ulx16(row22, yFilter2);                        \
  u00 = vis_fmul8sux16(row30, yFilter3);                        \
  u01 = vis_fmul8ulx16(row30, yFilter3);                        \
  v21 = vis_fpadd16(u10, u11);                                  \
  sum1 = vis_fpadd16(v01, v11);                                 \
  u10 = vis_fmul8sux16(row31, yFilter3);                        \
  sum2 = vis_fpadd16(v02, v12);                                 \
  v22 = vis_fpadd16(u20, u21);                                  \
  u11 = vis_fmul8ulx16(row31, yFilter3);                        \
  sum0 = vis_fpadd16(sum0, v20);                                \
  u20 = vis_fmul8sux16(row32, yFilter3);                        \
  v30 = vis_fpadd16(u00, u01);                                  \
  sum1 = vis_fpadd16(sum1, v21);                                \
  u21 = vis_fmul8ulx16(row32, yFilter3);                        \
  v31 = vis_fpadd16(u10, u11);                                  \
  sum2 = vis_fpadd16(sum2, v22);                                \
  v32 = vis_fpadd16(u20, u21);                                  \
  sum0 = vis_fpadd16(sum0, v30);                                \
  v00 = vis_fmul8sux16(sum0, xFilter0);                         \
  sum1 = vis_fpadd16(sum1, v31);                                \
  sum2 = vis_fpadd16(sum2, v32);                                \
  v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
  v10 = vis_fmul8sux16(sum1, xFilter1);                         \
  v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
  d0 = vis_fpadd16(v00, v01);                                   \
  v20 = vis_fmul8sux16(sum2, xFilter2);                         \
  v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
  d1 = vis_fpadd16(v10, v11);                                   \
  d2 = vis_fpadd16(v20, v21);                                   \
  vis_alignaddr((void*)6, 0);                                   \
  d3 = vis_faligndata(d0, d1);                                  \
  vis_alignaddr((void*)2, 0);                                   \
  d4 = vis_faligndata(d1, d2);                                  \
  d0 = vis_fpadd16(d0, d3);                                     \
  d2 = vis_fpadd16(d2, d4);                                     \
  d1 = vis_faligndata(d2, d2);                                  \
  d0 = vis_fpadd16(d0, d1);                                     \
  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
  f0.d = vis_fpackfix_pair(d2, d3)
792
793/***************************************************************/
/*
 * BC_S16_3CH(mlib_filters_s16_3, mlib_filters_s16_4)
 *
 * Software-pipelined step for 3-channel s16 data.  The statements that
 * belong to two different output pixels are interleaved by hand:
 *
 *   - Arithmetic for the pixel whose samples are already in
 *     row00..row32 and whose coefficients are in yFilter0..3 /
 *     xFilter0..2: every rowNM is multiplied by yFilterN
 *     (vis_fmul8sux16 + vis_fmul8ulx16 summed with vis_fpadd16), the
 *     four rows are accumulated into sum0..sum2, and those are
 *     multiplied by xFilter0..2 giving d0..d2.  The result is NOT
 *     folded or packed here; FADD_3BC_S16 does that.
 *
 *   - Loads for the following pixel: the four source rows at sPtr are
 *     fetched into row00..row32 (each rowNM is overwritten only after
 *     its last use above) and yFilter/xFilter are reloaded from the
 *     tables using filterposy/filterposx, which are sampled from Y and X
 *     before those are advanced and masked with FILTER_MASK later.
 *
 * On exit X and Y have been advanced by dX/dY, xSrc/ySrc are derived
 * from the advanced values, and sPtr addresses the source samples of
 * the pixel after the one just loaded.
 *
 * Statement order matters: every vis_faligndata uses the alignment set
 * by the nearest preceding vis_alignaddr, and the row/filter variables
 * are reused in place.  Do not reorder without re-checking both.
 *
 *   mlib_filters_s16_3 - table for X coefficients (byte offset filterposx*3)
 *   mlib_filters_s16_4 - table for Y coefficients (byte offset filterposy*4)
 */
#define BC_S16_3CH(mlib_filters_s16_3, mlib_filters_s16_4)              \
  u00 = vis_fmul8sux16(row00, yFilter0);                                \
  u01 = vis_fmul8ulx16(row00, yFilter0);                                \
  u10 = vis_fmul8sux16(row01, yFilter0);                                \
  u11 = vis_fmul8ulx16(row01, yFilter0);                                \
  v00 = vis_fpadd16(u00, u01);                                          \
  u20 = vis_fmul8sux16(row02, yFilter0);                                \
  v01 = vis_fpadd16(u10, u11);                                          \
  u21 = vis_fmul8ulx16(row02, yFilter0);                                \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  u00 = vis_fmul8sux16(row10, yFilter1);                                \
  u01 = vis_fmul8ulx16(row10, yFilter1);                                \
  data0 = dpSrc[0];                                                     \
  filterposy = (Y >> FILTER_SHIFT);                                     \
  v02 = vis_fpadd16(u20, u21);                                          \
  data1 = dpSrc[1];                                                     \
  u10 = vis_fmul8sux16(row11, yFilter1);                                \
  data2 = dpSrc[2];                                                     \
  u11 = vis_fmul8ulx16(row11, yFilter1);                                \
  v10 = vis_fpadd16(u00, u01);                                          \
  data3 = dpSrc[3];                                                     \
  u20 = vis_fmul8sux16(row12, yFilter1);                                \
  row00 = vis_faligndata(data0, data1);                                 \
  u21 = vis_fmul8ulx16(row12, yFilter1);                                \
  row01 = vis_faligndata(data1, data2);                                 \
  u00 = vis_fmul8sux16(row20, yFilter2);                                \
  row02 = vis_faligndata(data2, data3);                                 \
  filterposx = (X >> FILTER_SHIFT);                                     \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  v11 = vis_fpadd16(u10, u11);                                          \
  u01 = vis_fmul8ulx16(row20, yFilter2);                                \
  v12 = vis_fpadd16(u20, u21);                                          \
  data0 = dpSrc[0];                                                     \
  u10 = vis_fmul8sux16(row21, yFilter2);                                \
  X += dX;                                                              \
  data1 = dpSrc[1];                                                     \
  u11 = vis_fmul8ulx16(row21, yFilter2);                                \
  v20 = vis_fpadd16(u00, u01);                                          \
  data2 = dpSrc[2];                                                     \
  u20 = vis_fmul8sux16(row22, yFilter2);                                \
  sum0 = vis_fpadd16(v00, v10);                                         \
  data3 = dpSrc[3];                                                     \
  row10 = vis_faligndata(data0, data1);                                 \
  u21 = vis_fmul8ulx16(row22, yFilter2);                                \
  row11 = vis_faligndata(data1, data2);                                 \
  u00 = vis_fmul8sux16(row30, yFilter3);                                \
  row12 = vis_faligndata(data2, data3);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  u01 = vis_fmul8ulx16(row30, yFilter3);                                \
  v21 = vis_fpadd16(u10, u11);                                          \
  Y += dY;                                                              \
  xSrc = (X >> MLIB_SHIFT)-1;                                           \
  sum1 = vis_fpadd16(v01, v11);                                         \
  data0 = dpSrc[0];                                                     \
  u10 = vis_fmul8sux16(row31, yFilter3);                                \
  sum2 = vis_fpadd16(v02, v12);                                         \
  ySrc = (Y >> MLIB_SHIFT)-1;                                           \
  data1 = dpSrc[1];                                                     \
  v22 = vis_fpadd16(u20, u21);                                          \
  u11 = vis_fmul8ulx16(row31, yFilter3);                                \
  data2 = dpSrc[2];                                                     \
  sum0 = vis_fpadd16(sum0, v20);                                        \
  u20 = vis_fmul8sux16(row32, yFilter3);                                \
  data3 = dpSrc[3];                                                     \
  v30 = vis_fpadd16(u00, u01);                                          \
  filterposy &= FILTER_MASK;                                            \
  row20 = vis_faligndata(data0, data1);                                 \
  sum1 = vis_fpadd16(sum1, v21);                                        \
  u21 = vis_fmul8ulx16(row32, yFilter3);                                \
  row21 = vis_faligndata(data1, data2);                                 \
  row22 = vis_faligndata(data2, data3);                                 \
  sPtr += srcYStride;                                                   \
  filterposx &= FILTER_MASK;                                            \
  v31 = vis_fpadd16(u10, u11);                                          \
  dpSrc = vis_alignaddr(sPtr, 0);                                       \
  data0 = dpSrc[0];                                                     \
  sum2 = vis_fpadd16(sum2, v22);                                        \
  data1 = dpSrc[1];                                                     \
  v32 = vis_fpadd16(u20, u21);                                          \
  data2 = dpSrc[2];                                                     \
  sum0 = vis_fpadd16(sum0, v30);                                        \
  data3 = dpSrc[3];                                                     \
  row30 = vis_faligndata(data0, data1);                                 \
  v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
  row31 = vis_faligndata(data1, data2);                                 \
  row32 = vis_faligndata(data2, data3);                                 \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
  sum1 = vis_fpadd16(sum1, v31);                                        \
  yFilter0 = yPtr[0];                                                   \
  sum2 = vis_fpadd16(sum2, v32);                                        \
  v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
  yFilter1 = yPtr[1];                                                   \
  v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
  yFilter2 = yPtr[2];                                                   \
  v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
  d0 = vis_fpadd16(v00, v01);                                           \
  yFilter3 = yPtr[3];                                                   \
  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
  v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
  xFilter0 = xPtr[0];                                                   \
  v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
  d1 = vis_fpadd16(v10, v11);                                           \
  xFilter1 = xPtr[1];                                                   \
  d2 = vis_fpadd16(v20, v21);                                           \
  xFilter2 = xPtr[2];                                                   \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3)
902
903/***************************************************************/
/*
 * FADD_3BC_S16: reduce d0..d2 (as left by BC_S16_3CH) to one packed
 * result in f0.d.
 *
 * d0, d1 and d2 together hold twelve 16-bit lanes.  Byte-shifted copies
 * are built with vis_faligndata (offset 6 over d0:d1, offset 2 over
 * d1:d2, then d2 rotated by 2 bytes) and added lane-wise with
 * vis_fpadd16 so the totals end up in d0.
 * NOTE(review): assuming the X table stores every coefficient once per
 * channel (lane order tap0 c0,c1,c2, tap1 c0,...), lanes 0..2 of d0
 * then hold the per-channel sums over the four taps and lane 3 is
 * scratch - confirm against the filter tables.
 *
 * Each half of d0 is then widened with vis_fmuld8sux16 against the
 * constant f_x01000100 and the two halves are packed into f0.d by
 * vis_fpackfix_pair.  Only f0.t[0..2] are consumed by
 * STORE_BC_S16_3CH_1PIXEL.
 *
 * Clobbers d0..d4 and the alignment set by vis_alignaddr (left at 2),
 * so any later vis_faligndata needs a fresh vis_alignaddr first.
 */
#define FADD_3BC_S16()                                          \
  vis_alignaddr((void*)6, 0);                                   \
  d3 = vis_faligndata(d0, d1);                                  \
  vis_alignaddr((void*)2, 0);                                   \
  d4 = vis_faligndata(d1, d2);                                  \
  d0 = vis_fpadd16(d0, d3);                                     \
  d2 = vis_fpadd16(d2, d4);                                     \
  d1 = vis_faligndata(d2, d2);                                  \
  d0 = vis_fpadd16(d0, d1);                                     \
  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
  f0.d = vis_fpackfix_pair(d2, d3)
916
917/***************************************************************/
/*
 * mlib_ImageAffine_s16_3ch_bc: bicubic-filtered affine inner routine for
 * 3-channel mlib_s16 images, written with VIS intrinsics.
 *
 *   param - affine parameter block.  DECLAREVAR_BC() (defined outside
 *           this file section) presumably declares and initialises from
 *           it the locals used below without a visible declaration
 *           (filter, srcYStride, yStart, yFinish, j, xLeft, xRight,
 *           dstPixelPtr, X, Y, dX, dY, lineAddr, xSrc, ySrc, ...) -
 *           TODO confirm against mlib_ImageAffine.h.
 *
 * Returns MLIB_SUCCESS; no other return path is visible here.
 *
 * For every destination row j in yStart..yFinish the pixels
 * xLeft..xRight are produced.  Rows with at least four pixels go
 * through a software pipeline built from BC_S16_3CH / FADD_3BC_S16;
 * shorter rows use the plain LOAD / RESULT / STORE sequence.
 */
mlib_status mlib_ImageAffine_s16_3ch_bc (mlib_affine_param *param)
{
  DECLAREVAR_BC();
  mlib_s32  filterposx, filterposy;
  mlib_d64  data0, data1, data2, data3;
  mlib_d64  sum0, sum1, sum2;
  mlib_d64  row00, row10, row20, row30;
  mlib_d64  row01, row11, row21, row31;
  mlib_d64  row02, row12, row22, row32;
  mlib_d64  xFilter0, xFilter1, xFilter2;
  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
  mlib_d64  v00, v01, v02, v10, v11, v12, v20, v21, v22, v30, v31, v32;
  mlib_d64  u00, u01, u10, u11, u20, u21;
  mlib_d64  d0, d1, d2, d3, d4;
  mlib_d64 *yPtr, *xPtr;
  mlib_d64 *dpSrc;
  mlib_s32  cols, i;
  /* constant multiplier used by the widening step before vis_fpackfix_pair */
  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
  /* packed result of one pixel; t[0..2] are the three channels stored */
  union {
    mlib_s16 t[4];
    mlib_d64 d;
  } f0;
  const mlib_s16 *mlib_filters_table_3;
  const mlib_s16 *mlib_filters_table_4;

  /* MLIB_BICUBIC selects the "bc" tables, any other filter the "bc2" ones */
  if (filter == MLIB_BICUBIC) {
    mlib_filters_table_3 = mlib_filters_s16_bc_3;
    mlib_filters_table_4 = mlib_filters_s16_bc_4;
  } else {
    mlib_filters_table_3 = mlib_filters_s16_bc2_3;
    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
  }

  /* sPtr is an mlib_s16 pointer, so row steps are counted in 2-byte
     samples (assumes srcYStride arrives in bytes - TODO confirm) */
  srcYStride >>= 1;

  for (j = yStart; j <= yFinish; j++) {

    /* GSR is rewritten for every row; presumably 10 is the scale factor
       applied by vis_fpackfix_pair - TODO confirm */
    vis_write_gsr(10 << 3);

    /* CLIP is defined elsewhere; presumably it loads xLeft/xRight/X/Y and
       dstPixelPtr for row j and may skip empty rows - TODO confirm */
    CLIP(3);

    cols = xRight - xLeft + 1;

    i = 0;

    /* pipelined path: needs at least four pixels in the row */
    if (i <= cols - 4) {

      /* prologue: load the first pixel, then address the second one */
      NEXT_PIXEL_3BC_S16();
      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);

      NEXT_PIXEL_3BC_S16();

      /* filter the loaded pixel while loading the next; fold it into f0 */
      BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
      FADD_3BC_S16();

      BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);

      /* steady state: one pixel is stored per iteration while the two
         following ones are being folded and filtered/loaded */
#pragma pipeloop(0)
      for (; i < cols-4; i++) {
        STORE_BC_S16_3CH_1PIXEL();

        FADD_3BC_S16();
        BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
      }

      /* epilogue: drain the four pixels still in flight - one already in
         f0, one in d0..d2, one in the row variables, and one that is only
         addressed by sPtr and still has to be loaded */
      STORE_BC_S16_3CH_1PIXEL();

      FADD_3BC_S16();
      STORE_BC_S16_3CH_1PIXEL();

      RESULT_3BC_S16_1PIXEL();
      STORE_BC_S16_3CH_1PIXEL();

      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
      RESULT_3BC_S16_1PIXEL();
      STORE_BC_S16_3CH_1PIXEL();
      i += 4;
    }

    /* simple path: i == cols after the pipelined branch, so this loop
       only does work for rows with fewer than four pixels */
    for (; i < cols; i++) {
      NEXT_PIXEL_3BC_S16();
      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
      RESULT_3BC_S16_1PIXEL();
      STORE_BC_S16_3CH_1PIXEL();
    }
  }

  return MLIB_SUCCESS;
}
1007
1008/***************************************************************/
/*
 * NEXT_PIXEL_4BC_S16: address the first source sample needed for the
 * current (X, Y) position of a 4-channel s16 image.
 *
 *   xSrc, ySrc - (X >> MLIB_SHIFT) - 1 and (Y >> MLIB_SHIFT) - 1, i.e. one
 *                column / one row before the shifted-down coordinate
 *   sPtr       - lineAddr[ySrc] viewed as mlib_s16 and advanced by
 *                4 * xSrc samples (four samples per pixel)
 *
 * X and Y are only read here.
 *
 * The sample offset is computed with a multiplication rather than a left
 * shift: xSrc is -1 whenever X is below (1 << MLIB_SHIFT), and shifting a
 * negative signed value left is undefined in C.  The 3-channel variant
 * multiplies as well.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define NEXT_PIXEL_4BC_S16()                                    \
  do {                                                          \
    xSrc = (X >> MLIB_SHIFT) - 1;                               \
    ySrc = (Y >> MLIB_SHIFT) - 1;                               \
    sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc * 4);             \
  } while (0)
1013
1014/***************************************************************/
/*
 * LOAD_BC_S16_4CH_1PIXEL(mlib_filters_s16_4)
 *
 * Non-pipelined load of everything one 4-channel s16 output pixel needs.
 *
 *   mlib_filters_s16_4 - table used for BOTH directions: the Y
 *                        coefficients are read at byte offset
 *                        filterposy*4 and the X coefficients at byte
 *                        offset filterposx*4, four mlib_d64 each
 *
 * Starting at sPtr, four source rows are read, srcYStride samples apart.
 * For each row five aligned mlib_d64 words are fetched and
 * vis_faligndata recombines them into four vectors rowN0..rowN3
 * (16 s16 = 4 pixels x 4 channels) starting at the possibly unaligned
 * sPtr.  vis_alignaddr has to precede the vis_faligndata calls of the
 * same row because it provides the alignment they use.
 *
 * Side effects: sPtr ends three rows below its initial value;
 * filterposx/filterposy, xPtr/yPtr, xFilter0..3 and yFilter0..3 are set;
 * X is advanced by dX and Y by dY.
 *
 * NOTE(review): dpSrc[4] is read unconditionally.  When sPtr is already
 * 8-byte aligned the 16 samples fit in dpSrc[0..3], so the fifth word
 * lies beyond the data actually used - confirm the source buffers
 * tolerate that read.
 */
#define LOAD_BC_S16_4CH_1PIXEL(mlib_filters_s16_4)                      \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 0 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  data4 = dpSrc[4];                                                     \
  row00 = vis_faligndata(data0, data1);                                 \
  row01 = vis_faligndata(data1, data2);                                 \
  row02 = vis_faligndata(data2, data3);                                 \
  row03 = vis_faligndata(data3, data4);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 1 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  data4 = dpSrc[4];                                                     \
  row10 = vis_faligndata(data0, data1);                                 \
  row11 = vis_faligndata(data1, data2);                                 \
  row12 = vis_faligndata(data2, data3);                                 \
  row13 = vis_faligndata(data3, data4);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 2 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  data4 = dpSrc[4];                                                     \
  row20 = vis_faligndata(data0, data1);                                 \
  row21 = vis_faligndata(data1, data2);                                 \
  row22 = vis_faligndata(data2, data3);                                 \
  row23 = vis_faligndata(data3, data4);                                 \
  sPtr += srcYStride;                                                   \
  dpSrc = vis_alignaddr(sPtr, 0);  /* source row 3 */                   \
  data0 = dpSrc[0];                                                     \
  data1 = dpSrc[1];                                                     \
  data2 = dpSrc[2];                                                     \
  data3 = dpSrc[3];                                                     \
  data4 = dpSrc[4];                                                     \
  row30 = vis_faligndata(data0, data1);                                 \
  row31 = vis_faligndata(data1, data2);                                 \
  row32 = vis_faligndata(data2, data3);                                 \
  row33 = vis_faligndata(data3, data4);                                 \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;  /* Y coefficients */ \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
  yFilter0 = yPtr[0];                                                   \
  yFilter1 = yPtr[1];                                                   \
  yFilter2 = yPtr[2];                                                   \
  yFilter3 = yPtr[3];                                                   \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;  /* X coefficients */ \
  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
  xFilter0 = xPtr[0];                                                   \
  xFilter1 = xPtr[1];                                                   \
  xFilter2 = xPtr[2];                                                   \
  xFilter3 = xPtr[3];                                                   \
  X += dX;  /* move on to the next output pixel */                      \
  Y += dY
1073
1074/***************************************************************/
1075#define RESULT_4BC_S16_1PIXEL()                                 \
1076  u00 = vis_fmul8sux16(row00, yFilter0);                        \
1077  u01 = vis_fmul8ulx16(row00, yFilter0);                        \
1078  u10 = vis_fmul8sux16(row01, yFilter0);                        \
1079  u11 = vis_fmul8ulx16(row01, yFilter0);                        \
1080  v00 = vis_fpadd16(u00, u01);                                  \
1081  u20 = vis_fmul8sux16(row02, yFilter0);                        \
1082  v01 = vis_fpadd16(u10, u11);                                  \
1083  u21 = vis_fmul8ulx16(row02, yFilter0);                        \
1084  u30 = vis_fmul8sux16(row03, yFilter0);                        \
1085  u31 = vis_fmul8ulx16(row03, yFilter0);                        \
1086  v02 = vis_fpadd16(u20, u21);                                  \
1087  u00 = vis_fmul8sux16(row10, yFilter1);                        \
1088  u01 = vis_fmul8ulx16(row10, yFilter1);                        \
1089  v03 = vis_fpadd16(u30, u31);                                  \
1090  u10 = vis_fmul8sux16(row11, yFilter1);                        \
1091  u11 = vis_fmul8ulx16(row11, yFilter1);                        \
1092  v10 = vis_fpadd16(u00, u01);                                  \
1093  u20 = vis_fmul8sux16(row12, yFilter1);                        \
1094  v11 = vis_fpadd16(u10, u11);                                  \
1095  u21 = vis_fmul8ulx16(row12, yFilter1);                        \
1096  u30 = vis_fmul8sux16(row13, yFilter1);                        \
1097  u31 = vis_fmul8ulx16(row13, yFilter1);                        \
1098  u00 = vis_fmul8sux16(row20, yFilter2);                        \
1099  v12 = vis_fpadd16(u20, u21);                                  \
1100  u01 = vis_fmul8ulx16(row20, yFilter2);                        \
1101  v13 = vis_fpadd16(u30, u31);                                  \
1102  u10 = vis_fmul8sux16(row21, yFilter2);                        \
1103  u11 = vis_fmul8ulx16(row21, yFilter2);                        \
1104  v20 = vis_fpadd16(u00, u01);                                  \
1105  u20 = vis_fmul8sux16(row22, yFilter2);                        \
1106  sum0 = vis_fpadd16(v00, v10);                                 \
1107  u21 = vis_fmul8ulx16(row22, yFilter2);                        \
1108  u30 = vis_fmul8sux16(row23, yFilter2);                        \
1109  u31 = vis_fmul8ulx16(row23, yFilter2);                        \
1110  u00 = vis_fmul8sux16(row30, yFilter3);                        \
1111  u01 = vis_fmul8ulx16(row30, yFilter3);                        \
1112  v21 = vis_fpadd16(u10, u11);                                  \
1113  sum1 = vis_fpadd16(v01, v11);                                 \
1114  u10 = vis_fmul8sux16(row31, yFilter3);                        \
1115  sum2 = vis_fpadd16(v02, v12);                                 \
1116  sum3 = vis_fpadd16(v03, v13);                                 \
1117  v22 = vis_fpadd16(u20, u21);                                  \
1118  u11 = vis_fmul8ulx16(row31, yFilter3);                        \
1119  sum0 = vis_fpadd16(sum0, v20);                                \
1120  u20 = vis_fmul8sux16(row32, yFilter3);                        \
1121  u21 = vis_fmul8ulx16(row32, yFilter3);                        \
1122  v23 = vis_fpadd16(u30, u31);                                  \
1123  v30 = vis_fpadd16(u00, u01);                                  \
1124  sum1 = vis_fpadd16(sum1, v21);                                \
1125  u30 = vis_fmul8sux16(row33, yFilter3);                        \
1126  u31 = vis_fmul8ulx16(row33, yFilter3);                        \
1127  v31 = vis_fpadd16(u10, u11);                                  \
1128  sum2 = vis_fpadd16(sum2, v22);                                \
1129  sum3 = vis_fpadd16(sum3, v23);                                \
1130  v32 = vis_fpadd16(u20, u21);                                  \
1131  sum0 = vis_fpadd16(sum0, v30);                                \
1132  v33 = vis_fpadd16(u30, u31);                                  \
1133  v00 = vis_fmul8sux16(sum0, xFilter0);                         \
1134  sum1 = vis_fpadd16(sum1, v31);                                \
1135  sum2 = vis_fpadd16(sum2, v32);                                \
1136  v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
1137  v10 = vis_fmul8sux16(sum1, xFilter1);                         \
1138  sum3 = vis_fpadd16(sum3, v33);                                \
1139  v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
1140  d0 = vis_fpadd16(v00, v01);                                   \
1141  v20 = vis_fmul8sux16(sum2, xFilter2);                         \
1142  v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
1143  d1 = vis_fpadd16(v10, v11);                                   \
1144  v30 = vis_fmul8sux16(sum3, xFilter3);                         \
1145  v31 = vis_fmul8ulx16(sum3, xFilter3);                         \
1146  d2 = vis_fpadd16(v20, v21);                                   \
1147  d3 = vis_fpadd16(v30, v31);                                   \
1148  d0 = vis_fpadd16(d0, d1);                                     \
1149  d2 = vis_fpadd16(d2, d3);                                     \
1150  d0 = vis_fpadd16(d0, d2);                                     \
1151  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1152  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1153  res = vis_fpackfix_pair(d2, d3)
1154
1155/***************************************************************/
/*
 * BC_S16_4CH(mlib_filters_s16_4)
 *
 * One software-pipelined step of the 4-channel S16 bicubic kernel.
 * mlib_filters_s16_4 is the coefficient table to read from.
 *
 * Statements belonging to three jobs are interleaved, so their relative
 * order must be preserved:
 *
 *  1. Filter the 4x4 window currently held in row00..row33 (rowRC is
 *     source row R, pixel C; one mlib_d64 holds the four mlib_s16 channels
 *     of a pixel). Every rowRC is multiplied by yFilterR: the
 *     vis_fmul8sux16 and vis_fmul8ulx16 products are added with
 *     vis_fpadd16. The four rows of column C are accumulated into sumC,
 *     and sumC is multiplied by xFilterC to give dC. d0..d3 are left
 *     un-summed; FADD_4BC_S16() reduces them.
 *
 *  2. Reload row00..row33 from the window addressed by sPtr: per source
 *     row, dpSrc is obtained from vis_alignaddr(sPtr, 0), five mlib_d64
 *     words are read and neighbouring words are combined with
 *     vis_faligndata; sPtr is stepped by srcYStride between rows. Each
 *     rowRC is overwritten only after its last use in job 1.
 *
 *  3. Reload the coefficients and advance: filterposx/filterposy are taken
 *     from X/Y before these are advanced by dX/dY, masked with FILTER_MASK
 *     and, multiplied by 4, used as byte offsets into mlib_filters_s16_4
 *     to reload yFilter0..3 and xFilter0..3 (each after its last use in
 *     job 1). xSrc/ySrc and the final sPtr are derived from the advanced
 *     X/Y.
 *
 * All names other than the table argument are locals of the expanding
 * function. The expansion ends without a semicolon; the caller adds it.
 */
1156#define BC_S16_4CH(mlib_filters_s16_4)                                  \
  /* filter: window row 0 (row00..row03) times yFilter0 */                  \
1157  u00 = vis_fmul8sux16(row00, yFilter0);                                \
1158  u01 = vis_fmul8ulx16(row00, yFilter0);                                \
1159  u10 = vis_fmul8sux16(row01, yFilter0);                                \
1160  u11 = vis_fmul8ulx16(row01, yFilter0);                                \
1161  v00 = vis_fpadd16(u00, u01);                                          \
1162  u20 = vis_fmul8sux16(row02, yFilter0);                                \
1163  v01 = vis_fpadd16(u10, u11);                                          \
1164  u21 = vis_fmul8ulx16(row02, yFilter0);                                \
1165  u30 = vis_fmul8sux16(row03, yFilter0);                                \
1166  u31 = vis_fmul8ulx16(row03, yFilter0);                                \
1167  v02 = vis_fpadd16(u20, u21);                                          \
  /* reload: start reading row 0 of the window at sPtr */                   \
1168  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1169  u00 = vis_fmul8sux16(row10, yFilter1);                                \
1170  u01 = vis_fmul8ulx16(row10, yFilter1);                                \
1171  data0 = dpSrc[0];                                                     \
  /* Y coefficient index, taken before Y is advanced further down */        \
1172  filterposy = (Y >> FILTER_SHIFT);                                     \
1173  v03 = vis_fpadd16(u30, u31);                                          \
1174  data1 = dpSrc[1];                                                     \
1175  u10 = vis_fmul8sux16(row11, yFilter1);                                \
1176  data2 = dpSrc[2];                                                     \
1177  u11 = vis_fmul8ulx16(row11, yFilter1);                                \
1178  v10 = vis_fpadd16(u00, u01);                                          \
1179  data3 = dpSrc[3];                                                     \
1180  u20 = vis_fmul8sux16(row12, yFilter1);                                \
1181  v11 = vis_fpadd16(u10, u11);                                          \
1182  data4 = dpSrc[4];                                                     \
1183  u21 = vis_fmul8ulx16(row12, yFilter1);                                \
  /* row 0 products are done, so row00..row03 can be replaced */            \
1184  row00 = vis_faligndata(data0, data1);                                 \
1185  u30 = vis_fmul8sux16(row13, yFilter1);                                \
1186  row01 = vis_faligndata(data1, data2);                                 \
1187  u31 = vis_fmul8ulx16(row13, yFilter1);                                \
1188  row02 = vis_faligndata(data2, data3);                                 \
1189  u00 = vis_fmul8sux16(row20, yFilter2);                                \
1190  row03 = vis_faligndata(data3, data4);                                 \
  /* X coefficient index, taken before X is advanced further down */        \
1191  filterposx = (X >> FILTER_SHIFT);                                     \
  /* reload: step to source row 1 */                                        \
1192  sPtr += srcYStride;                                                   \
1193  v12 = vis_fpadd16(u20, u21);                                          \
1194  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1195  u01 = vis_fmul8ulx16(row20, yFilter2);                                \
1196  v13 = vis_fpadd16(u30, u31);                                          \
1197  data0 = dpSrc[0];                                                     \
1198  u10 = vis_fmul8sux16(row21, yFilter2);                                \
1199  X += dX;                                                              \
1200  data1 = dpSrc[1];                                                     \
1201  u11 = vis_fmul8ulx16(row21, yFilter2);                                \
1202  v20 = vis_fpadd16(u00, u01);                                          \
1203  data2 = dpSrc[2];                                                     \
1204  u20 = vis_fmul8sux16(row22, yFilter2);                                \
1205  sum0 = vis_fpadd16(v00, v10);                                         \
1206  data3 = dpSrc[3];                                                     \
1207  u21 = vis_fmul8ulx16(row22, yFilter2);                                \
1208  data4 = dpSrc[4];                                                     \
1209  row10 = vis_faligndata(data0, data1);                                 \
1210  u30 = vis_fmul8sux16(row23, yFilter2);                                \
1211  row11 = vis_faligndata(data1, data2);                                 \
1212  u31 = vis_fmul8ulx16(row23, yFilter2);                                \
1213  row12 = vis_faligndata(data2, data3);                                 \
1214  u00 = vis_fmul8sux16(row30, yFilter3);                                \
1215  row13 = vis_faligndata(data3, data4);                                 \
  /* reload: step to source row 2 */                                        \
1216  sPtr += srcYStride;                                                   \
1217  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1218  u01 = vis_fmul8ulx16(row30, yFilter3);                                \
1219  v21 = vis_fpadd16(u10, u11);                                          \
  /* advance Y; xSrc/ySrc below use the already advanced X and Y */         \
1220  Y += dY;                                                              \
1221  xSrc = (X >> MLIB_SHIFT)-1;                                           \
1222  sum1 = vis_fpadd16(v01, v11);                                         \
1223  data0 = dpSrc[0];                                                     \
1224  u10 = vis_fmul8sux16(row31, yFilter3);                                \
1225  sum2 = vis_fpadd16(v02, v12);                                         \
1226  sum3 = vis_fpadd16(v03, v13);                                         \
1227  ySrc = (Y >> MLIB_SHIFT)-1;                                           \
1228  data1 = dpSrc[1];                                                     \
1229  v22 = vis_fpadd16(u20, u21);                                          \
1230  u11 = vis_fmul8ulx16(row31, yFilter3);                                \
1231  data2 = dpSrc[2];                                                     \
1232  sum0 = vis_fpadd16(sum0, v20);                                        \
1233  u20 = vis_fmul8sux16(row32, yFilter3);                                \
1234  data3 = dpSrc[3];                                                     \
1235  u21 = vis_fmul8ulx16(row32, yFilter3);                                \
1236  v23 = vis_fpadd16(u30, u31);                                          \
1237  data4 = dpSrc[4];                                                     \
1238  v30 = vis_fpadd16(u00, u01);                                          \
1239  filterposy &= FILTER_MASK;                                            \
1240  row20 = vis_faligndata(data0, data1);                                 \
1241  sum1 = vis_fpadd16(sum1, v21);                                        \
1242  u30 = vis_fmul8sux16(row33, yFilter3);                                \
1243  row21 = vis_faligndata(data1, data2);                                 \
1244  u31 = vis_fmul8ulx16(row33, yFilter3);                                \
1245  row22 = vis_faligndata(data2, data3);                                 \
1246  row23 = vis_faligndata(data3, data4);                                 \
  /* reload: step to source row 3 */                                        \
1247  sPtr += srcYStride;                                                   \
1248  filterposx &= FILTER_MASK;                                            \
1249  v31 = vis_fpadd16(u10, u11);                                          \
1250  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1251  data0 = dpSrc[0];                                                     \
1252  sum2 = vis_fpadd16(sum2, v22);                                        \
1253  sum3 = vis_fpadd16(sum3, v23);                                        \
1254  data1 = dpSrc[1];                                                     \
1255  v32 = vis_fpadd16(u20, u21);                                          \
1256  data2 = dpSrc[2];                                                     \
1257  sum0 = vis_fpadd16(sum0, v30);                                        \
1258  data3 = dpSrc[3];                                                     \
1259  v33 = vis_fpadd16(u30, u31);                                          \
1260  data4 = dpSrc[4];                                                     \
1261  row30 = vis_faligndata(data0, data1);                                 \
  /* filter: column sums times xFilter0..3 give d0..d3 */                   \
1262  v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
1263  row31 = vis_faligndata(data1, data2);                                 \
1264  row32 = vis_faligndata(data2, data3);                                 \
1265  row33 = vis_faligndata(data3, data4);                                 \
  /* new Y coefficients; all uses of the old yFilter0..3 are above */       \
1266  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
1267  sum1 = vis_fpadd16(sum1, v31);                                        \
1268  yFilter0 = yPtr[0];                                                   \
1269  sum2 = vis_fpadd16(sum2, v32);                                        \
1270  v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
1271  yFilter1 = yPtr[1];                                                   \
1272  v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
1273  sum3 = vis_fpadd16(sum3, v33);                                        \
1274  yFilter2 = yPtr[2];                                                   \
1275  v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
1276  d0 = vis_fpadd16(v00, v01);                                           \
1277  yFilter3 = yPtr[3];                                                   \
  /* new X coefficients; each xFilterN is replaced after its last use */    \
1278  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
1279  v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
1280  xFilter0 = xPtr[0];                                                   \
1281  v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
1282  d1 = vis_fpadd16(v10, v11);                                           \
1283  xFilter1 = xPtr[1];                                                   \
1284  v30 = vis_fmul8sux16(sum3, xFilter3);                                 \
1285  v31 = vis_fmul8ulx16(sum3, xFilter3);                                 \
1286  d2 = vis_fpadd16(v20, v21);                                           \
1287  xFilter2 = xPtr[2];                                                   \
1288  d3 = vis_fpadd16(v30, v31);                                           \
1289  xFilter3 = xPtr[3];                                                   \
  /* window address for the following step (4 mlib_s16 per pixel) */        \
1290  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1291
1292/***************************************************************/
/*
 * FADD_4BC_S16()
 *
 * Final reduction for one 4-channel S16 bicubic pixel. The four partial
 * results d0..d3 are added pairwise, (d0 + d1) + (d2 + d3), into d0. The
 * high and low halves of d0 are then passed through vis_fmuld8sux16 with
 * the constant f_x01000100, and the two products are packed into res by
 * vis_fpackfix_pair.
 *
 * On exit d0 holds the sum, d2/d3 hold the two products and res holds the
 * packed pixel; d1 is not modified. All names are locals of the expanding
 * function. The expansion ends without a semicolon; the caller adds it.
 */
#define FADD_4BC_S16()                                          \
  d0 = vis_fpadd16(vis_fpadd16(d0, d1), vis_fpadd16(d2, d3));   \
  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
  res = vis_fpackfix_pair(d2, d3)
1300
1301/***************************************************************/
/*
 * mlib_ImageAffine_s16_4ch_bc
 *
 * Bicubic affine resampling of a 4-channel mlib_s16 image with VIS
 * intrinsics. For each destination row j in [yStart, yFinish] the span
 * xLeft..xRight is written, one 4-channel pixel (one mlib_d64) per store
 * sequence.
 *
 * param - affine-transform state. It is consumed by DECLAREVAR_BC() and
 *         CLIP(), which are defined outside this section.
 *         NOTE(review): those macros are presumed to declare and set
 *         filter, srcYStride, lineAddr, X, Y, dX, dY, xLeft, xRight,
 *         dstData, dstPixelPtr, j, yStart and yFinish - confirm against
 *         mlib_ImageAffine.h.
 *
 * The only return statement visible here yields MLIB_SUCCESS.
 *
 * Spans of at least 4 pixels run through a software pipeline (prologue,
 * steady-state loop, drain) in which BC_S16_4CH filters one pixel while
 * loading the next. Shorter spans are handled pixel by pixel in the final
 * loop. The statement order inside the pipeline is significant.
 */
1302mlib_status mlib_ImageAffine_s16_4ch_bc (mlib_affine_param *param)
1303{
1304  DECLAREVAR_BC();
1305  DTYPE  *dstLineEnd;
1306  mlib_s32  filterposx, filterposy;
1307  mlib_d64  data0, data1, data2, data3, data4;
1308  mlib_d64  sum0, sum1, sum2, sum3;
1309  mlib_d64  row00, row10, row20, row30;
1310  mlib_d64  row01, row11, row21, row31;
1311  mlib_d64  row02, row12, row22, row32;
1312  mlib_d64  row03, row13, row23, row33;
1313  mlib_d64  xFilter0, xFilter1, xFilter2, xFilter3;
1314  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
1315  mlib_d64  v00, v01, v02, v03, v10, v11, v12, v13;
1316  mlib_d64  v20, v21, v22, v23, v30, v31, v32, v33;
1317  mlib_d64  u00, u01, u10, u11, u20, u21, u30, u31;
1318  mlib_d64  d0, d1, d2, d3;
1319  mlib_d64 *yPtr, *xPtr;
1320  mlib_d64 *dp, *dpSrc;
1321  mlib_s32  cols, i, mask, gsrd;
1322  mlib_d64  res;
  /* constant operand used by FADD_4BC_S16 / RESULT_4BC_S16_1PIXEL */
1323  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
1324  const mlib_s16 *mlib_filters_table_4;
1325
  /* choose the coefficient table that matches the requested filter */
1326  if (filter == MLIB_BICUBIC) {
1327    mlib_filters_table_4 = mlib_filters_s16_bc_4;
1328  } else {
1329    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
1330  }
1331
  /*
   * NOTE(review): presumably converts the stride from bytes to mlib_s16
   * elements, since BC_S16_4CH adds it to an mlib_s16 pointer - confirm.
   */
1332  srcYStride >>= 1;
1333
1334  for (j = yStart; j <= yFinish; j++) {
1335
    /*
     * Program the GSR at the start of every row.
     * NOTE(review): presumably selects scale factor 10 for
     * vis_fpackfix_pair - confirm against the VIS manual.
     */
1336    vis_write_gsr(10 << 3);
1337
    /* CLIP is defined outside this section; it is given the channel count */
1338    CLIP(4);
1339    dstLineEnd  = (DTYPE*)dstData + 4 * xRight;
1340
1341    cols = xRight - xLeft + 1;
1342    dp = vis_alignaddr(dstPixelPtr, 0);
    /* dstLineEnd now addresses the last mlib_s16 of the pixel at xRight */
1343    dstLineEnd += 3;
    /* lane mask for the partial stores below */
1344    mask = vis_edge16(dstPixelPtr, dstLineEnd);
    /* byte count (0..7) handed to vis_alignaddr before each store */
1345    gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1346
1347    i = 0;
1348
    /* pipelined path: taken when the span has at least 4 pixels */
1349    if (i <= cols - 4) {
1350
      /*
       * Prologue. NEXT_PIXEL_4BC_S16 and LOAD_BC_S16_4CH_1PIXEL are
       * defined outside this section; presumably they address and load
       * the window of the first pixel. The first BC_S16_4CH then filters
       * that window while loading the next one, FADD_4BC_S16 reduces its
       * d0..d3 into res, and the second BC_S16_4CH refills d0..d3.
       */
1351      NEXT_PIXEL_4BC_S16();
1352      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1353
1354      NEXT_PIXEL_4BC_S16();
1355
1356      BC_S16_4CH(mlib_filters_table_4);
1357      FADD_4BC_S16();
1358
1359      BC_S16_4CH(mlib_filters_table_4);
1360
      /*
       * Steady state, cols - 4 iterations: store the finished res, reduce
       * the pending d0..d3 into the next res, start another BC_S16_4CH.
       */
1361#pragma pipeloop(0)
1362      for (; i < cols-4; i++) {
        /*
         * Store sequence (repeated below): BC_S16_4CH has changed the
         * alignment through vis_alignaddr, so it is set again from gsrd;
         * res is then realigned against itself and written with two
         * partial stores - the lanes in mask at dp, the complementary
         * lanes at the following mlib_d64.
         */
1363        vis_alignaddr((void *)gsrd, 0);
1364        res = vis_faligndata(res, res);
1365
1366        vis_pst_16(res, dp++, mask);
1367        vis_pst_16(res, dp, ~mask);
1368
1369        FADD_4BC_S16();
1370        BC_S16_4CH(mlib_filters_table_4);
1371      }
1372
      /*
       * Drain: four pixels are still in flight.
       * 1) res, already reduced.
       */
1373      vis_alignaddr((void *)gsrd, 0);
1374      res = vis_faligndata(res, res);
1375      vis_pst_16(res, dp++, mask);
1376      vis_pst_16(res, dp, ~mask);
1377
      /* 2) the d0..d3 left by the last BC_S16_4CH */
1378      FADD_4BC_S16();
1379      vis_alignaddr((void *)gsrd, 0);
1380      res = vis_faligndata(res, res);
1381      vis_pst_16(res, dp++, mask);
1382      vis_pst_16(res, dp, ~mask);
1383
      /* 3) the window the last BC_S16_4CH loaded into row00..row33 */
1384      RESULT_4BC_S16_1PIXEL();
1385      vis_alignaddr((void *)gsrd, 0);
1386      res = vis_faligndata(res, res);
1387      vis_pst_16(res, dp++, mask);
1388      vis_pst_16(res, dp, ~mask);
1389
      /* 4) the window at the sPtr the last BC_S16_4CH left behind */
1390      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1391      RESULT_4BC_S16_1PIXEL();
1392      vis_alignaddr((void *)gsrd, 0);
1393      res = vis_faligndata(res, res);
1394      vis_pst_16(res, dp++, mask);
1395      vis_pst_16(res, dp, ~mask);
      /* i becomes cols, so the loop below does nothing for this row */
1396      i += 4;
1397    }
1398
    /* spans shorter than 4 pixels: one pixel per iteration, no overlap */
1399#pragma pipeloop(0)
1400    for (; i < cols; i++) {
1401      NEXT_PIXEL_4BC_S16();
1402      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1403      RESULT_4BC_S16_1PIXEL();
1404      vis_alignaddr((void *)gsrd, 0);
1405      res = vis_faligndata(res, res);
1406      vis_pst_16(res, dp++, mask);
1407      vis_pst_16(res, dp, ~mask);
1408    }
1409  }
1410
1411  return MLIB_SUCCESS;
1412}
1413
1414/***************************************************************/
1415