1/*
2 * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
/*
 *      These functions step along each destination line from xLeft to
 *      xRight and apply bicubic filtering to 16-bit unsigned pixels.
 *
 */
33
34#include "vis_proto.h"
35#include "mlib_ImageAffine.h"
36#include "mlib_v_ImageFilters.h"
37
38/***************************************************************/
/*
 * Element type used for pixel pointers in this file.  The entry points
 * below are the u16 variants, yet pixels are addressed as mlib_s16; the
 * arithmetic macros flip the top bit of every 16-bit lane (mask8000)
 * before and after filtering so the signed multiply path can be used.
 */
#define DTYPE  mlib_s16

/*
 * Filter-table index width for this pixel type.
 * NOTE(review): presumably consumed by FILTER_SHIFT / FILTER_MASK, which
 * are not defined in the visible part of this file - confirm in the
 * included headers.
 */
#define FILTER_BITS  9

/***************************************************************/
/* Short alias for the current source pixel pointer. */
#define sPtr srcPixelPtr
45
46/***************************************************************/
/*
 * NEXT_PIXEL_1BC_S16 - locate the source neighbourhood for the current
 * fixed-point position (X, Y), one channel per pixel.
 *
 * Reads:   X, Y, lineAddr
 * Writes:  xSrc, ySrc, sPtr
 *
 * The integer part of each coordinate is taken with >> MLIB_SHIFT and
 * then decremented by one, so sPtr ends up one column to the left of and
 * one row above the integer source position.  The load macros then read
 * four consecutive rows starting from that address.
 *
 * Expands to three statements without a trailing semicolon; use it as a
 * statement inside a braced block.
 */
#define NEXT_PIXEL_1BC_S16()                                    \
  xSrc = (X >> MLIB_SHIFT)-1;                                   \
  ySrc = (Y >> MLIB_SHIFT)-1;                                   \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
51
52/***************************************************************/
/*
 * LOAD_BC_S16_1CH_1PIXEL - fetch everything needed to filter one
 * one-channel pixel, then advance the source position.
 *
 * Parameters:
 *   mlib_filters_s16    table the horizontal coefficients are read from
 *                       (one mlib_d64 at byte offset filterposx)
 *   mlib_filters_s16_4  table the vertical coefficients are read from
 *                       (four mlib_d64 starting at byte offset
 *                       filterposy*4)
 *
 * Reads:   sPtr, srcYStride, X, Y, dX, dY
 * Writes:  row0..row3, yFilter0..yFilter3, xFilter, X, Y
 * Scratch: dpSrc, data0, data1, filterposx, filterposy, yPtr
 *
 * Each row is fetched as one 8-byte vector (four 16-bit samples) from a
 * possibly unaligned address: vis_alignaddr() yields the aligned base,
 * two aligned mlib_d64 are read and vis_faligndata() extracts the wanted
 * 8 bytes.  Note that 16 bytes are therefore read per row.
 *
 * sPtr is advanced by srcYStride three times and is left pointing at the
 * fourth row.  srcYStride is added to an mlib_s16 pointer, so it must be
 * expressed in 16-bit elements, not bytes.
 *
 * The filter positions are derived from X and Y *before* they are
 * incremented by dX / dY.
 */
#define LOAD_BC_S16_1CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)      \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  row0 = vis_faligndata(data0, data1);                                    \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  row1 = vis_faligndata(data0, data1);                                    \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  row2 = vis_faligndata(data0, data1);                                    \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  row3 = vis_faligndata(data0, data1);                                    \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                         \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4)); \
  yFilter0 = yPtr[0];                                                     \
  yFilter1 = yPtr[1];                                                     \
  yFilter2 = yPtr[2];                                                     \
  yFilter3 = yPtr[3];                                                     \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                         \
  xFilter = *((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16) + filterposx));  \
  X += dX;                                                                \
  Y += dY
83
84/***************************************************************/
/*
 * RESULT_1BC_S16_1PIXEL - filter the currently loaded one-channel pixel
 * all the way to a packed result.
 *
 * Reads:   row0..row3, yFilter0..yFilter3, xFilter, mask8000,
 *          f_x01000100
 * Writes:  res
 * Scratch: u0..u3, v0..v3, sum, d00, d10, d0, d1, p0
 *
 * Steps:
 *   1. Vertical pass.  Each row has the top bit of every 16-bit lane
 *      flipped (XOR with mask8000), is multiplied lane-wise by its y
 *      coefficient vector, and the four products are accumulated in
 *      "sum".  Every product is formed as fmul8sux16 + fmul8ulx16 added
 *      with fpadd16, the VIS idiom for a 16x16 fixed-point multiply.
 *   2. Horizontal pass.  "sum" is multiplied lane-wise by xFilter and the
 *      four lanes are folded: hi+lo halves as 16-bit (p0), widened to
 *      32-bit by vis_fmuld8sux16 with the 0x0100 constant, then hi+lo
 *      again as 32-bit.
 *   3. The 32-bit total is packed with vis_fpackfix_pair (which uses the
 *      GSR scale factor) and the top bit is flipped back.
 *
 * Only the low 32-bit half of d1 is written here (vis_write_lo), so the
 * high half keeps whatever d1 held before.
 * NOTE(review): the callers visible in this file store "res" with
 * vis_st_u16, which presumably consumes only the lane derived from the
 * low half - confirm against vis_proto.h.
 */
#define RESULT_1BC_S16_1PIXEL()                                          \
  u0 = vis_fmul8sux16(vis_fxor(row0, mask8000), yFilter0);               \
  u1 = vis_fmul8ulx16(vis_fxor(row0, mask8000), yFilter0);               \
  u2 = vis_fmul8sux16(vis_fxor(row1, mask8000), yFilter1);               \
  v0 = vis_fpadd16(u0, u1);                                              \
  u3 = vis_fmul8ulx16(vis_fxor(row1, mask8000), yFilter1);               \
  u0 = vis_fmul8sux16(vis_fxor(row2, mask8000), yFilter2);               \
  v1 = vis_fpadd16(u2, u3);                                              \
  u1 = vis_fmul8ulx16(vis_fxor(row2, mask8000), yFilter2);               \
  sum = vis_fpadd16(v0, v1);                                             \
  u2 = vis_fmul8sux16(vis_fxor(row3, mask8000), yFilter3);               \
  v2 = vis_fpadd16(u0, u1);                                              \
  u3 = vis_fmul8ulx16(vis_fxor(row3, mask8000), yFilter3);               \
  sum = vis_fpadd16(sum, v2);                                            \
  v3 = vis_fpadd16(u2, u3);                                              \
  sum = vis_fpadd16(sum, v3);                                            \
  d00 = vis_fmul8sux16(sum, xFilter);                                    \
  d10 = vis_fmul8ulx16(sum, xFilter);                                    \
  d0 = vis_fpadd16(d00, d10);                                            \
  p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));                   \
  d0 = vis_fmuld8sux16(f_x01000100, p0);                                 \
  d1 = vis_write_lo(d1, vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0))); \
  res = vis_fxor(vis_fpackfix_pair(d1, d1), mask8000)
108
109/***************************************************************/
/*
 * BC_S16_1CH - one software-pipelined step of the one-channel kernel.
 *
 * Parameters:
 *   ind                 suffix pasted onto "d0" to select the output
 *                       (d00, d01, d02 or d03)
 *   mlib_filters_s16    table the horizontal coefficients are read from
 *   mlib_filters_s16_4  table the vertical coefficients are read from
 *
 * Three pixels are in flight in a single expansion.  The statements are
 * interleaved on purpose, so their order matters:
 *
 *   - pixel k   : rows and filters are already in row0..row3,
 *                 yFilter0..3 and xFilter on entry.  Its vertical sum and
 *                 horizontal lane products are computed and left, not yet
 *                 folded, in d0<ind>.
 *   - pixel k+1 : sPtr addresses it on entry.  Its four rows are loaded
 *                 into row0..row3 (each rowN is overwritten only after
 *                 its last use for pixel k) and its filter positions are
 *                 taken from X / Y before they are advanced.
 *   - pixel k+2 : after X += dX and Y += dY, xSrc / ySrc / sPtr are
 *                 recomputed so the next expansion finds its source
 *                 address ready.
 *
 * xFilter is consumed for pixel k and only afterwards reloaded for pixel
 * k+1, and yFilter0..3 are reloaded after their last use.  Do not reorder
 * these without re-checking every read-before-write pair.
 *
 * Reads:   row0..row3, yFilter0..3, xFilter, sPtr, srcYStride, X, Y, dX,
 *          dY, lineAddr, mask8000
 * Writes:  d0<ind>, row0..row3, yFilter0..3, xFilter, X, Y, xSrc, ySrc,
 *          sPtr
 * Scratch: u0..u3, v0..v3, sum, d0, d1, dpSrc, data0, data1, filterposx,
 *          filterposy, yPtr
 */
#define BC_S16_1CH(ind, mlib_filters_s16, mlib_filters_s16_4)             \
  u0 = vis_fmul8sux16(vis_fxor(row0, mask8000), yFilter0);                \
  u1 = vis_fmul8ulx16(vis_fxor(row0, mask8000), yFilter0);                \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u2 = vis_fmul8sux16(vis_fxor(row1, mask8000), yFilter1);                \
  v0 = vis_fpadd16(u0, u1);                                               \
  data0 = dpSrc[0];                                                       \
  filterposy = (Y >> FILTER_SHIFT);                                       \
  u3 = vis_fmul8ulx16(vis_fxor(row1, mask8000), yFilter1);                \
  data1 = dpSrc[1];                                                       \
  row0 = vis_faligndata(data0, data1);                                    \
  filterposx = (X >> FILTER_SHIFT);                                       \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u0 = vis_fmul8sux16(vis_fxor(row2, mask8000), yFilter2);                \
  v1 = vis_fpadd16(u2, u3);                                               \
  data0 = dpSrc[0];                                                       \
  u1 = vis_fmul8ulx16(vis_fxor(row2, mask8000), yFilter2);                \
  sum = vis_fpadd16(v0, v1);                                              \
  X += dX;                                                                \
  data1 = dpSrc[1];                                                       \
  row1 = vis_faligndata(data0, data1);                                    \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u2 = vis_fmul8sux16(vis_fxor(row3, mask8000), yFilter3);                \
  v2 = vis_fpadd16(u0, u1);                                               \
  Y += dY;                                                                \
  xSrc = (X >> MLIB_SHIFT)-1;                                             \
  data0 = dpSrc[0];                                                       \
  u3 = vis_fmul8ulx16(vis_fxor(row3, mask8000), yFilter3);                \
  sum = vis_fpadd16(sum, v2);                                             \
  ySrc = (Y >> MLIB_SHIFT)-1;                                             \
  data1 = dpSrc[1];                                                       \
  filterposy &= FILTER_MASK;                                              \
  row2 = vis_faligndata(data0, data1);                                    \
  sPtr += srcYStride;                                                     \
  filterposx &= FILTER_MASK;                                              \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  v3 = vis_fpadd16(u2, u3);                                               \
  data1 = dpSrc[1];                                                       \
  row3 = vis_faligndata(data0, data1);                                    \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4)); \
  yFilter0 = yPtr[0];                                                     \
  sum = vis_fpadd16(sum, v3);                                             \
  yFilter1 = yPtr[1];                                                     \
  d0 = vis_fmul8sux16(sum, xFilter);                                      \
  yFilter2 = yPtr[2];                                                     \
  d1 = vis_fmul8ulx16(sum, xFilter);                                      \
  yFilter3 = yPtr[3];                                                     \
  xFilter = *((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16) + filterposx));  \
  d0##ind = vis_fpadd16(d0, d1);                                          \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + xSrc
163
164/***************************************************************/
/*
 * FADD_1BC_S16 - fold the four per-pixel lane products produced by
 * BC_S16_1CH(0..3) into one packed vector of four output pixels.
 *
 * Reads:   d00, d01, d02, d03, f_x01000100, mask8000
 * Writes:  res
 * Scratch: p0..p3, d0..d3
 *
 * For each pixel n:
 *   - pn adds the high and low halves of d0n as 16-bit lanes (4 -> 2);
 *   - vis_fmuld8sux16 with the 0x0100 constant widens the two lanes to
 *     32 bits;
 *   - vis_fpadd32s adds the remaining two lanes (2 -> 1).
 * The four 32-bit totals are paired into d0 (pixels 0,1) and d1 (pixels
 * 2,3), packed to 16 bits by vis_fpackfix_pair (which uses the GSR scale
 * factor), and the top bit of every lane is flipped back with mask8000.
 */
#define FADD_1BC_S16()                                                \
  p0 = vis_fpadd16s(vis_read_hi(d00), vis_read_lo(d00));              \
  p1 = vis_fpadd16s(vis_read_hi(d01), vis_read_lo(d01));              \
  p2 = vis_fpadd16s(vis_read_hi(d02), vis_read_lo(d02));              \
  p3 = vis_fpadd16s(vis_read_hi(d03), vis_read_lo(d03));              \
  d0 = vis_fmuld8sux16(f_x01000100, p0);                              \
  d1 = vis_fmuld8sux16(f_x01000100, p1);                              \
  d2 = vis_fmuld8sux16(f_x01000100, p2);                              \
  d3 = vis_fmuld8sux16(f_x01000100, p3);                              \
  d0 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d0), vis_read_lo(d0)),  \
                     vis_fpadd32s(vis_read_hi(d1), vis_read_lo(d1))); \
  d1 = vis_freg_pair(vis_fpadd32s(vis_read_hi(d2), vis_read_lo(d2)),  \
                     vis_fpadd32s(vis_read_hi(d3), vis_read_lo(d3))); \
  res = vis_fxor(vis_fpackfix_pair(d0, d1), mask8000)
179
180/***************************************************************/
181mlib_status mlib_ImageAffine_u16_1ch_bc (mlib_affine_param *param)
182{
183  DECLAREVAR_BC();
184  mlib_s32  filterposx, filterposy;
185  mlib_d64  data0, data1;
186  mlib_d64  sum;
187  mlib_d64  row0, row1, row2, row3;
188  mlib_f32  p0, p1, p2, p3;
189  mlib_d64  xFilter, yFilter0, yFilter1, yFilter2, yFilter3;
190  mlib_d64  v0, v1, v2, v3;
191  mlib_d64  u0, u1, u2, u3;
192  mlib_d64  d0, d1, d2, d3;
193  mlib_d64  d00, d10, d01, d02, d03;
194  mlib_d64 *yPtr;
195  mlib_d64 *dpSrc;
196  mlib_s32  align, cols, i;
197  mlib_d64  res;
198  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
199  mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
200  const mlib_s16 *mlib_filters_table  ;
201  const mlib_s16 *mlib_filters_table_4;
202
203  if (filter == MLIB_BICUBIC) {
204    mlib_filters_table   = mlib_filters_s16_bc;
205    mlib_filters_table_4 = mlib_filters_s16_bc_4;
206  } else {
207    mlib_filters_table   = mlib_filters_s16_bc2;
208    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
209  }
210
211  srcYStride >>= 1;
212
213  for (j = yStart; j <= yFinish; j++) {
214
215    vis_write_gsr(10 << 3);
216
217    CLIP(1);
218
219    cols = xRight - xLeft + 1;
220    align = (8 - ((mlib_addr)dstPixelPtr) & 7) & 7;
221    align >>= 1;
222    align = (cols < align)? cols : align;
223
224    for (i = 0; i < align; i++) {
225      NEXT_PIXEL_1BC_S16();
226      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
227      RESULT_1BC_S16_1PIXEL();
228      vis_st_u16(res, dstPixelPtr++);
229    }
230
231    if (i <= cols - 10) {
232
233      NEXT_PIXEL_1BC_S16();
234      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
235
236      NEXT_PIXEL_1BC_S16();
237
238      BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
239      BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
240      BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
241      BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
242
243      FADD_1BC_S16();
244
245      BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
246      BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
247      BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
248      BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
249
250#pragma pipeloop(0)
251      for (; i <= cols - 14; i += 4) {
252        *(mlib_d64*)dstPixelPtr = res;
253        FADD_1BC_S16();
254        BC_S16_1CH(0, mlib_filters_table, mlib_filters_table_4);
255        BC_S16_1CH(1, mlib_filters_table, mlib_filters_table_4);
256        BC_S16_1CH(2, mlib_filters_table, mlib_filters_table_4);
257        BC_S16_1CH(3, mlib_filters_table, mlib_filters_table_4);
258        dstPixelPtr += 4;
259      }
260
261      *(mlib_d64*)dstPixelPtr = res;
262      dstPixelPtr += 4;
263      FADD_1BC_S16();
264      *(mlib_d64*)dstPixelPtr = res;
265      dstPixelPtr += 4;
266
267      RESULT_1BC_S16_1PIXEL();
268      vis_st_u16(res, dstPixelPtr++);
269
270      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
271      RESULT_1BC_S16_1PIXEL();
272      vis_st_u16(res, dstPixelPtr++);
273      i += 10;
274    }
275
276    for (; i < cols; i++) {
277      NEXT_PIXEL_1BC_S16();
278      LOAD_BC_S16_1CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
279      RESULT_1BC_S16_1PIXEL();
280      vis_st_u16(res, dstPixelPtr++);
281    }
282  }
283
284  return MLIB_SUCCESS;
285}
286
287/***************************************************************/
/*
 * NEXT_PIXEL_2BC_S16 - two-channel counterpart of NEXT_PIXEL_1BC_S16.
 *
 * Reads:   X, Y, lineAddr
 * Writes:  xSrc, ySrc, sPtr
 *
 * xSrc / ySrc are the integer parts of X / Y minus one.  Because every
 * pixel occupies two 16-bit samples, the column index is doubled
 * (xSrc << 1) when forming the element offset into the row.
 *
 * Expands to three statements without a trailing semicolon; use it as a
 * statement inside a braced block.
 */
#define NEXT_PIXEL_2BC_S16()                                    \
  xSrc = (X >> MLIB_SHIFT)-1;                                   \
  ySrc = (Y >> MLIB_SHIFT)-1;                                   \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
292
293/***************************************************************/
/*
 * LOAD_BC_S16_2CH_1PIXEL - fetch everything needed to filter one
 * two-channel pixel, then advance the source position.
 *
 * Parameters:
 *   mlib_filters_s16    table the horizontal coefficients are read from
 *                       (one mlib_d64 at byte offset filterposx)
 *   mlib_filters_s16_4  table the vertical coefficients are read from
 *                       (four mlib_d64 starting at byte offset
 *                       filterposy*4)
 *
 * Reads:   sPtr, srcYStride, X, Y, dX, dY
 * Writes:  row00/row01 .. row30/row31, yFilter0..yFilter3, xFilter, X, Y
 * Scratch: dpSrc, data0, data1, data2, filterposx, filterposy, yPtr
 *
 * Each source row contributes 16 bytes (eight 16-bit samples), split
 * into rowN0 (first 8 bytes) and rowN1 (next 8 bytes).  Three aligned
 * mlib_d64 are read per row and realigned with vis_faligndata, so 24
 * bytes are touched per row.
 *
 * sPtr is advanced by srcYStride (in 16-bit elements) three times and is
 * left pointing at the fourth row.  The filter positions are derived
 * from X and Y before they are incremented.
 */
#define LOAD_BC_S16_2CH_1PIXEL(mlib_filters_s16, mlib_filters_s16_4)      \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  data2 = dpSrc[2];                                                       \
  row00 = vis_faligndata(data0, data1);                                   \
  row01 = vis_faligndata(data1, data2);                                   \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  data2 = dpSrc[2];                                                       \
  row10 = vis_faligndata(data0, data1);                                   \
  row11 = vis_faligndata(data1, data2);                                   \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  data2 = dpSrc[2];                                                       \
  row20 = vis_faligndata(data0, data1);                                   \
  row21 = vis_faligndata(data1, data2);                                   \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  data1 = dpSrc[1];                                                       \
  data2 = dpSrc[2];                                                       \
  row30 = vis_faligndata(data0, data1);                                   \
  row31 = vis_faligndata(data1, data2);                                   \
  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                         \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4)); \
  yFilter0 = yPtr[0];                                                     \
  yFilter1 = yPtr[1];                                                     \
  yFilter2 = yPtr[2];                                                     \
  yFilter3 = yPtr[3];                                                     \
  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                         \
  xFilter = *((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16) + filterposx));  \
  X += dX;                                                                \
  Y += dY
332
333/***************************************************************/
/*
 * RESULT_2BC_S16_1PIXEL - filter the currently loaded two-channel pixel
 * all the way to a packed result.
 *
 * Reads:   row00..row31, yFilter0..yFilter3, xFilter, mask8000,
 *          f_x01000100
 * Writes:  res
 * Scratch: u00, u01, u10, u11, u20, u21, v00..v31, sum0, sum1, dr, dr1,
 *          xFilter0, xFilter1, d00, d10, d20, d30, d0, d1, p0
 *
 * Steps (interleaved below for scheduling):
 *   1. Vertical pass.  rowN0 and rowN1 each have the top bit of every
 *      16-bit lane flipped (XOR mask8000) and are multiplied by yFilterN
 *      using the fmul8sux16 + fmul8ulx16 pair; the four rows accumulate
 *      into sum0 (from rowN0) and sum1 (from rowN1).
 *   2. Coefficient shuffle.  The vis_fpmerge sequence rearranges the
 *      bytes of xFilter into xFilter0 and xFilter1.
 *      NOTE(review): by the byte-interleave definition of fpmerge this
 *      should give xFilter0 = {x0,x0,x1,x1} and xFilter1 = {x2,x2,x3,x3},
 *      i.e. each horizontal coefficient duplicated for the two channels
 *      - confirm against the VIS manual.
 *   3. Horizontal pass.  sum0*xFilter0 + sum1*xFilter1 is formed
 *      lane-wise in d0, then the high and low halves are added (p0),
 *      leaving two 16-bit totals.
 *   4. The totals are widened to 32 bits (vis_fmuld8sux16 with 0x0100),
 *      packed by vis_fpackfix_pair(d0, d0) and the top bit is flipped
 *      back.  Both operands of the pack are the same register, so the
 *      packed pair appears in both halves of res.
 */
#define RESULT_2BC_S16_1PIXEL()                                 \
  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);    \
  dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);    \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));           \
  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);    \
  dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));          \
  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);    \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));           \
  u20 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);    \
  v00 = vis_fpadd16(u00, u01);                                  \
  u21 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);    \
  v01 = vis_fpadd16(u10, u11);                                  \
  u00 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);    \
  xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));    \
  u01 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);    \
  u10 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);    \
  u11 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);    \
  v10 = vis_fpadd16(u20, u21);                                  \
  sum0 = vis_fpadd16(v00, v10);                                 \
  u20 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);    \
  v11 = vis_fpadd16(u00, u01);                                  \
  u21 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);    \
  xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));    \
  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);    \
  v20 = vis_fpadd16(u10, u11);                                  \
  sum1 = vis_fpadd16(v01, v11);                                 \
  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);    \
  sum0 = vis_fpadd16(sum0, v20);                                \
  v21 = vis_fpadd16(u20, u21);                                  \
  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);    \
  v30 = vis_fpadd16(u00, u01);                                  \
  sum1 = vis_fpadd16(sum1, v21);                                \
  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);    \
  sum0 = vis_fpadd16(sum0, v30);                                \
  v31 = vis_fpadd16(u10, u11);                                  \
  sum1 = vis_fpadd16(sum1, v31);                                \
  d00 = vis_fmul8sux16(sum0, xFilter0);                         \
  d10 = vis_fmul8ulx16(sum0, xFilter0);                         \
  d20 = vis_fmul8sux16(sum1, xFilter1);                         \
  d30 = vis_fmul8ulx16(sum1, xFilter1);                         \
  d0 = vis_fpadd16(d00, d10);                                   \
  d1 = vis_fpadd16(d20, d30);                                   \
  d0 = vis_fpadd16(d0, d1);                                     \
  p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
  d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
  res = vis_fxor(vis_fpackfix_pair(d0, d0), mask8000)
381
382/***************************************************************/
/*
 * BC_S16_2CH - one software-pipelined step of the two-channel kernel.
 *
 * Parameters:
 *   ind                 suffix pasted onto "d0" and "d1" to select the
 *                       output pair (d00/d10 or d01/d11)
 *   mlib_filters_s16    table the horizontal coefficients are read from
 *   mlib_filters_s16_4  table the vertical coefficients are read from
 *
 * Same three-stage overlap as BC_S16_1CH:
 *
 *   - pixel k   : rows and filters are already loaded on entry.  Its
 *                 vertical sums (sum0, sum1) are multiplied by the
 *                 shuffled horizontal coefficients (xFilter0, xFilter1)
 *                 and the lane products are left, not yet folded, in
 *                 d0<ind> and d1<ind>.
 *   - pixel k+1 : sPtr addresses it on entry.  Its rows are loaded into
 *                 row00..row31 (each register is overwritten only after
 *                 its last use for pixel k) and its filter positions are
 *                 taken from X / Y before they are advanced.
 *   - pixel k+2 : after X += dX and Y += dY, xSrc / ySrc / sPtr are
 *                 recomputed for the next expansion.
 *
 * xFilter is consumed by the vis_fpmerge shuffle at the top and reloaded
 * near the bottom; yFilter0..3 are reloaded after their last use.  The
 * statement order is load-bearing - do not reorder without re-checking
 * every read-before-write pair.
 *
 * Reads:   row00..row31, yFilter0..3, xFilter, sPtr, srcYStride, X, Y,
 *          dX, dY, lineAddr, mask8000
 * Writes:  d0<ind>, d1<ind>, row00..row31, yFilter0..3, xFilter, X, Y,
 *          xSrc, ySrc, sPtr
 * Scratch: u00..u21, v00..v31, sum0, sum1, dr, dr1, xFilter0, xFilter1,
 *          d0..d3, dpSrc, data0..data2, filterposx, filterposy, yPtr
 */
#define BC_S16_2CH(ind, mlib_filters_s16, mlib_filters_s16_4)             \
  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);              \
  dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter));           \
  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);              \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr));                     \
  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);              \
  dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr));                    \
  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);              \
  dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr));                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u20 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);              \
  v00 = vis_fpadd16(u00, u01);                                            \
  u21 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);              \
  data0 = dpSrc[0];                                                       \
  filterposy = (Y >> FILTER_SHIFT);                                       \
  v01 = vis_fpadd16(u10, u11);                                            \
  data1 = dpSrc[1];                                                       \
  u00 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);              \
  xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1));              \
  data2 = dpSrc[2];                                                       \
  u01 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);              \
  row00 = vis_faligndata(data0, data1);                                   \
  u10 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);              \
  row01 = vis_faligndata(data1, data2);                                   \
  filterposx = (X >> FILTER_SHIFT);                                       \
  sPtr += srcYStride;                                                     \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u11 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);              \
  v10 = vis_fpadd16(u20, u21);                                            \
  data0 = dpSrc[0];                                                       \
  sum0 = vis_fpadd16(v00, v10);                                           \
  X += dX;                                                                \
  data1 = dpSrc[1];                                                       \
  u20 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);              \
  v11 = vis_fpadd16(u00, u01);                                            \
  data2 = dpSrc[2];                                                       \
  row10 = vis_faligndata(data0, data1);                                   \
  u21 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);              \
  row11 = vis_faligndata(data1, data2);                                   \
  sPtr += srcYStride;                                                     \
  xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1));              \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);              \
  v20 = vis_fpadd16(u10, u11);                                            \
  Y += dY;                                                                \
  xSrc = (X >> MLIB_SHIFT)-1;                                             \
  sum1 = vis_fpadd16(v01, v11);                                           \
  data0 = dpSrc[0];                                                       \
  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);              \
  sum0 = vis_fpadd16(sum0, v20);                                          \
  ySrc = (Y >> MLIB_SHIFT)-1;                                             \
  data1 = dpSrc[1];                                                       \
  v21 = vis_fpadd16(u20, u21);                                            \
  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);              \
  data2 = dpSrc[2];                                                       \
  v30 = vis_fpadd16(u00, u01);                                            \
  filterposy &= FILTER_MASK;                                              \
  row20 = vis_faligndata(data0, data1);                                   \
  sum1 = vis_fpadd16(sum1, v21);                                          \
  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);              \
  row21 = vis_faligndata(data1, data2);                                   \
  sPtr += srcYStride;                                                     \
  filterposx &= FILTER_MASK;                                              \
  v31 = vis_fpadd16(u10, u11);                                            \
  dpSrc = vis_alignaddr(sPtr, 0);                                         \
  data0 = dpSrc[0];                                                       \
  sum0 = vis_fpadd16(sum0, v30);                                          \
  data1 = dpSrc[1];                                                       \
  sum1 = vis_fpadd16(sum1, v31);                                          \
  data2 = dpSrc[2];                                                       \
  row30 = vis_faligndata(data0, data1);                                   \
  d0 = vis_fmul8sux16(sum0, xFilter0);                                    \
  row31 = vis_faligndata(data1, data2);                                   \
  yPtr = ((mlib_d64 *) ((mlib_u8 *)(mlib_filters_s16_4) + filterposy*4)); \
  d1 = vis_fmul8ulx16(sum0, xFilter0);                                    \
  yFilter0 = yPtr[0];                                                     \
  d2 = vis_fmul8sux16(sum1, xFilter1);                                    \
  yFilter1 = yPtr[1];                                                     \
  d3 = vis_fmul8ulx16(sum1, xFilter1);                                    \
  d0##ind = vis_fpadd16(d0, d1);                                          \
  yFilter2 = yPtr[2];                                                     \
  yFilter3 = yPtr[3];                                                     \
  d1##ind = vis_fpadd16(d2, d3);                                          \
  xFilter = *((mlib_d64 *)((mlib_u8 *)(mlib_filters_s16) + filterposx));  \
  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 1)
468
469/***************************************************************/
/*
 * FADD_2BC_S16 - fold the lane products of two two-channel pixels
 * (produced by BC_S16_2CH(0) and BC_S16_2CH(1)) into one packed vector.
 *
 * Reads:   d00, d10 (first pixel), d01, d11 (second pixel),
 *          f_x01000100, mask8000
 * Writes:  res
 * Scratch: d0, d1, d2, p0, p1
 *
 * For each pixel the two partial product vectors are added, then the
 * high and low halves are added as 16-bit lanes, leaving two totals in
 * p0 / p1.  These are widened to 32 bits with vis_fmuld8sux16 and the
 * 0x0100 constant, packed to 16 bits by vis_fpackfix_pair (which uses
 * the GSR scale factor), and the top bit of every lane is flipped back
 * with mask8000.  The first pixel ends up in the high half of res, the
 * second in the low half.
 */
#define FADD_2BC_S16()                                          \
  d0 = vis_fpadd16(d00, d10);                                   \
  d2 = vis_fpadd16(d01, d11);                                   \
  p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0));          \
  p1 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2));          \
  d0 = vis_fmuld8sux16(f_x01000100, p0);                        \
  d1 = vis_fmuld8sux16(f_x01000100, p1);                        \
  res = vis_fxor(vis_fpackfix_pair(d0, d1), mask8000)
478
479/***************************************************************/
480mlib_status mlib_ImageAffine_u16_2ch_bc (mlib_affine_param *param)
481{
482  DECLAREVAR_BC();
483  DTYPE  *dstLineEnd;
484  mlib_s32  filterposx, filterposy;
485  mlib_d64  data0, data1, data2;
486  mlib_d64  sum0, sum1;
487  mlib_d64  row00, row10, row20, row30;
488  mlib_d64  row01, row11, row21, row31;
489  mlib_f32  p0, p1;
490  mlib_d64  xFilter, xFilter0, xFilter1;
491  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
492  mlib_d64  v00, v01, v10, v11, v20, v21, v30, v31;
493  mlib_d64  u00, u01, u10, u11, u20, u21;
494  mlib_d64  d0, d1, d2, d3;
495  mlib_d64  d00, d10, d20, d30, d01, d11;
496  mlib_d64  *yPtr;
497  mlib_d64  *dp, *dpSrc;
498  mlib_s32  cols, i, mask, emask;
499  mlib_d64  res, res1;
500  mlib_d64  dr, dr1;
501  mlib_f32 f_x01000100 = vis_to_float(0x01000100);
502  mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
503  const mlib_s16 *mlib_filters_table  ;
504  const mlib_s16 *mlib_filters_table_4;
505
506  if (filter == MLIB_BICUBIC) {
507    mlib_filters_table   = mlib_filters_s16_bc;
508    mlib_filters_table_4 = mlib_filters_s16_bc_4;
509  } else {
510    mlib_filters_table   = mlib_filters_s16_bc2;
511    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
512  }
513
514  srcYStride >>= 1;
515
516  for (j = yStart; j <= yFinish; j++) {
517
518    vis_write_gsr(10 << 3);
519
520    CLIP(2);
521    dstLineEnd  = (DTYPE*)dstData + 2 * xRight;
522
523    cols = xRight - xLeft + 1;
524    dp = vis_alignaddr(dstPixelPtr, 0);
525    dstLineEnd += 1;
526    mask = vis_edge16(dstPixelPtr, dstLineEnd);
527    i = 0;
528
529    if (i <= cols - 6) {
530
531      NEXT_PIXEL_2BC_S16();
532      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
533
534      NEXT_PIXEL_2BC_S16();
535
536      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
537      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
538
539      FADD_2BC_S16();
540
541      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
542      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
543
544#pragma pipeloop(0)
545      for (; i <= cols-8; i += 2) {
546        vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
547        res = vis_faligndata(res, res);
548        vis_pst_16(res, dp++, mask);
549        vis_pst_16(res, dp, ~mask);
550        FADD_2BC_S16();
551        BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
552        BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
553      }
554
555      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
556      res = vis_faligndata(res, res);
557      vis_pst_16(res, dp++, mask);
558      vis_pst_16(res, dp, ~mask);
559
560      FADD_2BC_S16();
561      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
562      res = vis_faligndata(res, res);
563      vis_pst_16(res, dp++, mask);
564      vis_pst_16(res, dp, ~mask);
565
566      RESULT_2BC_S16_1PIXEL();
567      res1 = res;
568
569      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
570      RESULT_2BC_S16_1PIXEL();
571      res = vis_write_hi(res, vis_read_hi(res1));
572      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
573      res = vis_faligndata(res, res);
574      vis_pst_16(res, dp++, mask);
575      vis_pst_16(res, dp, ~mask);
576
577      i += 6;
578    }
579
580    if (i <= cols - 4) {
581      NEXT_PIXEL_2BC_S16();
582      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
583
584      NEXT_PIXEL_2BC_S16();
585
586      BC_S16_2CH(0, mlib_filters_table, mlib_filters_table_4);
587      BC_S16_2CH(1, mlib_filters_table, mlib_filters_table_4);
588
589      FADD_2BC_S16();
590      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
591      res = vis_faligndata(res, res);
592      vis_pst_16(res, dp++, mask);
593      vis_pst_16(res, dp, ~mask);
594
595      RESULT_2BC_S16_1PIXEL();
596      res1 = res;
597
598      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
599      RESULT_2BC_S16_1PIXEL();
600      res = vis_write_hi(res, vis_read_hi(res1));
601      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
602      res = vis_faligndata(res, res);
603      vis_pst_16(res, dp++, mask);
604      vis_pst_16(res, dp, ~mask);
605
606      i += 4;
607    }
608
609    if (i <= cols - 2) {
610      NEXT_PIXEL_2BC_S16();
611      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
612      RESULT_2BC_S16_1PIXEL();
613      res1 = res;
614
615      NEXT_PIXEL_2BC_S16();
616      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
617      RESULT_2BC_S16_1PIXEL();
618      res = vis_write_hi(res, vis_read_hi(res1));
619      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
620      res = vis_faligndata(res, res);
621      vis_pst_16(res, dp++, mask);
622      vis_pst_16(res, dp, ~mask);
623
624      i += 2;
625    }
626
627    if (i < cols) {
628      NEXT_PIXEL_2BC_S16();
629      LOAD_BC_S16_2CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
630      RESULT_2BC_S16_1PIXEL();
631      vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
632      res = vis_faligndata(res, res);
633      emask = vis_edge16(dp, dstLineEnd);
634      vis_pst_16(res, dp++, mask & emask);
635
636      if ((mlib_s16*)dp <= dstLineEnd) {
637        mask = vis_edge16(dp, dstLineEnd);
638        vis_pst_16(res, dp, mask);
639      }
640    }
641  }
642
643  return MLIB_SUCCESS;
644}
645
646/***************************************************************/
/*
 * NEXT_PIXEL_3BC_S16: locate the source data for the current (X, Y).
 *
 * Shifts the caller's X and Y right by MLIB_SHIFT and subtracts 1 to get
 * xSrc / ySrc, then points sPtr (an alias for srcPixelPtr) at element
 * xSrc*3 of the row lineAddr[ySrc], viewed as mlib_s16.  X and Y are not
 * modified.  Every name used here is a local of the expanding function.
 *
 * NOTE(review): the "-1" presumably selects the top-left sample of the
 * 4x4 bicubic neighbourhood and "*3" the three-channel pixel stride -
 * confirm against the callers.
 */
647#define NEXT_PIXEL_3BC_S16()                                    \
648  xSrc = (X >> MLIB_SHIFT)-1;                                   \
649  ySrc = (Y >> MLIB_SHIFT)-1;                                   \
650  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3)
651
652/***************************************************************/
/*
 * LOAD_BC_S16_3CH_1PIXEL: non-pipelined load of everything needed to
 * filter one 3-channel pixel.
 *
 * Parameters:
 *   mlib_filters_s16_3 - table the x coefficients are read from
 *                        (byte offset filterposx*3, three mlib_d64 words).
 *   mlib_filters_s16_4 - table the y coefficients are read from
 *                        (byte offset filterposy*4, four mlib_d64 words).
 *
 * For each of four source rows, starting at sPtr and stepping by
 * srcYStride between rows, the macro:
 *   - calls vis_alignaddr(sPtr, 0) to obtain dpSrc,
 *   - reads four consecutive mlib_d64 words data0..data3,
 *   - combines neighbouring words with vis_faligndata into rowN0..rowN2
 *     (3 x 8 bytes, i.e. twelve 16-bit samples per row).
 * NOTE(review): twelve samples presumably means 4 pixels x 3 channels.
 *
 * It then derives filterposy / filterposx from Y / X with FILTER_SHIFT
 * and FILTER_MASK, loads yFilter0..3 and xFilter0..2, and finally
 * advances X by dX and Y by dY.
 *
 * On exit sPtr has been advanced by 3*srcYStride, and data0..data3 hold
 * the words of the last row read.
 * NOTE(review): FILTER_SHIFT / FILTER_MASK are defined outside this view;
 * the *3 / *4 scaling presumably assumes filterpos is already a multiple
 * of sizeof(mlib_d64) - confirm.
 */
653#define LOAD_BC_S16_3CH_1PIXEL(mlib_filters_s16_3, mlib_filters_s16_4)  \
654  dpSrc = vis_alignaddr(sPtr, 0);                                       \
655  data0 = dpSrc[0];                                                     \
656  data1 = dpSrc[1];                                                     \
657  data2 = dpSrc[2];                                                     \
658  data3 = dpSrc[3];                                                     \
659  row00 = vis_faligndata(data0, data1);                                 \
660  row01 = vis_faligndata(data1, data2);                                 \
661  row02 = vis_faligndata(data2, data3);                                 \
662  sPtr += srcYStride;                                                   \
663  dpSrc = vis_alignaddr(sPtr, 0);                                       \
664  data0 = dpSrc[0];                                                     \
665  data1 = dpSrc[1];                                                     \
666  data2 = dpSrc[2];                                                     \
667  data3 = dpSrc[3];                                                     \
668  row10 = vis_faligndata(data0, data1);                                 \
669  row11 = vis_faligndata(data1, data2);                                 \
670  row12 = vis_faligndata(data2, data3);                                 \
671  sPtr += srcYStride;                                                   \
672  dpSrc = vis_alignaddr(sPtr, 0);                                       \
673  data0 = dpSrc[0];                                                     \
674  data1 = dpSrc[1];                                                     \
675  data2 = dpSrc[2];                                                     \
676  data3 = dpSrc[3];                                                     \
677  row20 = vis_faligndata(data0, data1);                                 \
678  row21 = vis_faligndata(data1, data2);                                 \
679  row22 = vis_faligndata(data2, data3);                                 \
680  sPtr += srcYStride;                                                   \
681  dpSrc = vis_alignaddr(sPtr, 0);                                       \
682  data0 = dpSrc[0];                                                     \
683  data1 = dpSrc[1];                                                     \
684  data2 = dpSrc[2];                                                     \
685  data3 = dpSrc[3];                                                     \
686  row30 = vis_faligndata(data0, data1);                                 \
687  row31 = vis_faligndata(data1, data2);                                 \
688  row32 = vis_faligndata(data2, data3);                                 \
689  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
690  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
691  yFilter0 = yPtr[0];                                                   \
692  yFilter1 = yPtr[1];                                                   \
693  yFilter2 = yPtr[2];                                                   \
694  yFilter3 = yPtr[3];                                                   \
695  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
696  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
697  xFilter0 = xPtr[0];                                                   \
698  xFilter1 = xPtr[1];                                                   \
699  xFilter2 = xPtr[2];                                                   \
700  X += dX;                                                              \
701  Y += dY
702
703/***************************************************************/
/*
 * STORE_BC_S16_3CH_1PIXEL: write one finished 3-channel pixel.
 *
 * Copies the first three 16-bit lanes of the union f0 (f0.t[0..2]) to
 * dstPixelPtr[0..2] and advances dstPixelPtr by 3 elements.  f0.t[3] is
 * not stored.  f0 and dstPixelPtr are locals of the expanding function.
 */
704#define STORE_BC_S16_3CH_1PIXEL()                               \
705  dstPixelPtr[0] = f0.t[0];                                     \
706  dstPixelPtr[1] = f0.t[1];                                     \
707  dstPixelPtr[2] = f0.t[2];                                     \
708  dstPixelPtr += 3
709
710/***************************************************************/
/*
 * RESULT_3BC_S16_1PIXEL: non-pipelined filtering of one 3-channel pixel
 * from the rows and coefficients already held in locals (as left by
 * LOAD_BC_S16_3CH_1PIXEL or BC_S16_3CH).  The result is left in f0.d.
 *
 * Steps, in data-flow order (the statements are interleaved):
 *   1. Each rowRC (R = 0..3, C = 0..2) is XORed with mask8000 and
 *      multiplied by yFilterR using a vis_fmul8sux16 / vis_fmul8ulx16
 *      pair whose halves are added with vis_fpadd16 (vRC).
 *   2. The four vRC of each column C are accumulated into sumC.
 *   3. Each sumC is multiplied by xFilterC with the same sux16/ulx16
 *      pair, giving d0, d1, d2.
 *   4. d0..d2 are folded into a single d0: vis_alignaddr is called with
 *      constant addresses 6 and 2 before the vis_faligndata calls that
 *      build the shifted terms d3, d4 and the final d1.
 *      NOTE(review): presumably this lines up the four horizontal taps
 *      of each channel so they can be summed lane-wise - confirm.
 *   5. The high and low halves of d0 go through vis_fmuld8sux16 with
 *      f_x01000100, are packed with vis_fpackfix_pair, and the packed
 *      value is XORed with mask8000 before being written to f0.d.
 *
 * Side effects: overwrites u**, v**, sum*, d0..d4 and the alignment set
 * by vis_alignaddr.  It also reassigns row30 from data0/data1 after
 * row30's last use in this macro; that value is not read again here.
 * NOTE(review): the row30 reload looks like a harmless leftover.
 */
711#define RESULT_3BC_S16_1PIXEL()                                 \
712  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);    \
713  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);    \
714  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);    \
715  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);    \
716  v00 = vis_fpadd16(u00, u01);                                  \
717  u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);    \
718  v01 = vis_fpadd16(u10, u11);                                  \
719  u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);    \
720  u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);    \
721  u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);    \
722  v02 = vis_fpadd16(u20, u21);                                  \
723  u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);    \
724  u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);    \
725  v10 = vis_fpadd16(u00, u01);                                  \
726  u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);    \
727  u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);    \
728  u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);    \
729  v11 = vis_fpadd16(u10, u11);                                  \
730  u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);    \
731  v12 = vis_fpadd16(u20, u21);                                  \
732  u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);    \
733  u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);    \
734  v20 = vis_fpadd16(u00, u01);                                  \
735  u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);    \
736  sum0 = vis_fpadd16(v00, v10);                                 \
737  u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);    \
738  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);    \
739  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);    \
740  v21 = vis_fpadd16(u10, u11);                                  \
741  sum1 = vis_fpadd16(v01, v11);                                 \
742  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);    \
743  sum2 = vis_fpadd16(v02, v12);                                 \
744  v22 = vis_fpadd16(u20, u21);                                  \
745  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);    \
746  sum0 = vis_fpadd16(sum0, v20);                                \
747  u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);    \
748  v30 = vis_fpadd16(u00, u01);                                  \
749  sum1 = vis_fpadd16(sum1, v21);                                \
750  u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);    \
751  v31 = vis_fpadd16(u10, u11);                                  \
752  sum2 = vis_fpadd16(sum2, v22);                                \
753  v32 = vis_fpadd16(u20, u21);                                  \
754  sum0 = vis_fpadd16(sum0, v30);                                \
755  row30 = vis_faligndata(data0, data1);                         \
756  v00 = vis_fmul8sux16(sum0, xFilter0);                         \
757  sum1 = vis_fpadd16(sum1, v31);                                \
758  sum2 = vis_fpadd16(sum2, v32);                                \
759  v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
760  v10 = vis_fmul8sux16(sum1, xFilter1);                         \
761  v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
762  d0 = vis_fpadd16(v00, v01);                                   \
763  v20 = vis_fmul8sux16(sum2, xFilter2);                         \
764  v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
765  d1 = vis_fpadd16(v10, v11);                                   \
766  d2 = vis_fpadd16(v20, v21);                                   \
767  vis_alignaddr((void*)6, 0);                                   \
768  d3 = vis_faligndata(d0, d1);                                  \
769  vis_alignaddr((void*)2, 0);                                   \
770  d4 = vis_faligndata(d1, d2);                                  \
771  d0 = vis_fpadd16(d0, d3);                                     \
772  d2 = vis_fpadd16(d2, d4);                                     \
773  d1 = vis_faligndata(d2, d2);                                  \
774  d0 = vis_fpadd16(d0, d1);                                     \
775  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
776  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
777  f0.d = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
778
779/***************************************************************/
/*
 * BC_S16_3CH: one stage of the software-pipelined 3-channel loop.
 *
 * Parameters:
 *   mlib_filters_s16_3 - table the next x coefficients are read from
 *                        (byte offset filterposx*3).
 *   mlib_filters_s16_4 - table the next y coefficients are read from
 *                        (byte offset filterposy*4).
 *
 * Three independent activities are interleaved statement by statement:
 *
 *   (a) Arithmetic for the pixel whose rows/filters are currently held:
 *       the same vertical (yFilter*) and horizontal (xFilter*) multiply-
 *       accumulate as RESULT_3BC_S16_1PIXEL, stopping at d0, d1, d2.
 *       The final fold and pack is done separately by FADD_3BC_S16.
 *
 *   (b) Loading the next pixel: four rows are read through sPtr (stepped
 *       by srcYStride three times) into row00..row32, and filterposy /
 *       filterposx are taken from Y / X before those are advanced, then
 *       masked with FILTER_MASK and used to reload yFilter0..3 and
 *       xFilter0..2.
 *
 *   (c) Addressing the pixel after that: X += dX, Y += dY, new xSrc /
 *       ySrc, and finally sPtr = lineAddr[ySrc] + xSrc*3 (the same
 *       formula as NEXT_PIXEL_3BC_S16).
 *
 * Ordering constraint: every rowRC, yFilterR and xFilterC is overwritten
 * only after the last multiply in (a) that reads its old value, so the
 * statements must not be reordered across those uses.  Each
 * vis_faligndata relies on the alignment set by the preceding
 * vis_alignaddr(sPtr, 0).
 *
 * Precondition visible from the call sites in this file: sPtr, X and Y
 * describe the pixel to be loaded in (b); the macro is preceded by
 * NEXT_PIXEL_3BC_S16 or by a previous BC_S16_3CH.
 */
780#define BC_S16_3CH(mlib_filters_s16_3, mlib_filters_s16_4)              \
781  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);            \
782  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);            \
783  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);            \
784  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);            \
785  v00 = vis_fpadd16(u00, u01);                                          \
786  u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);            \
787  v01 = vis_fpadd16(u10, u11);                                          \
788  u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);            \
789  dpSrc = vis_alignaddr(sPtr, 0);                                       \
790  u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);            \
791  u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);            \
792  data0 = dpSrc[0];                                                     \
793  filterposy = (Y >> FILTER_SHIFT);                                     \
794  v02 = vis_fpadd16(u20, u21);                                          \
795  data1 = dpSrc[1];                                                     \
796  u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);            \
797  data2 = dpSrc[2];                                                     \
798  u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);            \
799  v10 = vis_fpadd16(u00, u01);                                          \
800  data3 = dpSrc[3];                                                     \
801  u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);            \
802  row00 = vis_faligndata(data0, data1);                                 \
803  u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);            \
804  row01 = vis_faligndata(data1, data2);                                 \
805  u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);            \
806  row02 = vis_faligndata(data2, data3);                                 \
807  filterposx = (X >> FILTER_SHIFT);                                     \
808  sPtr += srcYStride;                                                   \
809  dpSrc = vis_alignaddr(sPtr, 0);                                       \
810  v11 = vis_fpadd16(u10, u11);                                          \
811  u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);            \
812  v12 = vis_fpadd16(u20, u21);                                          \
813  data0 = dpSrc[0];                                                     \
814  u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);            \
815  X += dX;                                                              \
816  data1 = dpSrc[1];                                                     \
817  u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);            \
818  v20 = vis_fpadd16(u00, u01);                                          \
819  data2 = dpSrc[2];                                                     \
820  u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);            \
821  sum0 = vis_fpadd16(v00, v10);                                         \
822  data3 = dpSrc[3];                                                     \
823  row10 = vis_faligndata(data0, data1);                                 \
824  u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);            \
825  row11 = vis_faligndata(data1, data2);                                 \
826  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);            \
827  row12 = vis_faligndata(data2, data3);                                 \
828  sPtr += srcYStride;                                                   \
829  dpSrc = vis_alignaddr(sPtr, 0);                                       \
830  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);            \
831  v21 = vis_fpadd16(u10, u11);                                          \
832  Y += dY;                                                              \
833  xSrc = (X >> MLIB_SHIFT)-1;                                           \
834  sum1 = vis_fpadd16(v01, v11);                                         \
835  data0 = dpSrc[0];                                                     \
836  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);            \
837  sum2 = vis_fpadd16(v02, v12);                                         \
838  ySrc = (Y >> MLIB_SHIFT)-1;                                           \
839  data1 = dpSrc[1];                                                     \
840  v22 = vis_fpadd16(u20, u21);                                          \
841  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);            \
842  data2 = dpSrc[2];                                                     \
843  sum0 = vis_fpadd16(sum0, v20);                                        \
844  u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);            \
845  data3 = dpSrc[3];                                                     \
846  v30 = vis_fpadd16(u00, u01);                                          \
847  filterposy &= FILTER_MASK;                                            \
848  row20 = vis_faligndata(data0, data1);                                 \
849  sum1 = vis_fpadd16(sum1, v21);                                        \
850  u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);            \
851  row21 = vis_faligndata(data1, data2);                                 \
852  row22 = vis_faligndata(data2, data3);                                 \
853  sPtr += srcYStride;                                                   \
854  filterposx &= FILTER_MASK;                                            \
855  v31 = vis_fpadd16(u10, u11);                                          \
856  dpSrc = vis_alignaddr(sPtr, 0);                                       \
857  data0 = dpSrc[0];                                                     \
858  sum2 = vis_fpadd16(sum2, v22);                                        \
859  data1 = dpSrc[1];                                                     \
860  v32 = vis_fpadd16(u20, u21);                                          \
861  data2 = dpSrc[2];                                                     \
862  sum0 = vis_fpadd16(sum0, v30);                                        \
863  data3 = dpSrc[3];                                                     \
864  row30 = vis_faligndata(data0, data1);                                 \
865  v00 = vis_fmul8sux16(sum0, xFilter0);                                 \
866  row31 = vis_faligndata(data1, data2);                                 \
867  row32 = vis_faligndata(data2, data3);                                 \
868  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
869  sum1 = vis_fpadd16(sum1, v31);                                        \
870  yFilter0 = yPtr[0];                                                   \
871  sum2 = vis_fpadd16(sum2, v32);                                        \
872  v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
873  yFilter1 = yPtr[1];                                                   \
874  v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
875  yFilter2 = yPtr[2];                                                   \
876  v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
877  d0 = vis_fpadd16(v00, v01);                                           \
878  yFilter3 = yPtr[3];                                                   \
879  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_3 + filterposx*3));  \
880  v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
881  xFilter0 = xPtr[0];                                                   \
882  v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
883  d1 = vis_fpadd16(v10, v11);                                           \
884  xFilter1 = xPtr[1];                                                   \
885  d2 = vis_fpadd16(v20, v21);                                           \
886  xFilter2 = xPtr[2];                                                   \
887  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc*3)
888
889/***************************************************************/
/*
 * FADD_3BC_S16: final fold and pack for the pipelined 3-channel path.
 *
 * Takes the d0, d1, d2 left by BC_S16_3CH and performs the same closing
 * sequence as the tail of RESULT_3BC_S16_1PIXEL:
 *   - vis_alignaddr with constant addresses 6 and 2 precedes the
 *     vis_faligndata calls that build the shifted terms d3, d4 and d1;
 *   - the terms are summed into d0 with vis_fpadd16;
 *   - the high/low halves of d0 go through vis_fmuld8sux16 with
 *     f_x01000100, are packed by vis_fpackfix_pair, and the result is
 *     XORed with mask8000 into f0.d.
 *
 * Overwrites d0..d4 and the alignment set by vis_alignaddr.
 * NOTE(review): the shifts presumably line up the horizontal taps of each
 * channel before they are summed - confirm against the VIS manual.
 */
890#define FADD_3BC_S16()                                          \
891  vis_alignaddr((void*)6, 0);                                   \
892  d3 = vis_faligndata(d0, d1);                                  \
893  vis_alignaddr((void*)2, 0);                                   \
894  d4 = vis_faligndata(d1, d2);                                  \
895  d0 = vis_fpadd16(d0, d3);                                     \
896  d2 = vis_fpadd16(d2, d4);                                     \
897  d1 = vis_faligndata(d2, d2);                                  \
898  d0 = vis_fpadd16(d0, d1);                                     \
899  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
900  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
901  f0.d = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
902
903/***************************************************************/
/*
 * mlib_ImageAffine_u16_3ch_bc: bicubic affine resampling of a 3-channel
 * 16-bit image, one destination line at a time.
 *
 * param  - affine parameter block; it is unpacked by DECLAREVAR_BC(),
 *          which is defined outside this view.
 *          NOTE(review): DECLAREVAR_BC presumably declares filter,
 *          srcYStride, lineAddr, j, yStart, yFinish and the other names
 *          used below - confirm.
 *
 * Returns MLIB_SUCCESS after the last line has been processed.
 *
 * For every line j in [yStart, yFinish] the function calls CLIP(3)
 * (defined elsewhere; presumably sets xLeft, xRight, X, Y and
 * dstPixelPtr for the line) and then produces cols = xRight - xLeft + 1
 * pixels:
 *
 *   - If at least four pixels are needed, a software pipeline is used.
 *     Prologue: load pixel 0, run BC_S16_3CH twice with one FADD_3BC_S16
 *     in between.  Steady state: each iteration stores one pixel, folds
 *     the next one and starts a further BC_S16_3CH stage (cols-4
 *     iterations).  Drain: four more stores finish the pixels still in
 *     flight, the last two computed with the non-pipelined
 *     RESULT_3BC_S16_1PIXEL.
 *   - Otherwise (cols < 4) pixels are produced one at a time with
 *     NEXT_PIXEL / LOAD / RESULT / STORE.
 */
904mlib_status mlib_ImageAffine_u16_3ch_bc (mlib_affine_param *param)
905{
906  DECLAREVAR_BC();
907  mlib_s32  filterposx, filterposy;
908  mlib_d64  data0, data1, data2, data3;
909  mlib_d64  sum0, sum1, sum2;
910  mlib_d64  row00, row10, row20, row30;
911  mlib_d64  row01, row11, row21, row31;
912  mlib_d64  row02, row12, row22, row32;
913  mlib_d64  xFilter0, xFilter1, xFilter2;
914  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
915  mlib_d64  v00, v01, v02, v10, v11, v12, v20, v21, v22, v30, v31, v32;
916  mlib_d64  u00, u01, u10, u11, u20, u21;
917  mlib_d64  d0, d1, d2, d3, d4;
918  mlib_d64 *yPtr, *xPtr;
919  mlib_d64 *dpSrc;
920  mlib_s32  cols, i;
     /* constant operand for the vis_fmuld8sux16 step in the macros */
921  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
     /* XOR mask built from 0x80008000; NOTE(review): presumably flips the
        top bit of every 16-bit lane to map between u16 and s16 - confirm */
922  mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
     /* packed result of one pixel; t[0..2] are what STORE writes out */
923  union {
924    mlib_s16 t[4];
925    mlib_d64 d;
926  } f0;
927  const mlib_s16 *mlib_filters_table_3;
928  const mlib_s16 *mlib_filters_table_4;
929
     /* bc tables for MLIB_BICUBIC, bc2 tables for any other filter value */
930  if (filter == MLIB_BICUBIC) {
931    mlib_filters_table_3 = mlib_filters_s16_bc_3;
932    mlib_filters_table_4 = mlib_filters_s16_bc_4;
933  } else {
934    mlib_filters_table_3 = mlib_filters_s16_bc2_3;
935    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
936  }
937
     /* NOTE(review): presumably converts the stride from bytes to
        mlib_s16 elements, since sPtr is an mlib_s16 pointer - confirm */
938  srcYStride >>= 1;
939
940  for (j = yStart; j <= yFinish; j++) {
941
       /* NOTE(review): presumably sets the GSR scale field used by
          vis_fpackfix_pair to 10 - confirm against the VIS GSR layout */
942    vis_write_gsr(10 << 3);
943
944    CLIP(3);
945
946    cols = xRight - xLeft + 1;
947
948    i = 0;
949
       /* pipelined path: needs at least four pixels to fill and drain */
950    if (i <= cols - 4) {
951
         /* prologue: pixel 0 is loaded without overlap */
952      NEXT_PIXEL_3BC_S16();
953      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
954
955      NEXT_PIXEL_3BC_S16();
956
         /* multiply pixel 0, load pixel 1; then fold pixel 0 into f0 */
957      BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
958      FADD_3BC_S16();
959
960      BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
961
962#pragma pipeloop(0)
963      for (; i < cols-4; i++) {
964        STORE_BC_S16_3CH_1PIXEL();
965
966        FADD_3BC_S16();
967        BC_S16_3CH(mlib_filters_table_3, mlib_filters_table_4);
968      }
969
         /* drain: f0 holds a finished pixel, d0..d2 the next one, the row
            registers the one after that, and sPtr addresses the last one */
970      STORE_BC_S16_3CH_1PIXEL();
971
972      FADD_3BC_S16();
973      STORE_BC_S16_3CH_1PIXEL();
974
975      RESULT_3BC_S16_1PIXEL();
976      STORE_BC_S16_3CH_1PIXEL();
977
978      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
979      RESULT_3BC_S16_1PIXEL();
980      STORE_BC_S16_3CH_1PIXEL();
981      i += 4;
982    }
983
       /* simple path: runs only when the pipelined branch was skipped */
984    for (; i < cols; i++) {
985      NEXT_PIXEL_3BC_S16();
986      LOAD_BC_S16_3CH_1PIXEL(mlib_filters_table_3, mlib_filters_table_4);
987      RESULT_3BC_S16_1PIXEL();
988      STORE_BC_S16_3CH_1PIXEL();
989    }
990  }
991
992  return MLIB_SUCCESS;
993}
994
995/***************************************************************/
/*
 * NEXT_PIXEL_4BC_S16: locate the source data for the current (X, Y),
 * four-channel variant.
 *
 * Shifts the caller's X and Y right by MLIB_SHIFT and subtracts 1 to get
 * xSrc / ySrc, then points sPtr (an alias for srcPixelPtr) at element
 * xSrc << 2 of the row lineAddr[ySrc], viewed as mlib_s16.  X and Y are
 * not modified.
 *
 * NOTE(review): "<< 2" is presumably the four-channel pixel stride and
 * "-1" the offset to the top-left bicubic sample - confirm.
 */
996#define NEXT_PIXEL_4BC_S16()                                    \
997  xSrc = (X >> MLIB_SHIFT)-1;                                   \
998  ySrc = (Y >> MLIB_SHIFT)-1;                                   \
999  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1000
1001/***************************************************************/
/*
 * LOAD_BC_S16_4CH_1PIXEL: non-pipelined load of everything needed to
 * filter one 4-channel pixel.
 *
 * Parameter:
 *   mlib_filters_s16_4 - coefficient table used for BOTH directions; the
 *                        y set is read at byte offset filterposy*4 and
 *                        the x set at byte offset filterposx*4, four
 *                        mlib_d64 words each.
 *
 * For each of four source rows, starting at sPtr and stepping by
 * srcYStride between rows, the macro calls vis_alignaddr(sPtr, 0), reads
 * five consecutive mlib_d64 words data0..data4 and combines neighbouring
 * words with vis_faligndata into rowN0..rowN3 (4 x 8 bytes, i.e. sixteen
 * 16-bit samples per row).
 * NOTE(review): sixteen samples presumably means 4 pixels x 4 channels.
 *
 * It then derives filterposy / filterposx from Y / X with FILTER_SHIFT
 * and FILTER_MASK, loads yFilter0..3 and xFilter0..3, and advances X by
 * dX and Y by dY.
 *
 * On exit sPtr has been advanced by 3*srcYStride.
 */
1002#define LOAD_BC_S16_4CH_1PIXEL(mlib_filters_s16_4)                      \
1003  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1004  data0 = dpSrc[0];                                                     \
1005  data1 = dpSrc[1];                                                     \
1006  data2 = dpSrc[2];                                                     \
1007  data3 = dpSrc[3];                                                     \
1008  data4 = dpSrc[4];                                                     \
1009  row00 = vis_faligndata(data0, data1);                                 \
1010  row01 = vis_faligndata(data1, data2);                                 \
1011  row02 = vis_faligndata(data2, data3);                                 \
1012  row03 = vis_faligndata(data3, data4);                                 \
1013  sPtr += srcYStride;                                                   \
1014  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1015  data0 = dpSrc[0];                                                     \
1016  data1 = dpSrc[1];                                                     \
1017  data2 = dpSrc[2];                                                     \
1018  data3 = dpSrc[3];                                                     \
1019  data4 = dpSrc[4];                                                     \
1020  row10 = vis_faligndata(data0, data1);                                 \
1021  row11 = vis_faligndata(data1, data2);                                 \
1022  row12 = vis_faligndata(data2, data3);                                 \
1023  row13 = vis_faligndata(data3, data4);                                 \
1024  sPtr += srcYStride;                                                   \
1025  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1026  data0 = dpSrc[0];                                                     \
1027  data1 = dpSrc[1];                                                     \
1028  data2 = dpSrc[2];                                                     \
1029  data3 = dpSrc[3];                                                     \
1030  data4 = dpSrc[4];                                                     \
1031  row20 = vis_faligndata(data0, data1);                                 \
1032  row21 = vis_faligndata(data1, data2);                                 \
1033  row22 = vis_faligndata(data2, data3);                                 \
1034  row23 = vis_faligndata(data3, data4);                                 \
1035  sPtr += srcYStride;                                                   \
1036  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1037  data0 = dpSrc[0];                                                     \
1038  data1 = dpSrc[1];                                                     \
1039  data2 = dpSrc[2];                                                     \
1040  data3 = dpSrc[3];                                                     \
1041  data4 = dpSrc[4];                                                     \
1042  row30 = vis_faligndata(data0, data1);                                 \
1043  row31 = vis_faligndata(data1, data2);                                 \
1044  row32 = vis_faligndata(data2, data3);                                 \
1045  row33 = vis_faligndata(data3, data4);                                 \
1046  filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK;                       \
1047  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
1048  yFilter0 = yPtr[0];                                                   \
1049  yFilter1 = yPtr[1];                                                   \
1050  yFilter2 = yPtr[2];                                                   \
1051  yFilter3 = yPtr[3];                                                   \
1052  filterposx = (X >> FILTER_SHIFT) & FILTER_MASK;                       \
1053  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
1054  xFilter0 = xPtr[0];                                                   \
1055  xFilter1 = xPtr[1];                                                   \
1056  xFilter2 = xPtr[2];                                                   \
1057  xFilter3 = xPtr[3];                                                   \
1058  X += dX;                                                              \
1059  Y += dY
1060
1061/***************************************************************/
/*
 * RESULT_4BC_S16_1PIXEL: non-pipelined filtering of one 4-channel pixel
 * from the rows and coefficients already held in locals (as left by
 * LOAD_BC_S16_4CH_1PIXEL).  The result is left in res.
 *
 * Steps, in data-flow order (the statements are interleaved):
 *   1. Each rowRC (R = 0..3, C = 0..3) is XORed with mask8000 and
 *      multiplied by yFilterR using a vis_fmul8sux16 / vis_fmul8ulx16
 *      pair whose halves are added with vis_fpadd16 (vRC).
 *   2. The four vRC of each column C are accumulated into sumC.
 *   3. Each sumC is multiplied by xFilterC with the same sux16/ulx16
 *      pair, giving d0..d3.
 *   4. d0..d3 are summed lane-wise with vis_fpadd16; unlike the
 *      3-channel macros no vis_alignaddr / vis_faligndata shuffling is
 *      involved.
 *   5. The high and low halves of the sum go through vis_fmuld8sux16
 *      with f_x01000100, are packed with vis_fpackfix_pair, and the
 *      packed value is XORed with mask8000 before being assigned to res.
 *
 * Overwrites u**, v**, sum0..sum3 and d0..d3; the row and filter locals
 * are only read.
 */
1062#define RESULT_4BC_S16_1PIXEL()                                 \
1063  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);    \
1064  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);    \
1065  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);    \
1066  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);    \
1067  v00 = vis_fpadd16(u00, u01);                                  \
1068  u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);    \
1069  v01 = vis_fpadd16(u10, u11);                                  \
1070  u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);    \
1071  u30 = vis_fmul8sux16(vis_fxor(row03, mask8000), yFilter0);    \
1072  u31 = vis_fmul8ulx16(vis_fxor(row03, mask8000), yFilter0);    \
1073  v02 = vis_fpadd16(u20, u21);                                  \
1074  u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);    \
1075  u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);    \
1076  v03 = vis_fpadd16(u30, u31);                                  \
1077  u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);    \
1078  u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);    \
1079  v10 = vis_fpadd16(u00, u01);                                  \
1080  u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);    \
1081  v11 = vis_fpadd16(u10, u11);                                  \
1082  u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);    \
1083  u30 = vis_fmul8sux16(vis_fxor(row13, mask8000), yFilter1);    \
1084  u31 = vis_fmul8ulx16(vis_fxor(row13, mask8000), yFilter1);    \
1085  u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);    \
1086  v12 = vis_fpadd16(u20, u21);                                  \
1087  u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);    \
1088  v13 = vis_fpadd16(u30, u31);                                  \
1089  u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);    \
1090  u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);    \
1091  v20 = vis_fpadd16(u00, u01);                                  \
1092  u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);    \
1093  sum0 = vis_fpadd16(v00, v10);                                 \
1094  u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);    \
1095  u30 = vis_fmul8sux16(vis_fxor(row23, mask8000), yFilter2);    \
1096  u31 = vis_fmul8ulx16(vis_fxor(row23, mask8000), yFilter2);    \
1097  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);    \
1098  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);    \
1099  v21 = vis_fpadd16(u10, u11);                                  \
1100  sum1 = vis_fpadd16(v01, v11);                                 \
1101  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);    \
1102  sum2 = vis_fpadd16(v02, v12);                                 \
1103  sum3 = vis_fpadd16(v03, v13);                                 \
1104  v22 = vis_fpadd16(u20, u21);                                  \
1105  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);    \
1106  sum0 = vis_fpadd16(sum0, v20);                                \
1107  u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);    \
1108  u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);    \
1109  v23 = vis_fpadd16(u30, u31);                                  \
1110  v30 = vis_fpadd16(u00, u01);                                  \
1111  sum1 = vis_fpadd16(sum1, v21);                                \
1112  u30 = vis_fmul8sux16(vis_fxor(row33, mask8000), yFilter3);    \
1113  u31 = vis_fmul8ulx16(vis_fxor(row33, mask8000), yFilter3);    \
1114  v31 = vis_fpadd16(u10, u11);                                  \
1115  sum2 = vis_fpadd16(sum2, v22);                                \
1116  sum3 = vis_fpadd16(sum3, v23);                                \
1117  v32 = vis_fpadd16(u20, u21);                                  \
1118  sum0 = vis_fpadd16(sum0, v30);                                \
1119  v33 = vis_fpadd16(u30, u31);                                  \
1120  v00 = vis_fmul8sux16(sum0, xFilter0);                         \
1121  sum1 = vis_fpadd16(sum1, v31);                                \
1122  sum2 = vis_fpadd16(sum2, v32);                                \
1123  v01 = vis_fmul8ulx16(sum0, xFilter0);                         \
1124  v10 = vis_fmul8sux16(sum1, xFilter1);                         \
1125  sum3 = vis_fpadd16(sum3, v33);                                \
1126  v11 = vis_fmul8ulx16(sum1, xFilter1);                         \
1127  d0 = vis_fpadd16(v00, v01);                                   \
1128  v20 = vis_fmul8sux16(sum2, xFilter2);                         \
1129  v21 = vis_fmul8ulx16(sum2, xFilter2);                         \
1130  d1 = vis_fpadd16(v10, v11);                                   \
1131  v30 = vis_fmul8sux16(sum3, xFilter3);                         \
1132  v31 = vis_fmul8ulx16(sum3, xFilter3);                         \
1133  d2 = vis_fpadd16(v20, v21);                                   \
1134  d3 = vis_fpadd16(v30, v31);                                   \
1135  d0 = vis_fpadd16(d0, d1);                                     \
1136  d2 = vis_fpadd16(d2, d3);                                     \
1137  d0 = vis_fpadd16(d0, d2);                                     \
1138  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1139  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1140  res = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
1141
1142/***************************************************************/
/*
 * BC_S16_4CH(mlib_filters_s16_4)
 *
 * One software-pipelined step of the 4-channel 4x4 filter.  The arithmetic
 * for the pixel whose neighbourhood is already held in row00..row33 is
 * interleaved with the loads for the following pixel, so the statement
 * order is significant: every rowRC, yFilterR and xFilterC is overwritten
 * only after its last use for the current pixel.
 *
 * Current pixel (uses row00..row33, yFilter0..3, xFilter0..3 as on entry):
 *   - each rowRC is XORed with mask8000 and multiplied by yFilterR with the
 *     vis_fmul8sux16 / vis_fmul8ulx16 pair, whose two partial products are
 *     added with vis_fpadd16 to give vRC;
 *   - sumC = v0C + v1C + v2C + v3C, i.e. the four source rows are combined
 *     for each horizontal tap C;
 *   - sumC is multiplied by xFilterC in the same way and the products are
 *     left in d0..d3.  They are not added together here; FADD_4BC_S16()
 *     does that and produces the packed result.
 *
 * Following pixel:
 *   - filterposy / filterposx are taken from Y / X before this macro adds
 *     dY / dX, masked with FILTER_MASK, and used (times 4, as a byte offset)
 *     to index the table passed as mlib_filters_s16_4; four mlib_d64
 *     coefficients are read for each axis;
 *   - starting from sPtr as it is on entry, four source rows that are
 *     srcYStride elements apart are read: five 8-byte words per row,
 *     combined pairwise with vis_faligndata into rowR0..rowR3.
 *
 * On exit X and Y have been advanced by dX and dY, xSrc / ySrc recomputed
 * from them, and sPtr points at lineAddr[ySrc] + 4 * xSrc.
 *
 * The macro expects every variable it names (rows, data0..4, u/v/sum/d
 * temporaries, filters, pointers, X, Y, dX, dY, lineAddr, srcYStride,
 * mask8000, ...) to be in scope at the expansion site.  vis_alignaddr()
 * is called here, so the caller must not rely on the previous GSR
 * alignment offset afterwards.
 */
1143#define BC_S16_4CH(mlib_filters_s16_4)                                  \
1144  u00 = vis_fmul8sux16(vis_fxor(row00, mask8000), yFilter0);            \
1145  u01 = vis_fmul8ulx16(vis_fxor(row00, mask8000), yFilter0);            \
1146  u10 = vis_fmul8sux16(vis_fxor(row01, mask8000), yFilter0);            \
1147  u11 = vis_fmul8ulx16(vis_fxor(row01, mask8000), yFilter0);            \
1148  v00 = vis_fpadd16(u00, u01);                                          \
1149  u20 = vis_fmul8sux16(vis_fxor(row02, mask8000), yFilter0);            \
1150  v01 = vis_fpadd16(u10, u11);                                          \
1151  u21 = vis_fmul8ulx16(vis_fxor(row02, mask8000), yFilter0);            \
1152  u30 = vis_fmul8sux16(vis_fxor(row03, mask8000), yFilter0);            \
1153  u31 = vis_fmul8ulx16(vis_fxor(row03, mask8000), yFilter0);            \
1154  v02 = vis_fpadd16(u20, u21);                                          \
1155  dpSrc = vis_alignaddr(sPtr, 0); /* next pixel: source row 0 */        \
1156  u00 = vis_fmul8sux16(vis_fxor(row10, mask8000), yFilter1);            \
1157  u01 = vis_fmul8ulx16(vis_fxor(row10, mask8000), yFilter1);            \
1158  data0 = dpSrc[0];                                                     \
1159  filterposy = (Y >> FILTER_SHIFT); /* before Y += dY */                \
1160  v03 = vis_fpadd16(u30, u31);                                          \
1161  data1 = dpSrc[1];                                                     \
1162  u10 = vis_fmul8sux16(vis_fxor(row11, mask8000), yFilter1);            \
1163  data2 = dpSrc[2];                                                     \
1164  u11 = vis_fmul8ulx16(vis_fxor(row11, mask8000), yFilter1);            \
1165  v10 = vis_fpadd16(u00, u01);                                          \
1166  data3 = dpSrc[3];                                                     \
1167  u20 = vis_fmul8sux16(vis_fxor(row12, mask8000), yFilter1);            \
1168  v11 = vis_fpadd16(u10, u11);                                          \
1169  data4 = dpSrc[4];                                                     \
1170  u21 = vis_fmul8ulx16(vis_fxor(row12, mask8000), yFilter1);            \
1171  row00 = vis_faligndata(data0, data1);                                 \
1172  u30 = vis_fmul8sux16(vis_fxor(row13, mask8000), yFilter1);            \
1173  row01 = vis_faligndata(data1, data2);                                 \
1174  u31 = vis_fmul8ulx16(vis_fxor(row13, mask8000), yFilter1);            \
1175  row02 = vis_faligndata(data2, data3);                                 \
1176  u00 = vis_fmul8sux16(vis_fxor(row20, mask8000), yFilter2);            \
1177  row03 = vis_faligndata(data3, data4);                                 \
1178  filterposx = (X >> FILTER_SHIFT); /* before X += dX */                \
1179  sPtr += srcYStride; /* next pixel: source row 1 */                    \
1180  v12 = vis_fpadd16(u20, u21);                                          \
1181  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1182  u01 = vis_fmul8ulx16(vis_fxor(row20, mask8000), yFilter2);            \
1183  v13 = vis_fpadd16(u30, u31);                                          \
1184  data0 = dpSrc[0];                                                     \
1185  u10 = vis_fmul8sux16(vis_fxor(row21, mask8000), yFilter2);            \
1186  X += dX;                                                              \
1187  data1 = dpSrc[1];                                                     \
1188  u11 = vis_fmul8ulx16(vis_fxor(row21, mask8000), yFilter2);            \
1189  v20 = vis_fpadd16(u00, u01);                                          \
1190  data2 = dpSrc[2];                                                     \
1191  u20 = vis_fmul8sux16(vis_fxor(row22, mask8000), yFilter2);            \
1192  sum0 = vis_fpadd16(v00, v10); /* tap 0: rows 0 and 1 */               \
1193  data3 = dpSrc[3];                                                     \
1194  u21 = vis_fmul8ulx16(vis_fxor(row22, mask8000), yFilter2);            \
1195  data4 = dpSrc[4];                                                     \
1196  row10 = vis_faligndata(data0, data1);                                 \
1197  u30 = vis_fmul8sux16(vis_fxor(row23, mask8000), yFilter2);            \
1198  row11 = vis_faligndata(data1, data2);                                 \
1199  u31 = vis_fmul8ulx16(vis_fxor(row23, mask8000), yFilter2);            \
1200  row12 = vis_faligndata(data2, data3);                                 \
1201  u00 = vis_fmul8sux16(vis_fxor(row30, mask8000), yFilter3);            \
1202  row13 = vis_faligndata(data3, data4);                                 \
1203  sPtr += srcYStride; /* next pixel: source row 2 */                    \
1204  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1205  u01 = vis_fmul8ulx16(vis_fxor(row30, mask8000), yFilter3);            \
1206  v21 = vis_fpadd16(u10, u11);                                          \
1207  Y += dY;                                                              \
1208  xSrc = (X >> MLIB_SHIFT)-1;                                           \
1209  sum1 = vis_fpadd16(v01, v11);                                         \
1210  data0 = dpSrc[0];                                                     \
1211  u10 = vis_fmul8sux16(vis_fxor(row31, mask8000), yFilter3);            \
1212  sum2 = vis_fpadd16(v02, v12);                                         \
1213  sum3 = vis_fpadd16(v03, v13);                                         \
1214  ySrc = (Y >> MLIB_SHIFT)-1;                                           \
1215  data1 = dpSrc[1];                                                     \
1216  v22 = vis_fpadd16(u20, u21);                                          \
1217  u11 = vis_fmul8ulx16(vis_fxor(row31, mask8000), yFilter3);            \
1218  data2 = dpSrc[2];                                                     \
1219  sum0 = vis_fpadd16(sum0, v20);                                        \
1220  u20 = vis_fmul8sux16(vis_fxor(row32, mask8000), yFilter3);            \
1221  data3 = dpSrc[3];                                                     \
1222  u21 = vis_fmul8ulx16(vis_fxor(row32, mask8000), yFilter3);            \
1223  v23 = vis_fpadd16(u30, u31);                                          \
1224  data4 = dpSrc[4];                                                     \
1225  v30 = vis_fpadd16(u00, u01);                                          \
1226  filterposy &= FILTER_MASK;                                            \
1227  row20 = vis_faligndata(data0, data1);                                 \
1228  sum1 = vis_fpadd16(sum1, v21);                                        \
1229  u30 = vis_fmul8sux16(vis_fxor(row33, mask8000), yFilter3);            \
1230  row21 = vis_faligndata(data1, data2);                                 \
1231  u31 = vis_fmul8ulx16(vis_fxor(row33, mask8000), yFilter3);            \
1232  row22 = vis_faligndata(data2, data3);                                 \
1233  row23 = vis_faligndata(data3, data4);                                 \
1234  sPtr += srcYStride; /* next pixel: source row 3 */                    \
1235  filterposx &= FILTER_MASK;                                            \
1236  v31 = vis_fpadd16(u10, u11);                                          \
1237  dpSrc = vis_alignaddr(sPtr, 0);                                       \
1238  data0 = dpSrc[0];                                                     \
1239  sum2 = vis_fpadd16(sum2, v22);                                        \
1240  sum3 = vis_fpadd16(sum3, v23);                                        \
1241  data1 = dpSrc[1];                                                     \
1242  v32 = vis_fpadd16(u20, u21);                                          \
1243  data2 = dpSrc[2];                                                     \
1244  sum0 = vis_fpadd16(sum0, v30);                                        \
1245  data3 = dpSrc[3];                                                     \
1246  v33 = vis_fpadd16(u30, u31);                                          \
1247  data4 = dpSrc[4];                                                     \
1248  row30 = vis_faligndata(data0, data1);                                 \
1249  v00 = vis_fmul8sux16(sum0, xFilter0); /* horizontal pass */           \
1250  row31 = vis_faligndata(data1, data2);                                 \
1251  row32 = vis_faligndata(data2, data3);                                 \
1252  row33 = vis_faligndata(data3, data4);                                 \
1253  yPtr = ((mlib_d64 *) ((mlib_u8 *)mlib_filters_s16_4 + filterposy*4)); \
1254  sum1 = vis_fpadd16(sum1, v31);                                        \
1255  yFilter0 = yPtr[0]; /* y taps for the next pixel */                   \
1256  sum2 = vis_fpadd16(sum2, v32);                                        \
1257  v01 = vis_fmul8ulx16(sum0, xFilter0);                                 \
1258  yFilter1 = yPtr[1];                                                   \
1259  v10 = vis_fmul8sux16(sum1, xFilter1);                                 \
1260  sum3 = vis_fpadd16(sum3, v33);                                        \
1261  yFilter2 = yPtr[2];                                                   \
1262  v11 = vis_fmul8ulx16(sum1, xFilter1);                                 \
1263  d0 = vis_fpadd16(v00, v01);                                           \
1264  yFilter3 = yPtr[3];                                                   \
1265  xPtr = ((mlib_d64 *)((mlib_u8 *)mlib_filters_s16_4 + filterposx*4));  \
1266  v20 = vis_fmul8sux16(sum2, xFilter2);                                 \
1267  xFilter0 = xPtr[0]; /* x taps for the next pixel */                   \
1268  v21 = vis_fmul8ulx16(sum2, xFilter2);                                 \
1269  d1 = vis_fpadd16(v10, v11);                                           \
1270  xFilter1 = xPtr[1];                                                   \
1271  v30 = vis_fmul8sux16(sum3, xFilter3);                                 \
1272  v31 = vis_fmul8ulx16(sum3, xFilter3);                                 \
1273  d2 = vis_fpadd16(v20, v21);                                           \
1274  xFilter2 = xPtr[2];                                                   \
1275  d3 = vis_fpadd16(v30, v31);                                           \
1276  xFilter3 = xPtr[3];                                                   \
1277  sPtr = (mlib_s16 *)lineAddr[ySrc] + (xSrc << 2)
1278
1279/***************************************************************/
/*
 * FADD_4BC_S16()
 *
 * Final reduction for one 4-channel pixel.  Takes the four per-tap
 * products d0..d3 (as left by BC_S16_4CH) and produces the packed output
 * value in res:
 *   - d0 + d1 + d2 + d3 is accumulated into d0 with vis_fpadd16;
 *   - the high and low halves of d0 are each passed through
 *     vis_fmuld8sux16 with the constant f_x01000100, giving two mlib_d64
 *     intermediates (d2, d3 are reused as temporaries for this);
 *   - vis_fpackfix_pair(d2, d3) packs them back into one mlib_d64, which
 *     is XORed with mask8000 (the same mask that is applied to the source
 *     samples before the multiplications).
 *
 * NOTE(review): the pack step presumably uses the GSR scale factor, which
 * the caller sets with vis_write_gsr() - confirm against the VIS manual.
 *
 * d0, d2 and d3 are clobbered; d1 is only read.
 */
1280#define FADD_4BC_S16()                                          \
1281  d0 = vis_fpadd16(d0, d1);                                     \
1282  d2 = vis_fpadd16(d2, d3);                                     \
1283  d0 = vis_fpadd16(d0, d2);                                     \
1284  d2 = vis_fmuld8sux16(f_x01000100, vis_read_hi(d0));           \
1285  d3 = vis_fmuld8sux16(f_x01000100, vis_read_lo(d0));           \
1286  res = vis_fxor(vis_fpackfix_pair(d2, d3), mask8000)
1287
1288/***************************************************************/
/*
 * mlib_ImageAffine_u16_4ch_bc
 *
 * Affine-transform row loop for 4-channel images with 16-bit samples,
 * using a 4x4 filter and VIS intrinsics.
 *
 * param  - affine parameter block.  It is not touched directly here; it is
 *          consumed by DECLAREVAR_BC() and CLIP(), which are defined outside
 *          this block and supply the remaining locals used below (filter,
 *          j, yStart, yFinish, xLeft, xRight, X, Y, dX, dY, lineAddr,
 *          srcYStride, dstData, dstPixelPtr, ... - presumably; confirm
 *          against those macros).
 * return - always MLIB_SUCCESS.
 *
 * Each destination pixel is 4 x 16 bit = 8 bytes and is written with a pair
 * of partial stores, so the destination does not have to be 8-byte aligned.
 * Lines with at least four pixels go through a software pipeline built from
 * BC_S16_4CH / FADD_4BC_S16; the remaining pixels (and short lines) are
 * processed one at a time.
 */
1289mlib_status mlib_ImageAffine_u16_4ch_bc (mlib_affine_param *param)
1290{
1291  DECLAREVAR_BC();
1292  DTYPE  *dstLineEnd;
1293  mlib_s32  filterposx, filterposy;
1294  mlib_d64  data0, data1, data2, data3, data4;
1295  mlib_d64  sum0, sum1, sum2, sum3;
1296  mlib_d64  row00, row10, row20, row30;
1297  mlib_d64  row01, row11, row21, row31;
1298  mlib_d64  row02, row12, row22, row32;
1299  mlib_d64  row03, row13, row23, row33;
1300  mlib_d64  xFilter0, xFilter1, xFilter2, xFilter3;
1301  mlib_d64  yFilter0, yFilter1, yFilter2, yFilter3;
1302  mlib_d64  v00, v01, v02, v03, v10, v11, v12, v13;
1303  mlib_d64  v20, v21, v22, v23, v30, v31, v32, v33;
1304  mlib_d64  u00, u01, u10, u11, u20, u21, u30, u31;
1305  mlib_d64  d0, d1, d2, d3;
1306  mlib_d64 *yPtr, *xPtr;
1307  mlib_d64 *dp, *dpSrc;
1308  mlib_s32  cols, i, mask, gsrd;
1309  mlib_d64  res;
1310  mlib_f32  f_x01000100 = vis_to_float(0x01000100);
1311  mlib_d64  mask8000 = vis_to_double_dup(0x80008000);
1312  const mlib_s16 *mlib_filters_table_4;
1313
  /* Pick the coefficient table: the bc table for MLIB_BICUBIC, the bc2
   * table for any other filter value. */
1314  if (filter == MLIB_BICUBIC) {
1315    mlib_filters_table_4 = mlib_filters_s16_bc_4;
1316  } else {
1317    mlib_filters_table_4 = mlib_filters_s16_bc2_4;
1318  }
1319
  /* sPtr is an mlib_s16 pointer and is stepped by srcYStride, so the stride
   * is halved here (presumably it arrives in bytes - confirm). */
1320  srcYStride >>= 1;
1321
1322  for (j = yStart; j <= yFinish; j++) {
1323
    /* NOTE(review): 10 << 3 presumably selects the GSR scale factor used by
     * the fpackfix step in FADD_4BC_S16 - confirm against the GSR layout.
     * It is rewritten for every line. */
1324    vis_write_gsr(10 << 3);
1325
    /* CLIP is defined elsewhere; presumably it sets xLeft, xRight, X, Y and
     * dstPixelPtr for line j (and may skip empty lines) - confirm. */
1326    CLIP(4);
1327    dstLineEnd  = (DTYPE*)dstData + 4 * xRight;
1328
1329    cols = xRight - xLeft + 1;
1330    dp = vis_alignaddr(dstPixelPtr, 0);
    /* dstLineEnd now addresses the last 16-bit sample of pixel xRight. */
1331    dstLineEnd += 3;
    /* Lane mask for the first 8-byte word of the line; its complement is
     * used for the word that follows. */
1332    mask = vis_edge16(dstPixelPtr, dstLineEnd);
    /* Number of bytes from dstPixelPtr up to the next 8-byte boundary
     * (0 when already aligned); used below as the faligndata offset. */
1333    gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
1334
1335    i = 0;
1336
    /* Pipelined path, taken only when the line has at least 4 pixels. */
1337    if (i <= cols - 4) {
1338
      /* Prologue.  NEXT_PIXEL_4BC_S16, LOAD_BC_S16_4CH_1PIXEL and
       * RESULT_4BC_S16_1PIXEL are defined outside this block; presumably
       * they position sPtr, load one pixel's rows/filters, and compute res
       * from the loaded rows, respectively - confirm. */
1339      NEXT_PIXEL_4BC_S16();
1340      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1341
1342      NEXT_PIXEL_4BC_S16();
1343
1344      BC_S16_4CH(mlib_filters_table_4);
1345      FADD_4BC_S16();
1346
1347      BC_S16_4CH(mlib_filters_table_4);
1348
      /* Steady state: cols - 4 iterations.  Each one stores the res that is
       * ready, finishes the next pixel with FADD_4BC_S16, and starts the
       * one after with BC_S16_4CH. */
1349#pragma pipeloop(0)
1350      for (; i < cols-4; i++) {
        /* BC_S16_4CH changed the GSR alignment offset, so it is set back to
         * gsrd before res is realigned for the destination. */
1351        vis_alignaddr((void *)gsrd, 0);
1352        res = vis_faligndata(res, res);
1353
        /* Two partial stores with complementary masks into consecutive
         * 8-byte words write one (possibly unaligned) pixel. */
1354        vis_pst_16(res, dp++, mask);
1355        vis_pst_16(res, dp, ~mask);
1356
1357        FADD_4BC_S16();
1358        BC_S16_4CH(mlib_filters_table_4);
1359      }
1360
      /* Epilogue: drain the pipeline with four more stores, so the
       * pipelined path writes (cols - 4) + 4 = cols pixels in total. */
1361      vis_alignaddr((void *)gsrd, 0);
1362      res = vis_faligndata(res, res);
1363      vis_pst_16(res, dp++, mask);
1364      vis_pst_16(res, dp, ~mask);
1365
1366      FADD_4BC_S16();
1367      vis_alignaddr((void *)gsrd, 0);
1368      res = vis_faligndata(res, res);
1369      vis_pst_16(res, dp++, mask);
1370      vis_pst_16(res, dp, ~mask);
1371
1372      RESULT_4BC_S16_1PIXEL();
1373      vis_alignaddr((void *)gsrd, 0);
1374      res = vis_faligndata(res, res);
1375      vis_pst_16(res, dp++, mask);
1376      vis_pst_16(res, dp, ~mask);
1377
1378      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1379      RESULT_4BC_S16_1PIXEL();
1380      vis_alignaddr((void *)gsrd, 0);
1381      res = vis_faligndata(res, res);
1382      vis_pst_16(res, dp++, mask);
1383      vis_pst_16(res, dp, ~mask);
1384      i += 4;
1385    }
1386
    /* Non-pipelined path: any pixels not handled above (all of them when
     * cols < 4), one full load/compute/store per pixel. */
1387#pragma pipeloop(0)
1388    for (; i < cols; i++) {
1389      NEXT_PIXEL_4BC_S16();
1390      LOAD_BC_S16_4CH_1PIXEL(mlib_filters_table_4);
1391      RESULT_4BC_S16_1PIXEL();
1392      vis_alignaddr((void *)gsrd, 0);
1393      res = vis_faligndata(res, res);
1394      vis_pst_16(res, dp++, mask);
1395      vis_pst_16(res, dp, ~mask);
1396    }
1397  }
1398
1399  return MLIB_SUCCESS;
1400}
1401
1402/***************************************************************/
1403