1/* 2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
 */



/*
 * FUNCTION
 *      mlib_convMxNext_u8 - convolve an 8-bit image, MxN kernel,
 *                           edge = src extended
 *
 * SYNOPSIS
 *      mlib_status mlib_convMxNext_u8(mlib_image       *dst,
 *                                     const mlib_image *src,
 *                                     const mlib_s32   *kernel,
 *                                     mlib_s32         kwid,
 *                                     mlib_s32         khgt,
 *                                     mlib_s32         dx_l,
 *                                     mlib_s32         dx_r,
 *                                     mlib_s32         dy_t,
 *                                     mlib_s32         dy_b,
 *                                     mlib_s32         discardbits,
 *                                     mlib_s32         cmask)
 *
 * ARGUMENT
 *      src         Ptr to source image structure
 *      dst         Ptr to destination image structure
 *      kernel      Ptr to convolution kernel
 *      kwid        Kernel width (# of cols)
 *      khgt        Kernel height (# of rows)
 *      dx_l, dx_r  Number of columns added on the left / right of each
 *                  source row by replicating the edge pixel
 *      dy_t, dy_b  Number of rows added at the top / bottom of the source
 *                  by repeating the edge row
 *      discardbits The number of LSBits of the 32-bit accumulator that
 *                  are discarded when the 32-bit accumulator is converted
 *                  to 16-bit output data; discardbits must be 1-15 (it
 *                  cannot be zero). Same as exponent N for scalefac=2**N.
 *      cmask       Channel mask to indicate the channels to be convolved.
 *                  Each bit of which represents a channel in the image. The
 *                  channels corresponding to 1 bits are those to be processed.
 *
 * DESCRIPTION
 *      A 2-D convolution (MxN kernel) for 8-bit images.
62 * 63 */ 64 65#include "vis_proto.h" 66#include "mlib_image.h" 67#include "mlib_ImageCopy.h" 68#include "mlib_ImageConv.h" 69#include "mlib_c_ImageConv.h" 70#include "mlib_v_ImageChannelExtract.h" 71#include "mlib_v_ImageChannelInsert.h" 72 73/***************************************************************/ 74static mlib_status mlib_convMxN_8ext_f(mlib_image *dst, 75 const mlib_image *src, 76 mlib_s32 m, 77 mlib_s32 n, 78 mlib_s32 dx_l, 79 mlib_s32 dx_r, 80 mlib_s32 dy_t, 81 mlib_s32 dy_b, 82 const mlib_s32 *kern, 83 mlib_s32 scale); 84 85static mlib_status mlib_convMxN_8ext_mask(mlib_image *dst, 86 const mlib_image *src, 87 mlib_s32 m, 88 mlib_s32 n, 89 mlib_s32 dx_l, 90 mlib_s32 dx_r, 91 mlib_s32 dy_t, 92 mlib_s32 dy_b, 93 const mlib_s32 *kern, 94 mlib_s32 scale, 95 mlib_s32 cmask); 96 97/***************************************************************/ 98static mlib_s32 mlib_round_8[16] = { 99 0x00400040, 0x00200020, 0x00100010, 0x00080008, 100 0x00040004, 0x00020002, 0x00010001, 0x00000000, 101 0x00000000, 0x00000000, 0x00000000, 0x00000000, 102 0x00000000, 0x00000000, 0x00000000, 0x00000000 103}; 104 105/***************************************************************/ 106mlib_status mlib_convMxNext_u8(mlib_image *dst, 107 const mlib_image *src, 108 const mlib_s32 *kernel, 109 mlib_s32 kwid, 110 mlib_s32 khgt, 111 mlib_s32 dx_l, 112 mlib_s32 dx_r, 113 mlib_s32 dy_t, 114 mlib_s32 dy_b, 115 mlib_s32 discardbits, 116 mlib_s32 cmask) 117{ 118 mlib_s32 nchannel, amask; 119 120 if (mlib_ImageConvVersion(kwid, khgt, discardbits, MLIB_BYTE) == 0) 121 return mlib_c_convMxNext_u8(dst, src, kernel, kwid, khgt, 122 dx_l, dx_r, dy_t, dy_b, discardbits, cmask); 123 124 nchannel = mlib_ImageGetChannels(src); 125 126 if (nchannel == 1) 127 cmask = 1; 128 amask = (1 << nchannel) - 1; 129 130 if ((cmask & amask) == amask) { 131 return mlib_convMxN_8ext_f(dst, src, kwid, khgt, dx_l, dx_r, dy_t, dy_b, kernel, 132 discardbits); 133 } 134 else { 135 return 
mlib_convMxN_8ext_mask(dst, src, kwid, khgt, dx_l, dx_r, dy_t, dy_b, kernel, 136 discardbits, cmask); 137 } 138} 139 140#define MAX_N 11 141 142/***************************************************************/ 143mlib_status mlib_convMxN_8ext_f(mlib_image *dst, 144 const mlib_image *src, 145 mlib_s32 m, 146 mlib_s32 n, 147 mlib_s32 dx_l, 148 mlib_s32 dx_r, 149 mlib_s32 dy_t, 150 mlib_s32 dy_b, 151 const mlib_s32 *kern, 152 mlib_s32 scale) 153{ 154 mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff; 155 mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe; 156 mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3; 157 mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31; 158 mlib_d64 dd, d0, d1; 159 mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff; 160 mlib_u8 *sl, *dl; 161 mlib_s32 hgt = mlib_ImageGetHeight(src); 162 mlib_s32 wid = mlib_ImageGetWidth(src); 163 mlib_s32 sll = mlib_ImageGetStride(src); 164 mlib_s32 dll = mlib_ImageGetStride(dst); 165 mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src); 166 mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst); 167 mlib_s32 ssize, xsize, dsize, esize, buff_ind = 0; 168 mlib_d64 *pbuff, *dp; 169 mlib_f32 *karr = (mlib_f32 *) kern; 170 mlib_s32 gsr_scale = (31 - scale) << 3; 171 mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]); 172 mlib_s32 i, j, l, ii; 173 mlib_s32 nchan = mlib_ImageGetChannels(dst); 174 175 if (n > MAX_N) { 176 buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *)); 177 178 if (buffs == NULL) 179 return MLIB_FAILURE; 180 } 181 182 buff = buffs + 2 * (n + 1); 183 184 sl = adr_src; 185 dl = adr_dst; 186 187 ssize = nchan * (wid + (m - 1)); 188 dsize = (ssize + 7) / 8; 189 esize = dsize + 4; 190 pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64)); 191 192 if (pbuff == NULL) { 193 if (buffs != buffs_local) 194 mlib_free(buffs); 195 return MLIB_FAILURE; 196 } 197 198 for (i = 0; i < (n + 1); i++) 199 buffs[i] = pbuff + i * esize; 200 for (i = 0; i < (n + 
1); i++) 201 buffs[(n + 1) + i] = buffs[i]; 202 buffd = buffs[n] + esize; 203 buffe = buffd + 2 * esize; 204 205 xsize = ssize - nchan * (m - 1); 206 ssize -= nchan * (dx_l + dx_r); 207 208 vis_write_gsr(gsr_scale + 7); 209 210 for (l = 0; l < n; l++) { 211 mlib_d64 *buffn = buffs[l]; 212 213 mlib_ImageCopy_na((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l * nchan, ssize); 214 215 for (i = 0; i < nchan; i++) { 216 for (ii = 0; ii < dx_l; ii++) { 217 *((mlib_u8 *) buffn + i + nchan * ii) = *((mlib_u8 *) buffn + i + nchan * dx_l); 218 } 219 } 220 221 for (i = 0; i < nchan; i++) { 222 for (ii = 0; ii < dx_r; ii++) { 223 *((mlib_u8 *) buffn + i + nchan * ii + ssize + dx_l * nchan) = 224 *((mlib_u8 *) buffn + i + nchan * (dx_l - 1) + ssize); 225 } 226 } 227 228 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) 229 sl += sll; 230 } 231 232 /* init buffer */ 233#pragma pipeloop(0) 234 for (i = 0; i < (xsize + 7) / 8; i++) { 235 buffd[2 * i] = drnd; 236 buffd[2 * i + 1] = drnd; 237 } 238 239 for (j = 0; j < hgt; j++) { 240 mlib_d64 **buffc = buffs + buff_ind; 241 mlib_f32 *pk = karr, k0, k1, k2, k3; 242 243 for (l = 0; l < n; l++) { 244 buff[l] = buffc[l]; 245 } 246 247 buffn = buffc[n]; 248 249 mlib_ImageCopy_na((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l * nchan, ssize); 250 251 for (i = 0; i < nchan; i++) { 252 for (ii = 0; ii < dx_l; ii++) { 253 *((mlib_u8 *) buffn + i + nchan * ii) = *((mlib_u8 *) buffn + i + nchan * dx_l); 254 } 255 } 256 257 for (i = 0; i < nchan; i++) { 258 for (ii = 0; ii < dx_r; ii++) { 259 *((mlib_u8 *) buffn + i + nchan * ii + ssize + dx_l * nchan) = 260 *((mlib_u8 *) buffn + i + nchan * (dx_l - 1) + ssize); 261 } 262 } 263 264 ik_last = (m - 1); 265 266 for (jk = 0; jk < n; jk += jk_size) { 267 jk_size = n - jk; 268 269 if (jk_size >= 6) 270 jk_size = 4; 271 if (jk_size == 5) 272 jk_size = 3; 273 274 coff = 0; 275 276 if (jk_size == 1) { 277 278 for (ik = 0; ik < m; ik++, coff += nchan) { 279 if (!jk && ik == ik_last) 280 continue; 281 282 k0 = pk[ik]; 283 
284 doff = coff / 8; 285 buff0 = buff[jk] + doff; 286 287 off = coff & 7; 288 vis_write_gsr(gsr_scale + off); 289 290 s01 = buff0[0]; 291#pragma pipeloop(0) 292 for (i = 0; i < (xsize + 7) / 8; i++) { 293 s00 = s01; 294 s01 = buff0[i + 1]; 295 s0 = vis_faligndata(s00, s01); 296 297 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 298 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 299 300 d0 = buffd[2 * i]; 301 d1 = buffd[2 * i + 1]; 302 d0 = vis_fpadd16(d00, d0); 303 d1 = vis_fpadd16(d01, d1); 304 buffd[2 * i] = d0; 305 buffd[2 * i + 1] = d1; 306 } 307 } 308 309 pk += m; 310 311 } 312 else if (jk_size == 2) { 313 314 for (ik = 0; ik < m; ik++, coff += nchan) { 315 if (!jk && ik == ik_last) 316 continue; 317 318 k0 = pk[ik]; 319 k1 = pk[ik + m]; 320 321 doff = coff / 8; 322 buff0 = buff[jk] + doff; 323 buff1 = buff[jk + 1] + doff; 324 325 off = coff & 7; 326 vis_write_gsr(gsr_scale + off); 327 328 s01 = buff0[0]; 329 s11 = buff1[0]; 330#pragma pipeloop(0) 331 for (i = 0; i < (xsize + 7) / 8; i++) { 332 s00 = s01; 333 s10 = s11; 334 s01 = buff0[i + 1]; 335 s11 = buff1[i + 1]; 336 s0 = vis_faligndata(s00, s01); 337 s1 = vis_faligndata(s10, s11); 338 339 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 340 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 341 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 342 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 343 344 d0 = buffd[2 * i]; 345 d1 = buffd[2 * i + 1]; 346 d0 = vis_fpadd16(d00, d0); 347 d0 = vis_fpadd16(d10, d0); 348 d1 = vis_fpadd16(d01, d1); 349 d1 = vis_fpadd16(d11, d1); 350 buffd[2 * i] = d0; 351 buffd[2 * i + 1] = d1; 352 } 353 } 354 355 pk += 2 * m; 356 357 } 358 else if (jk_size == 3) { 359 360 for (ik = 0; ik < m; ik++, coff += nchan) { 361 if (!jk && ik == ik_last) 362 continue; 363 364 k0 = pk[ik]; 365 k1 = pk[ik + m]; 366 k2 = pk[ik + 2 * m]; 367 368 doff = coff / 8; 369 buff0 = buff[jk] + doff; 370 buff1 = buff[jk + 1] + doff; 371 buff2 = buff[jk + 2] + doff; 372 373 off = coff & 7; 374 vis_write_gsr(gsr_scale + off); 375 376 if 
(off == 0) { 377#pragma pipeloop(0) 378 for (i = 0; i < (xsize + 7) / 8; i++) { 379 d0 = buffd[2 * i]; 380 d1 = buffd[2 * i + 1]; 381 382 s0 = buff0[i]; 383 s1 = buff1[i]; 384 s2 = buff2[i]; 385 386 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 387 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 388 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 389 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 390 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 391 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 392 393 d00 = vis_fpadd16(d00, d10); 394 d0 = vis_fpadd16(d20, d0); 395 d0 = vis_fpadd16(d00, d0); 396 d01 = vis_fpadd16(d01, d11); 397 d1 = vis_fpadd16(d21, d1); 398 d1 = vis_fpadd16(d01, d1); 399 buffd[2 * i] = d0; 400 buffd[2 * i + 1] = d1; 401 } 402 403 } 404 else if (off == 4) { 405 s01 = buff0[0]; 406 s11 = buff1[0]; 407 s21 = buff2[0]; 408#pragma pipeloop(0) 409 for (i = 0; i < (xsize + 7) / 8; i++) { 410 d0 = buffd[2 * i]; 411 d1 = buffd[2 * i + 1]; 412 413 s00 = s01; 414 s10 = s11; 415 s20 = s21; 416 s01 = buff0[i + 1]; 417 s11 = buff1[i + 1]; 418 s21 = buff2[i + 1]; 419 420 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 421 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 422 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 423 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 424 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 425 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 426 427 d00 = vis_fpadd16(d00, d10); 428 d0 = vis_fpadd16(d20, d0); 429 d0 = vis_fpadd16(d00, d0); 430 d01 = vis_fpadd16(d01, d11); 431 d1 = vis_fpadd16(d21, d1); 432 d1 = vis_fpadd16(d01, d1); 433 buffd[2 * i] = d0; 434 buffd[2 * i + 1] = d1; 435 } 436 437 } 438 else { 439 s01 = buff0[0]; 440 s11 = buff1[0]; 441 s21 = buff2[0]; 442#pragma pipeloop(0) 443 for (i = 0; i < (xsize + 7) / 8; i++) { 444 d0 = buffd[2 * i]; 445 d1 = buffd[2 * i + 1]; 446 447 s00 = s01; 448 s10 = s11; 449 s20 = s21; 450 s01 = buff0[i + 1]; 451 s11 = buff1[i + 1]; 452 s21 = buff2[i + 1]; 453 s0 = vis_faligndata(s00, s01); 454 s1 = vis_faligndata(s10, s11); 455 s2 = 
vis_faligndata(s20, s21); 456 457 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 458 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 459 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 460 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 461 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 462 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 463 464 d00 = vis_fpadd16(d00, d10); 465 d0 = vis_fpadd16(d20, d0); 466 d0 = vis_fpadd16(d00, d0); 467 d01 = vis_fpadd16(d01, d11); 468 d1 = vis_fpadd16(d21, d1); 469 d1 = vis_fpadd16(d01, d1); 470 buffd[2 * i] = d0; 471 buffd[2 * i + 1] = d1; 472 } 473 } 474 } 475 476 pk += 3 * m; 477 478 } 479 else { /* jk_size == 4 */ 480 481 for (ik = 0; ik < m; ik++, coff += nchan) { 482 if (!jk && ik == ik_last) 483 continue; 484 485 k0 = pk[ik]; 486 k1 = pk[ik + m]; 487 k2 = pk[ik + 2 * m]; 488 k3 = pk[ik + 3 * m]; 489 490 doff = coff / 8; 491 buff0 = buff[jk] + doff; 492 buff1 = buff[jk + 1] + doff; 493 buff2 = buff[jk + 2] + doff; 494 buff3 = buff[jk + 3] + doff; 495 496 off = coff & 7; 497 vis_write_gsr(gsr_scale + off); 498 499 if (off == 0) { 500 501#pragma pipeloop(0) 502 for (i = 0; i < (xsize + 7) / 8; i++) { 503 d0 = buffd[2 * i]; 504 d1 = buffd[2 * i + 1]; 505 506 s0 = buff0[i]; 507 s1 = buff1[i]; 508 s2 = buff2[i]; 509 s3 = buff3[i]; 510 511 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 512 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 513 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 514 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 515 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 516 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 517 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 518 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 519 520 d00 = vis_fpadd16(d00, d10); 521 d20 = vis_fpadd16(d20, d30); 522 d0 = vis_fpadd16(d0, d00); 523 d0 = vis_fpadd16(d0, d20); 524 d01 = vis_fpadd16(d01, d11); 525 d21 = vis_fpadd16(d21, d31); 526 d1 = vis_fpadd16(d1, d01); 527 d1 = vis_fpadd16(d1, d21); 528 buffd[2 * i] = d0; 529 buffd[2 * i + 1] = d1; 530 } 531 532 } 533 else if (off == 4) { 534 535 
s01 = buff0[0]; 536 s11 = buff1[0]; 537 s21 = buff2[0]; 538 s31 = buff3[0]; 539#pragma pipeloop(0) 540 for (i = 0; i < (xsize + 7) / 8; i++) { 541 d0 = buffd[2 * i]; 542 d1 = buffd[2 * i + 1]; 543 544 s00 = s01; 545 s10 = s11; 546 s20 = s21; 547 s30 = s31; 548 s01 = buff0[i + 1]; 549 s11 = buff1[i + 1]; 550 s21 = buff2[i + 1]; 551 s31 = buff3[i + 1]; 552 553 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 554 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 555 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 556 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 557 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 558 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 559 d30 = vis_fmul8x16au(vis_read_lo(s30), k3); 560 d31 = vis_fmul8x16au(vis_read_hi(s31), k3); 561 562 d00 = vis_fpadd16(d00, d10); 563 d20 = vis_fpadd16(d20, d30); 564 d0 = vis_fpadd16(d0, d00); 565 d0 = vis_fpadd16(d0, d20); 566 d01 = vis_fpadd16(d01, d11); 567 d21 = vis_fpadd16(d21, d31); 568 d1 = vis_fpadd16(d1, d01); 569 d1 = vis_fpadd16(d1, d21); 570 buffd[2 * i] = d0; 571 buffd[2 * i + 1] = d1; 572 } 573 574 } 575 else { 576 577 s01 = buff0[0]; 578 s11 = buff1[0]; 579 s21 = buff2[0]; 580 s31 = buff3[0]; 581#pragma pipeloop(0) 582 for (i = 0; i < (xsize + 7) / 8; i++) { 583 d0 = buffd[2 * i]; 584 d1 = buffd[2 * i + 1]; 585 586 s00 = s01; 587 s10 = s11; 588 s20 = s21; 589 s30 = s31; 590 s01 = buff0[i + 1]; 591 s11 = buff1[i + 1]; 592 s21 = buff2[i + 1]; 593 s31 = buff3[i + 1]; 594 s0 = vis_faligndata(s00, s01); 595 s1 = vis_faligndata(s10, s11); 596 s2 = vis_faligndata(s20, s21); 597 s3 = vis_faligndata(s30, s31); 598 599 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 600 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 601 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 602 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 603 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 604 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 605 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 606 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 607 608 d00 = vis_fpadd16(d00, d10); 609 
d20 = vis_fpadd16(d20, d30); 610 d0 = vis_fpadd16(d0, d00); 611 d0 = vis_fpadd16(d0, d20); 612 d01 = vis_fpadd16(d01, d11); 613 d21 = vis_fpadd16(d21, d31); 614 d1 = vis_fpadd16(d1, d01); 615 d1 = vis_fpadd16(d1, d21); 616 buffd[2 * i] = d0; 617 buffd[2 * i + 1] = d1; 618 } 619 } 620 } 621 622 pk += 4 * m; 623 } 624 } 625 626 /***************************************** 627 ***************************************** 628 ** Final iteration ** 629 ***************************************** 630 *****************************************/ 631 632 jk_size = n; 633 634 if (jk_size >= 6) 635 jk_size = 4; 636 if (jk_size == 5) 637 jk_size = 3; 638 639 k0 = karr[ik_last]; 640 k1 = karr[ik_last + m]; 641 k2 = karr[ik_last + 2 * m]; 642 k3 = karr[ik_last + 3 * m]; 643 644 off = ik_last * nchan; 645 doff = off / 8; 646 off &= 7; 647 buff0 = buff[0] + doff; 648 buff1 = buff[1] + doff; 649 buff2 = buff[2] + doff; 650 buff3 = buff[3] + doff; 651 vis_write_gsr(gsr_scale + off); 652 653 if (jk_size == 1) { 654 dp = buffe; 655 656 s01 = buff0[0]; 657#pragma pipeloop(0) 658 for (i = 0; i < (xsize + 7) / 8; i++) { 659 s00 = s01; 660 s01 = buff0[i + 1]; 661 s0 = vis_faligndata(s00, s01); 662 663 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 664 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 665 666 d0 = buffd[2 * i]; 667 d1 = buffd[2 * i + 1]; 668 d0 = vis_fpadd16(d0, d00); 669 d1 = vis_fpadd16(d1, d01); 670 671 dd = vis_fpack16_pair(d0, d1); 672 dp[i] = dd; 673 674 buffd[2 * i] = drnd; 675 buffd[2 * i + 1] = drnd; 676 } 677 678 } 679 else if (jk_size == 2) { 680 dp = buffe; 681 682 s01 = buff0[0]; 683 s11 = buff1[0]; 684#pragma pipeloop(0) 685 for (i = 0; i < (xsize + 7) / 8; i++) { 686 s00 = s01; 687 s10 = s11; 688 s01 = buff0[i + 1]; 689 s11 = buff1[i + 1]; 690 s0 = vis_faligndata(s00, s01); 691 s1 = vis_faligndata(s10, s11); 692 693 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 694 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 695 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 696 d11 = 
vis_fmul8x16au(vis_read_lo(s1), k1); 697 698 d0 = buffd[2 * i]; 699 d1 = buffd[2 * i + 1]; 700 d0 = vis_fpadd16(d0, d00); 701 d0 = vis_fpadd16(d0, d10); 702 d1 = vis_fpadd16(d1, d01); 703 d1 = vis_fpadd16(d1, d11); 704 705 dd = vis_fpack16_pair(d0, d1); 706 dp[i] = dd; 707 708 buffd[2 * i] = drnd; 709 buffd[2 * i + 1] = drnd; 710 } 711 712 } 713 else if (jk_size == 3) { 714 715 dp = buffe; 716 717 s01 = buff0[0]; 718 s11 = buff1[0]; 719 s21 = buff2[0]; 720#pragma pipeloop(0) 721 for (i = 0; i < (xsize + 7) / 8; i++) { 722 s00 = s01; 723 s10 = s11; 724 s20 = s21; 725 s01 = buff0[i + 1]; 726 s11 = buff1[i + 1]; 727 s21 = buff2[i + 1]; 728 s0 = vis_faligndata(s00, s01); 729 s1 = vis_faligndata(s10, s11); 730 s2 = vis_faligndata(s20, s21); 731 732 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 733 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 734 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 735 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 736 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 737 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 738 739 d0 = buffd[2 * i]; 740 d1 = buffd[2 * i + 1]; 741 d0 = vis_fpadd16(d0, d00); 742 d0 = vis_fpadd16(d0, d10); 743 d0 = vis_fpadd16(d0, d20); 744 d1 = vis_fpadd16(d1, d01); 745 d1 = vis_fpadd16(d1, d11); 746 d1 = vis_fpadd16(d1, d21); 747 748 dd = vis_fpack16_pair(d0, d1); 749 dp[i] = dd; 750 751 buffd[2 * i] = drnd; 752 buffd[2 * i + 1] = drnd; 753 } 754 755 } 756 else { /* if (jk_size == 4) */ 757 758 dp = buffe; 759 760 s01 = buff0[0]; 761 s11 = buff1[0]; 762 s21 = buff2[0]; 763 s31 = buff3[0]; 764#pragma pipeloop(0) 765 for (i = 0; i < (xsize + 7) / 8; i++) { 766 s00 = s01; 767 s10 = s11; 768 s20 = s21; 769 s30 = s31; 770 s01 = buff0[i + 1]; 771 s11 = buff1[i + 1]; 772 s21 = buff2[i + 1]; 773 s31 = buff3[i + 1]; 774 s0 = vis_faligndata(s00, s01); 775 s1 = vis_faligndata(s10, s11); 776 s2 = vis_faligndata(s20, s21); 777 s3 = vis_faligndata(s30, s31); 778 779 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 780 d01 = vis_fmul8x16au(vis_read_lo(s0), 
k0); 781 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 782 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 783 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 784 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 785 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 786 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 787 788 d0 = buffd[2 * i]; 789 d1 = buffd[2 * i + 1]; 790 d0 = vis_fpadd16(d0, d00); 791 d0 = vis_fpadd16(d0, d10); 792 d0 = vis_fpadd16(d0, d20); 793 d0 = vis_fpadd16(d0, d30); 794 d1 = vis_fpadd16(d1, d01); 795 d1 = vis_fpadd16(d1, d11); 796 d1 = vis_fpadd16(d1, d21); 797 d1 = vis_fpadd16(d1, d31); 798 799 dd = vis_fpack16_pair(d0, d1); 800 dp[i] = dd; 801 802 buffd[2 * i] = drnd; 803 buffd[2 * i + 1] = drnd; 804 } 805 } 806 807 mlib_ImageCopy_na((mlib_u8 *) buffe, dl, xsize); 808 809 if (j < hgt - dy_b - 2) 810 sl += sll; 811 dl += dll; 812 813 buff_ind++; 814 815 if (buff_ind >= (n + 1)) 816 buff_ind = 0; 817 } 818 819 mlib_free(pbuff); 820 821 if (buffs != buffs_local) 822 mlib_free(buffs); 823 824 return MLIB_SUCCESS; 825} 826 827/***************************************************************/ 828mlib_status mlib_convMxN_8ext_mask(mlib_image *dst, 829 const mlib_image *src, 830 mlib_s32 m, 831 mlib_s32 n, 832 mlib_s32 dx_l, 833 mlib_s32 dx_r, 834 mlib_s32 dy_t, 835 mlib_s32 dy_b, 836 const mlib_s32 *kern, 837 mlib_s32 scale, 838 mlib_s32 cmask) 839{ 840 mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff; 841 mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe; 842 mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3; 843 mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31; 844 mlib_d64 dd, d0, d1; 845 mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff; 846 mlib_u8 *sl, *dl; 847 mlib_s32 hgt = mlib_ImageGetHeight(src); 848 mlib_s32 wid = mlib_ImageGetWidth(src); 849 mlib_s32 sll = mlib_ImageGetStride(src); 850 mlib_s32 dll = mlib_ImageGetStride(dst); 851 mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src); 852 mlib_u8 *adr_dst = (mlib_u8 
*) mlib_ImageGetData(dst); 853 mlib_s32 ssize, xsize, dsize, esize, buff_ind; 854 mlib_d64 *pbuff, *dp; 855 mlib_f32 *karr = (mlib_f32 *) kern; 856 mlib_s32 gsr_scale = (31 - scale) << 3; 857 mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]); 858 mlib_s32 i, j, l, chan, testchan; 859 mlib_s32 nchan = mlib_ImageGetChannels(dst); 860 void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); 861 void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); 862 863 if (n > MAX_N) { 864 buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *)); 865 866 if (buffs == NULL) 867 return MLIB_FAILURE; 868 } 869 870 buff = buffs + 2 * (n + 1); 871 872 ssize = (wid + (m - 1)); 873 dsize = (ssize + 7) / 8; 874 esize = dsize + 4; 875 pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64)); 876 877 if (pbuff == NULL) { 878 if (buffs != buffs_local) 879 mlib_free(buffs); 880 return MLIB_FAILURE; 881 } 882 883 for (i = 0; i < (n + 1); i++) 884 buffs[i] = pbuff + i * esize; 885 for (i = 0; i < (n + 1); i++) 886 buffs[(n + 1) + i] = buffs[i]; 887 buffd = buffs[n] + esize; 888 buffe = buffd + 2 * esize; 889 890 xsize = wid; 891 ssize -= (dx_l + dx_r); 892 893 vis_write_gsr(gsr_scale + 7); 894 895 if (nchan == 2) { 896 p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1; 897 p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1; 898 } 899 else if (nchan == 3) { 900 p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1; 901 p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1; 902 } 903 else { 904 p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1; 905 p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1; 906 } 907 908 testchan = 1; 909 for (chan = 0; chan < nchan; chan++) { 910 buff_ind = 0; 911 sl = adr_src; 912 dl = adr_dst; 913 914 if ((cmask & testchan) == 0) { 915 testchan <<= 1; 916 continue; 917 } 918 919 for (l = 0; l < n; l++) { 920 mlib_d64 *buffn = buffs[l]; 921 922 (*p_proc_load) ((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l, ssize, testchan); 923 924 for (i = 0; 
i < dx_l; i++) { 925 *((mlib_u8 *) buffn + i) = *((mlib_u8 *) buffn + dx_l); 926 } 927 928 for (i = 0; i < dx_r; i++) { 929 *((mlib_u8 *) buffn + i + ssize + dx_l) = 930 *((mlib_u8 *) buffn + (dx_l - 1) + ssize); 931 } 932 933 if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) 934 sl += sll; 935 } 936 937 /* init buffer */ 938#pragma pipeloop(0) 939 for (i = 0; i < (xsize + 7) / 8; i++) { 940 buffd[2 * i] = drnd; 941 buffd[2 * i + 1] = drnd; 942 } 943 944 for (j = 0; j < hgt; j++) { 945 mlib_d64 **buffc = buffs + buff_ind; 946 mlib_f32 *pk = karr, k0, k1, k2, k3; 947 948 for (l = 0; l < n; l++) { 949 buff[l] = buffc[l]; 950 } 951 952 buffn = buffc[n]; 953 954 (*p_proc_load) ((mlib_u8 *) sl, (mlib_u8 *) buffn + dx_l, ssize, testchan); 955 956 for (i = 0; i < dx_l; i++) { 957 *((mlib_u8 *) buffn + i) = *((mlib_u8 *) buffn + dx_l); 958 } 959 960 for (i = 0; i < dx_r; i++) { 961 *((mlib_u8 *) buffn + i + ssize + dx_l) = 962 *((mlib_u8 *) buffn + (dx_l - 1) + ssize); 963 } 964 965 ik_last = (m - 1); 966 967 for (jk = 0; jk < n; jk += jk_size) { 968 jk_size = n - jk; 969 970 if (jk_size >= 6) 971 jk_size = 4; 972 if (jk_size == 5) 973 jk_size = 3; 974 975 coff = 0; 976 977 if (jk_size == 1) { 978 979 for (ik = 0; ik < m; ik++, coff++) { 980 if (!jk && ik == ik_last) 981 continue; 982 983 k0 = pk[ik]; 984 985 doff = coff / 8; 986 buff0 = buff[jk] + doff; 987 988 off = coff & 7; 989 vis_write_gsr(gsr_scale + off); 990 991 s01 = buff0[0]; 992#pragma pipeloop(0) 993 for (i = 0; i < (xsize + 7) / 8; i++) { 994 s00 = s01; 995 s01 = buff0[i + 1]; 996 s0 = vis_faligndata(s00, s01); 997 998 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 999 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1000 1001 d0 = buffd[2 * i]; 1002 d1 = buffd[2 * i + 1]; 1003 d0 = vis_fpadd16(d00, d0); 1004 d1 = vis_fpadd16(d01, d1); 1005 buffd[2 * i] = d0; 1006 buffd[2 * i + 1] = d1; 1007 } 1008 } 1009 1010 pk += m; 1011 1012 } 1013 else if (jk_size == 2) { 1014 1015 for (ik = 0; ik < m; ik++, coff++) { 1016 if (!jk && 
ik == ik_last) 1017 continue; 1018 1019 k0 = pk[ik]; 1020 k1 = pk[ik + m]; 1021 1022 doff = coff / 8; 1023 buff0 = buff[jk] + doff; 1024 buff1 = buff[jk + 1] + doff; 1025 1026 off = coff & 7; 1027 vis_write_gsr(gsr_scale + off); 1028 1029 s01 = buff0[0]; 1030 s11 = buff1[0]; 1031#pragma pipeloop(0) 1032 for (i = 0; i < (xsize + 7) / 8; i++) { 1033 s00 = s01; 1034 s10 = s11; 1035 s01 = buff0[i + 1]; 1036 s11 = buff1[i + 1]; 1037 s0 = vis_faligndata(s00, s01); 1038 s1 = vis_faligndata(s10, s11); 1039 1040 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1041 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1042 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1043 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1044 1045 d0 = buffd[2 * i]; 1046 d1 = buffd[2 * i + 1]; 1047 d0 = vis_fpadd16(d00, d0); 1048 d0 = vis_fpadd16(d10, d0); 1049 d1 = vis_fpadd16(d01, d1); 1050 d1 = vis_fpadd16(d11, d1); 1051 buffd[2 * i] = d0; 1052 buffd[2 * i + 1] = d1; 1053 } 1054 } 1055 1056 pk += 2 * m; 1057 1058 } 1059 else if (jk_size == 3) { 1060 1061 for (ik = 0; ik < m; ik++, coff++) { 1062 if (!jk && ik == ik_last) 1063 continue; 1064 1065 k0 = pk[ik]; 1066 k1 = pk[ik + m]; 1067 k2 = pk[ik + 2 * m]; 1068 1069 doff = coff / 8; 1070 buff0 = buff[jk] + doff; 1071 buff1 = buff[jk + 1] + doff; 1072 buff2 = buff[jk + 2] + doff; 1073 1074 off = coff & 7; 1075 vis_write_gsr(gsr_scale + off); 1076 1077 if (off == 0) { 1078#pragma pipeloop(0) 1079 for (i = 0; i < (xsize + 7) / 8; i++) { 1080 d0 = buffd[2 * i]; 1081 d1 = buffd[2 * i + 1]; 1082 1083 s0 = buff0[i]; 1084 s1 = buff1[i]; 1085 s2 = buff2[i]; 1086 1087 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1088 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1089 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1090 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1091 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1092 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1093 1094 d00 = vis_fpadd16(d00, d10); 1095 d0 = vis_fpadd16(d20, d0); 1096 d0 = vis_fpadd16(d00, d0); 1097 d01 = vis_fpadd16(d01, 
d11); 1098 d1 = vis_fpadd16(d21, d1); 1099 d1 = vis_fpadd16(d01, d1); 1100 buffd[2 * i] = d0; 1101 buffd[2 * i + 1] = d1; 1102 } 1103 1104 } 1105 else if (off == 4) { 1106 s01 = buff0[0]; 1107 s11 = buff1[0]; 1108 s21 = buff2[0]; 1109#pragma pipeloop(0) 1110 for (i = 0; i < (xsize + 7) / 8; i++) { 1111 d0 = buffd[2 * i]; 1112 d1 = buffd[2 * i + 1]; 1113 1114 s00 = s01; 1115 s10 = s11; 1116 s20 = s21; 1117 s01 = buff0[i + 1]; 1118 s11 = buff1[i + 1]; 1119 s21 = buff2[i + 1]; 1120 1121 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 1122 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 1123 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 1124 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 1125 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 1126 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 1127 1128 d00 = vis_fpadd16(d00, d10); 1129 d0 = vis_fpadd16(d20, d0); 1130 d0 = vis_fpadd16(d00, d0); 1131 d01 = vis_fpadd16(d01, d11); 1132 d1 = vis_fpadd16(d21, d1); 1133 d1 = vis_fpadd16(d01, d1); 1134 buffd[2 * i] = d0; 1135 buffd[2 * i + 1] = d1; 1136 } 1137 1138 } 1139 else { 1140 s01 = buff0[0]; 1141 s11 = buff1[0]; 1142 s21 = buff2[0]; 1143#pragma pipeloop(0) 1144 for (i = 0; i < (xsize + 7) / 8; i++) { 1145 d0 = buffd[2 * i]; 1146 d1 = buffd[2 * i + 1]; 1147 1148 s00 = s01; 1149 s10 = s11; 1150 s20 = s21; 1151 s01 = buff0[i + 1]; 1152 s11 = buff1[i + 1]; 1153 s21 = buff2[i + 1]; 1154 s0 = vis_faligndata(s00, s01); 1155 s1 = vis_faligndata(s10, s11); 1156 s2 = vis_faligndata(s20, s21); 1157 1158 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1159 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1160 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1161 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1162 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1163 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1164 1165 d00 = vis_fpadd16(d00, d10); 1166 d0 = vis_fpadd16(d20, d0); 1167 d0 = vis_fpadd16(d00, d0); 1168 d01 = vis_fpadd16(d01, d11); 1169 d1 = vis_fpadd16(d21, d1); 1170 d1 = vis_fpadd16(d01, d1); 1171 buffd[2 * i] = d0; 
1172 buffd[2 * i + 1] = d1; 1173 } 1174 } 1175 } 1176 1177 pk += 3 * m; 1178 1179 } 1180 else { /* jk_size == 4 */ 1181 1182 for (ik = 0; ik < m; ik++, coff++) { 1183 if (!jk && ik == ik_last) 1184 continue; 1185 1186 k0 = pk[ik]; 1187 k1 = pk[ik + m]; 1188 k2 = pk[ik + 2 * m]; 1189 k3 = pk[ik + 3 * m]; 1190 1191 doff = coff / 8; 1192 buff0 = buff[jk] + doff; 1193 buff1 = buff[jk + 1] + doff; 1194 buff2 = buff[jk + 2] + doff; 1195 buff3 = buff[jk + 3] + doff; 1196 1197 off = coff & 7; 1198 vis_write_gsr(gsr_scale + off); 1199 1200 if (off == 0) { 1201 1202#pragma pipeloop(0) 1203 for (i = 0; i < (xsize + 7) / 8; i++) { 1204 d0 = buffd[2 * i]; 1205 d1 = buffd[2 * i + 1]; 1206 1207 s0 = buff0[i]; 1208 s1 = buff1[i]; 1209 s2 = buff2[i]; 1210 s3 = buff3[i]; 1211 1212 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1213 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1214 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1215 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1216 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1217 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1218 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 1219 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 1220 1221 d00 = vis_fpadd16(d00, d10); 1222 d20 = vis_fpadd16(d20, d30); 1223 d0 = vis_fpadd16(d0, d00); 1224 d0 = vis_fpadd16(d0, d20); 1225 d01 = vis_fpadd16(d01, d11); 1226 d21 = vis_fpadd16(d21, d31); 1227 d1 = vis_fpadd16(d1, d01); 1228 d1 = vis_fpadd16(d1, d21); 1229 buffd[2 * i] = d0; 1230 buffd[2 * i + 1] = d1; 1231 } 1232 1233 } 1234 else if (off == 4) { 1235 1236 s01 = buff0[0]; 1237 s11 = buff1[0]; 1238 s21 = buff2[0]; 1239 s31 = buff3[0]; 1240#pragma pipeloop(0) 1241 for (i = 0; i < (xsize + 7) / 8; i++) { 1242 d0 = buffd[2 * i]; 1243 d1 = buffd[2 * i + 1]; 1244 1245 s00 = s01; 1246 s10 = s11; 1247 s20 = s21; 1248 s30 = s31; 1249 s01 = buff0[i + 1]; 1250 s11 = buff1[i + 1]; 1251 s21 = buff2[i + 1]; 1252 s31 = buff3[i + 1]; 1253 1254 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 1255 d01 = vis_fmul8x16au(vis_read_hi(s01), 
k0); 1256 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 1257 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 1258 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 1259 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 1260 d30 = vis_fmul8x16au(vis_read_lo(s30), k3); 1261 d31 = vis_fmul8x16au(vis_read_hi(s31), k3); 1262 1263 d00 = vis_fpadd16(d00, d10); 1264 d20 = vis_fpadd16(d20, d30); 1265 d0 = vis_fpadd16(d0, d00); 1266 d0 = vis_fpadd16(d0, d20); 1267 d01 = vis_fpadd16(d01, d11); 1268 d21 = vis_fpadd16(d21, d31); 1269 d1 = vis_fpadd16(d1, d01); 1270 d1 = vis_fpadd16(d1, d21); 1271 buffd[2 * i] = d0; 1272 buffd[2 * i + 1] = d1; 1273 } 1274 1275 } 1276 else { 1277 1278 s01 = buff0[0]; 1279 s11 = buff1[0]; 1280 s21 = buff2[0]; 1281 s31 = buff3[0]; 1282#pragma pipeloop(0) 1283 for (i = 0; i < (xsize + 7) / 8; i++) { 1284 d0 = buffd[2 * i]; 1285 d1 = buffd[2 * i + 1]; 1286 1287 s00 = s01; 1288 s10 = s11; 1289 s20 = s21; 1290 s30 = s31; 1291 s01 = buff0[i + 1]; 1292 s11 = buff1[i + 1]; 1293 s21 = buff2[i + 1]; 1294 s31 = buff3[i + 1]; 1295 s0 = vis_faligndata(s00, s01); 1296 s1 = vis_faligndata(s10, s11); 1297 s2 = vis_faligndata(s20, s21); 1298 s3 = vis_faligndata(s30, s31); 1299 1300 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1301 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1302 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1303 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1304 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1305 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1306 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 1307 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 1308 1309 d00 = vis_fpadd16(d00, d10); 1310 d20 = vis_fpadd16(d20, d30); 1311 d0 = vis_fpadd16(d0, d00); 1312 d0 = vis_fpadd16(d0, d20); 1313 d01 = vis_fpadd16(d01, d11); 1314 d21 = vis_fpadd16(d21, d31); 1315 d1 = vis_fpadd16(d1, d01); 1316 d1 = vis_fpadd16(d1, d21); 1317 buffd[2 * i] = d0; 1318 buffd[2 * i + 1] = d1; 1319 } 1320 } 1321 } 1322 1323 pk += 4 * m; 1324 } 1325 } 1326 1327 /***************************************** 1328 
***************************************** 1329 ** Final iteration ** 1330 ***************************************** 1331 *****************************************/ 1332 1333 jk_size = n; 1334 1335 if (jk_size >= 6) 1336 jk_size = 4; 1337 if (jk_size == 5) 1338 jk_size = 3; 1339 1340 k0 = karr[ik_last]; 1341 k1 = karr[ik_last + m]; 1342 k2 = karr[ik_last + 2 * m]; 1343 k3 = karr[ik_last + 3 * m]; 1344 1345 off = ik_last; 1346 doff = off / 8; 1347 off &= 7; 1348 buff0 = buff[0] + doff; 1349 buff1 = buff[1] + doff; 1350 buff2 = buff[2] + doff; 1351 buff3 = buff[3] + doff; 1352 vis_write_gsr(gsr_scale + off); 1353 1354 if (jk_size == 1) { 1355 dp = buffe; 1356 1357 s01 = buff0[0]; 1358#pragma pipeloop(0) 1359 for (i = 0; i < (xsize + 7) / 8; i++) { 1360 s00 = s01; 1361 s01 = buff0[i + 1]; 1362 s0 = vis_faligndata(s00, s01); 1363 1364 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1365 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1366 1367 d0 = buffd[2 * i]; 1368 d1 = buffd[2 * i + 1]; 1369 d0 = vis_fpadd16(d0, d00); 1370 d1 = vis_fpadd16(d1, d01); 1371 1372 dd = vis_fpack16_pair(d0, d1); 1373 dp[i] = dd; 1374 1375 buffd[2 * i] = drnd; 1376 buffd[2 * i + 1] = drnd; 1377 } 1378 1379 } 1380 else if (jk_size == 2) { 1381 dp = buffe; 1382 1383 s01 = buff0[0]; 1384 s11 = buff1[0]; 1385#pragma pipeloop(0) 1386 for (i = 0; i < (xsize + 7) / 8; i++) { 1387 s00 = s01; 1388 s10 = s11; 1389 s01 = buff0[i + 1]; 1390 s11 = buff1[i + 1]; 1391 s0 = vis_faligndata(s00, s01); 1392 s1 = vis_faligndata(s10, s11); 1393 1394 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1395 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1396 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1397 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1398 1399 d0 = buffd[2 * i]; 1400 d1 = buffd[2 * i + 1]; 1401 d0 = vis_fpadd16(d0, d00); 1402 d0 = vis_fpadd16(d0, d10); 1403 d1 = vis_fpadd16(d1, d01); 1404 d1 = vis_fpadd16(d1, d11); 1405 1406 dd = vis_fpack16_pair(d0, d1); 1407 dp[i] = dd; 1408 1409 buffd[2 * i] = drnd; 1410 buffd[2 * i + 
1] = drnd; 1411 } 1412 1413 } 1414 else if (jk_size == 3) { 1415 1416 dp = buffe; 1417 1418 s01 = buff0[0]; 1419 s11 = buff1[0]; 1420 s21 = buff2[0]; 1421#pragma pipeloop(0) 1422 for (i = 0; i < (xsize + 7) / 8; i++) { 1423 s00 = s01; 1424 s10 = s11; 1425 s20 = s21; 1426 s01 = buff0[i + 1]; 1427 s11 = buff1[i + 1]; 1428 s21 = buff2[i + 1]; 1429 s0 = vis_faligndata(s00, s01); 1430 s1 = vis_faligndata(s10, s11); 1431 s2 = vis_faligndata(s20, s21); 1432 1433 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1434 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1435 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1436 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1437 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1438 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1439 1440 d0 = buffd[2 * i]; 1441 d1 = buffd[2 * i + 1]; 1442 d0 = vis_fpadd16(d0, d00); 1443 d0 = vis_fpadd16(d0, d10); 1444 d0 = vis_fpadd16(d0, d20); 1445 d1 = vis_fpadd16(d1, d01); 1446 d1 = vis_fpadd16(d1, d11); 1447 d1 = vis_fpadd16(d1, d21); 1448 1449 dd = vis_fpack16_pair(d0, d1); 1450 dp[i] = dd; 1451 1452 buffd[2 * i] = drnd; 1453 buffd[2 * i + 1] = drnd; 1454 } 1455 1456 } 1457 else { /* if (jk_size == 4) */ 1458 1459 dp = buffe; 1460 1461 s01 = buff0[0]; 1462 s11 = buff1[0]; 1463 s21 = buff2[0]; 1464 s31 = buff3[0]; 1465#pragma pipeloop(0) 1466 for (i = 0; i < (xsize + 7) / 8; i++) { 1467 s00 = s01; 1468 s10 = s11; 1469 s20 = s21; 1470 s30 = s31; 1471 s01 = buff0[i + 1]; 1472 s11 = buff1[i + 1]; 1473 s21 = buff2[i + 1]; 1474 s31 = buff3[i + 1]; 1475 s0 = vis_faligndata(s00, s01); 1476 s1 = vis_faligndata(s10, s11); 1477 s2 = vis_faligndata(s20, s21); 1478 s3 = vis_faligndata(s30, s31); 1479 1480 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 1481 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 1482 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 1483 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 1484 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 1485 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 1486 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 1487 
d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 1488 1489 d0 = buffd[2 * i]; 1490 d1 = buffd[2 * i + 1]; 1491 d0 = vis_fpadd16(d0, d00); 1492 d0 = vis_fpadd16(d0, d10); 1493 d0 = vis_fpadd16(d0, d20); 1494 d0 = vis_fpadd16(d0, d30); 1495 d1 = vis_fpadd16(d1, d01); 1496 d1 = vis_fpadd16(d1, d11); 1497 d1 = vis_fpadd16(d1, d21); 1498 d1 = vis_fpadd16(d1, d31); 1499 1500 dd = vis_fpack16_pair(d0, d1); 1501 dp[i] = dd; 1502 1503 buffd[2 * i] = drnd; 1504 buffd[2 * i + 1] = drnd; 1505 } 1506 } 1507 1508 (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan); 1509 1510 if (j < hgt - dy_b - 2) 1511 sl += sll; 1512 dl += dll; 1513 1514 buff_ind++; 1515 1516 if (buff_ind >= (n + 1)) 1517 buff_ind = 0; 1518 } 1519 1520 testchan <<= 1; 1521 } 1522 1523 mlib_free(pbuff); 1524 1525 if (buffs != buffs_local) 1526 mlib_free(buffs); 1527 1528 return MLIB_SUCCESS; 1529} 1530 1531/***************************************************************/ 1532