1/* 2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 28/* 29 * FUNCTION 30 * mlib_convMxN_8nw - convolve a 8-bit image, MxN kernel, 31 * edge = no write 32 * 33 * SYNOPSIS 34 * mlib_status mlib_convMxNnw_u8(mlib_image *dst, 35 * const mlib_image *src, 36 * mlib_s32 kwid, 37 * mlib_s32 khgt, 38 * mlib_s32 khw, 39 * mlib_s32 khh, 40 * const mlib_s32 *skernel, 41 * mlib_s32 discardbits, 42 * mlib_s32 cmask) 43 * 44 * ARGUMENT 45 * src Ptr to source image structure 46 * dst Ptr to destination image structure 47 * khgt Kernel height (# of rows) 48 * kwid Kernel width (# of cols) 49 * skernel Ptr to convolution kernel 50 * discardbits The number of LSBits of the 32-bit accumulator that 51 * are discarded when the 32-bit accumulator is converted 52 * to 16-bit output data; discardbits must be 1-15 (it 53 * cannot be zero). Same as exponent N for scalefac=2**N. 54 * cmask Channel mask to indicate the channels to be convolved. 55 * Each bit of which represents a channel in the image. The 56 * channels corresponded to 1 bits are those to be processed. 57 * 58 * DESCRIPTION 59 * A 2-D convolution (MxN kernel) for 8-bit images. 60 * 61 */ 62 63#include "vis_proto.h" 64#include "mlib_image.h" 65#include "mlib_ImageConv.h" 66#include "mlib_c_ImageConv.h" 67#include "mlib_v_ImageConv.h" 68#include "mlib_v_ImageChannelExtract.h" 69#include "mlib_v_ImageChannelInsert.h" 70 71/***************************************************************/ 72static mlib_status mlib_convMxN_8nw_mask(mlib_image *dst, 73 const mlib_image *src, 74 mlib_s32 m, 75 mlib_s32 n, 76 mlib_s32 dm, 77 mlib_s32 dn, 78 const mlib_s32 *kern, 79 mlib_s32 scale, 80 mlib_s32 cmask); 81 82/***************************************************************/ 83static const mlib_s32 mlib_round_8[16] = { 84 0x00400040, 0x00200020, 0x00100010, 0x00080008, 85 0x00040004, 0x00020002, 0x00010001, 0x00000000, 86 0x00000000, 0x00000000, 0x00000000, 0x00000000, 87 0x00000000, 0x00000000, 0x00000000, 0x00000000 88}; 89 90/***************************************************************/ 91mlib_status mlib_convMxNnw_u8(mlib_image *dst, 92 const mlib_image *src, 93 const mlib_s32 *kernel, 94 mlib_s32 kwid, 95 mlib_s32 khgt, 96 mlib_s32 khw, 97 mlib_s32 khh, 98 mlib_s32 discardbits, 99 mlib_s32 cmask) 100{ 101 mlib_s32 nchannel, amask; 102 103 if (mlib_ImageConvVersion(kwid, khgt, discardbits, MLIB_BYTE) == 0) 104 return mlib_c_convMxNnw_u8(dst, src, kernel, kwid, khgt, khw, khh, 105 discardbits, cmask); 106 107 nchannel = mlib_ImageGetChannels(src); 108 109 if (nchannel == 1) 110 cmask = 1; 111 amask = (1 << nchannel) - 1; 112 113 if ((cmask & amask) == amask) { 114 return mlib_convMxN_8nw_f(dst, src, kwid, khgt, khw, khh, kernel, discardbits); 115 } 116 else { 117 return mlib_convMxN_8nw_mask(dst, src, kwid, khgt, khw, khh, kernel, 118 discardbits, cmask); 119 } 120} 121 122#define MAX_N 11 123 124/***************************************************************/ 125mlib_status mlib_convMxN_8nw_mask(mlib_image *dst, 126 const mlib_image *src, 127 mlib_s32 m, 128 mlib_s32 n, 129 mlib_s32 dm, 130 mlib_s32 dn, 131 const mlib_s32 *kern, 132 mlib_s32 scale, 133 mlib_s32 cmask) 134{ 135 mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff; 136 mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe; 137 mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3; 138 mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31; 139 mlib_d64 dd, d0, d1; 140 mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff; 141 mlib_u8 *sl, *sp, *dl; 142 mlib_s32 hgt = mlib_ImageGetHeight(src); 143 mlib_s32 wid = mlib_ImageGetWidth(src); 144 mlib_s32 sll = mlib_ImageGetStride(src); 145 mlib_s32 dll = mlib_ImageGetStride(dst); 146 mlib_u8 *adr_src = (mlib_u8 *) mlib_ImageGetData(src); 147 mlib_u8 *adr_dst = (mlib_u8 *) mlib_ImageGetData(dst); 148 mlib_s32 ssize, xsize, dsize, esize, buff_ind; 149 mlib_d64 *pbuff, *dp; 150 mlib_f32 *karr = (mlib_f32 *) kern; 151 mlib_s32 gsr_scale = (31 - scale) << 3; 152 mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]); 153 mlib_s32 i, j, l, chan, testchan; 154 mlib_s32 nchan = mlib_ImageGetChannels(dst); 155 void (*p_proc_load) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); 156 void (*p_proc_store) (const mlib_u8 *, mlib_u8 *, mlib_s32, mlib_s32); 157 158 if (n > MAX_N) { 159 buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *)); 160 161 if (buffs == NULL) 162 return MLIB_FAILURE; 163 } 164 165 buff = buffs + 2 * (n + 1); 166 167 adr_dst += dn * dll + dm * nchan; 168 169 ssize = wid; 170 dsize = (ssize + 7) / 8; 171 esize = dsize + 4; 172 pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64)); 173 174 if (pbuff == NULL) { 175 if (buffs != buffs_local) 176 mlib_free(buffs); 177 return MLIB_FAILURE; 178 } 179 180 for (i = 0; i < (n + 1); i++) 181 buffs[i] = pbuff + i * esize; 182 for (i = 0; i < (n + 1); i++) 183 buffs[(n + 1) + i] = buffs[i]; 184 buffd = buffs[n] + esize; 185 buffe = buffd + 2 * esize; 186 187 hgt -= (n - 1); 188 xsize = ssize - (m - 1); 189 190 vis_write_gsr(gsr_scale + 7); 191 192 if (nchan == 2) { 193 p_proc_load = &mlib_v_ImageChannelExtract_U8_21_D1; 194 p_proc_store = &mlib_v_ImageChannelInsert_U8_12_D1; 195 } 196 else if (nchan == 3) { 197 p_proc_load = &mlib_v_ImageChannelExtract_U8_31_D1; 198 p_proc_store = &mlib_v_ImageChannelInsert_U8_13_D1; 199 } 200 else { 201 p_proc_load = &mlib_v_ImageChannelExtract_U8_41_D1; 202 p_proc_store = &mlib_v_ImageChannelInsert_U8_14_D1; 203 } 204 205 testchan = 1; 206 for (chan = 0; chan < nchan; chan++) { 207 buff_ind = 0; 208 sl = adr_src; 209 dl = adr_dst; 210 211 if ((cmask & testchan) == 0) { 212 testchan <<= 1; 213 continue; 214 } 215 216 for (l = 0; l < n; l++) { 217 mlib_d64 *buffn = buffs[l]; 218 sp = sl + l * sll; 219 220 (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan); 221 } 222 223 /* init buffer */ 224#pragma pipeloop(0) 225 for (i = 0; i < (xsize + 7) / 8; i++) { 226 buffd[2 * i] = drnd; 227 buffd[2 * i + 1] = drnd; 228 } 229 230 for (j = 0; j < hgt; j++) { 231 mlib_d64 **buffc = buffs + buff_ind; 232 mlib_f32 *pk = karr, k0, k1, k2, k3; 233 sp = sl + n * sll; 234 235 for (l = 0; l < n; l++) { 236 buff[l] = buffc[l]; 237 } 238 239 buffn = buffc[n]; 240 241 (*p_proc_load) ((mlib_u8 *) sp, (mlib_u8 *) buffn, ssize, testchan); 242 243 ik_last = (m - 1); 244 245 for (jk = 0; jk < n; jk += jk_size) { 246 jk_size = n - jk; 247 248 if (jk_size >= 6) 249 jk_size = 4; 250 251 if (jk_size == 5) 252 jk_size = 3; 253 254 coff = 0; 255 256 if (jk_size == 1) { 257 258 for (ik = 0; ik < m; ik++, coff++) { 259 if (!jk && ik == ik_last) 260 continue; 261 262 k0 = pk[ik]; 263 264 doff = coff / 8; 265 buff0 = buff[jk] + doff; 266 267 off = coff & 7; 268 vis_write_gsr(gsr_scale + off); 269 270 s01 = buff0[0]; 271#pragma pipeloop(0) 272 for (i = 0; i < (xsize + 7) / 8; i++) { 273 s00 = s01; 274 s01 = buff0[i + 1]; 275 s0 = vis_faligndata(s00, s01); 276 277 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 278 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 279 280 d0 = buffd[2 * i]; 281 d1 = buffd[2 * i + 1]; 282 d0 = vis_fpadd16(d00, d0); 283 d1 = vis_fpadd16(d01, d1); 284 buffd[2 * i] = d0; 285 buffd[2 * i + 1] = d1; 286 } 287 } 288 289 pk += m; 290 } 291 else if (jk_size == 2) { 292 293 for (ik = 0; ik < m; ik++, coff++) { 294 if (!jk && ik == ik_last) 295 continue; 296 297 k0 = pk[ik]; 298 k1 = pk[ik + m]; 299 300 doff = coff / 8; 301 buff0 = buff[jk] + doff; 302 buff1 = buff[jk + 1] + doff; 303 304 off = coff & 7; 305 vis_write_gsr(gsr_scale + off); 306 307 s01 = buff0[0]; 308 s11 = buff1[0]; 309#pragma pipeloop(0) 310 for (i = 0; i < (xsize + 7) / 8; i++) { 311 s00 = s01; 312 s10 = s11; 313 s01 = buff0[i + 1]; 314 s11 = buff1[i + 1]; 315 s0 = vis_faligndata(s00, s01); 316 s1 = vis_faligndata(s10, s11); 317 318 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 319 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 320 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 321 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 322 323 d0 = buffd[2 * i]; 324 d1 = buffd[2 * i + 1]; 325 d0 = vis_fpadd16(d00, d0); 326 d0 = vis_fpadd16(d10, d0); 327 d1 = vis_fpadd16(d01, d1); 328 d1 = vis_fpadd16(d11, d1); 329 buffd[2 * i] = d0; 330 buffd[2 * i + 1] = d1; 331 } 332 } 333 334 pk += 2 * m; 335 } 336 else if (jk_size == 3) { 337 338 for (ik = 0; ik < m; ik++, coff++) { 339 if (!jk && ik == ik_last) 340 continue; 341 342 k0 = pk[ik]; 343 k1 = pk[ik + m]; 344 k2 = pk[ik + 2 * m]; 345 346 doff = coff / 8; 347 buff0 = buff[jk] + doff; 348 buff1 = buff[jk + 1] + doff; 349 buff2 = buff[jk + 2] + doff; 350 351 off = coff & 7; 352 vis_write_gsr(gsr_scale + off); 353 354 if (off == 0) { 355#pragma pipeloop(0) 356 for (i = 0; i < (xsize + 7) / 8; i++) { 357 d0 = buffd[2 * i]; 358 d1 = buffd[2 * i + 1]; 359 360 s0 = buff0[i]; 361 s1 = buff1[i]; 362 s2 = buff2[i]; 363 364 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 365 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 366 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 367 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 368 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 369 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 370 371 d00 = vis_fpadd16(d00, d10); 372 d0 = vis_fpadd16(d20, d0); 373 d0 = vis_fpadd16(d00, d0); 374 d01 = vis_fpadd16(d01, d11); 375 d1 = vis_fpadd16(d21, d1); 376 d1 = vis_fpadd16(d01, d1); 377 buffd[2 * i] = d0; 378 buffd[2 * i + 1] = d1; 379 } 380 } 381 else if (off == 4) { 382 s01 = buff0[0]; 383 s11 = buff1[0]; 384 s21 = buff2[0]; 385#pragma pipeloop(0) 386 for (i = 0; i < (xsize + 7) / 8; i++) { 387 d0 = buffd[2 * i]; 388 d1 = buffd[2 * i + 1]; 389 390 s00 = s01; 391 s10 = s11; 392 s20 = s21; 393 s01 = buff0[i + 1]; 394 s11 = buff1[i + 1]; 395 s21 = buff2[i + 1]; 396 397 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 398 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 399 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 400 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 401 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 402 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 403 404 d00 = vis_fpadd16(d00, d10); 405 d0 = vis_fpadd16(d20, d0); 406 d0 = vis_fpadd16(d00, d0); 407 d01 = vis_fpadd16(d01, d11); 408 d1 = vis_fpadd16(d21, d1); 409 d1 = vis_fpadd16(d01, d1); 410 buffd[2 * i] = d0; 411 buffd[2 * i + 1] = d1; 412 } 413 } 414 else { 415 s01 = buff0[0]; 416 s11 = buff1[0]; 417 s21 = buff2[0]; 418#pragma pipeloop(0) 419 for (i = 0; i < (xsize + 7) / 8; i++) { 420 d0 = buffd[2 * i]; 421 d1 = buffd[2 * i + 1]; 422 423 s00 = s01; 424 s10 = s11; 425 s20 = s21; 426 s01 = buff0[i + 1]; 427 s11 = buff1[i + 1]; 428 s21 = buff2[i + 1]; 429 s0 = vis_faligndata(s00, s01); 430 s1 = vis_faligndata(s10, s11); 431 s2 = vis_faligndata(s20, s21); 432 433 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 434 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 435 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 436 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 437 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 438 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 439 440 d00 = vis_fpadd16(d00, d10); 441 d0 = vis_fpadd16(d20, d0); 442 d0 = vis_fpadd16(d00, d0); 443 d01 = vis_fpadd16(d01, d11); 444 d1 = vis_fpadd16(d21, d1); 445 d1 = vis_fpadd16(d01, d1); 446 buffd[2 * i] = d0; 447 buffd[2 * i + 1] = d1; 448 } 449 } 450 } 451 452 pk += 3 * m; 453 } 454 else { /* jk_size == 4 */ 455 456 for (ik = 0; ik < m; ik++, coff++) { 457 if (!jk && ik == ik_last) 458 continue; 459 460 k0 = pk[ik]; 461 k1 = pk[ik + m]; 462 k2 = pk[ik + 2 * m]; 463 k3 = pk[ik + 3 * m]; 464 465 doff = coff / 8; 466 buff0 = buff[jk] + doff; 467 buff1 = buff[jk + 1] + doff; 468 buff2 = buff[jk + 2] + doff; 469 buff3 = buff[jk + 3] + doff; 470 471 off = coff & 7; 472 vis_write_gsr(gsr_scale + off); 473 474 if (off == 0) { 475 476#pragma pipeloop(0) 477 for (i = 0; i < (xsize + 7) / 8; i++) { 478 d0 = buffd[2 * i]; 479 d1 = buffd[2 * i + 1]; 480 481 s0 = buff0[i]; 482 s1 = buff1[i]; 483 s2 = buff2[i]; 484 s3 = buff3[i]; 485 486 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 487 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 488 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 489 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 490 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 491 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 492 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 493 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 494 495 d00 = vis_fpadd16(d00, d10); 496 d20 = vis_fpadd16(d20, d30); 497 d0 = vis_fpadd16(d0, d00); 498 d0 = vis_fpadd16(d0, d20); 499 d01 = vis_fpadd16(d01, d11); 500 d21 = vis_fpadd16(d21, d31); 501 d1 = vis_fpadd16(d1, d01); 502 d1 = vis_fpadd16(d1, d21); 503 buffd[2 * i] = d0; 504 buffd[2 * i + 1] = d1; 505 } 506 } 507 else if (off == 4) { 508 509 s01 = buff0[0]; 510 s11 = buff1[0]; 511 s21 = buff2[0]; 512 s31 = buff3[0]; 513#pragma pipeloop(0) 514 for (i = 0; i < (xsize + 7) / 8; i++) { 515 d0 = buffd[2 * i]; 516 d1 = buffd[2 * i + 1]; 517 518 s00 = s01; 519 s10 = s11; 520 s20 = s21; 521 s30 = s31; 522 s01 = buff0[i + 1]; 523 s11 = buff1[i + 1]; 524 s21 = buff2[i + 1]; 525 s31 = buff3[i + 1]; 526 527 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 528 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 529 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 530 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 531 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 532 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 533 d30 = vis_fmul8x16au(vis_read_lo(s30), k3); 534 d31 = vis_fmul8x16au(vis_read_hi(s31), k3); 535 536 d00 = vis_fpadd16(d00, d10); 537 d20 = vis_fpadd16(d20, d30); 538 d0 = vis_fpadd16(d0, d00); 539 d0 = vis_fpadd16(d0, d20); 540 d01 = vis_fpadd16(d01, d11); 541 d21 = vis_fpadd16(d21, d31); 542 d1 = vis_fpadd16(d1, d01); 543 d1 = vis_fpadd16(d1, d21); 544 buffd[2 * i] = d0; 545 buffd[2 * i + 1] = d1; 546 } 547 } 548 else { 549 550 s01 = buff0[0]; 551 s11 = buff1[0]; 552 s21 = buff2[0]; 553 s31 = buff3[0]; 554#pragma pipeloop(0) 555 for (i = 0; i < (xsize + 7) / 8; i++) { 556 d0 = buffd[2 * i]; 557 d1 = buffd[2 * i + 1]; 558 559 s00 = s01; 560 s10 = s11; 561 s20 = s21; 562 s30 = s31; 563 s01 = buff0[i + 1]; 564 s11 = buff1[i + 1]; 565 s21 = buff2[i + 1]; 566 s31 = buff3[i + 1]; 567 s0 = vis_faligndata(s00, s01); 568 s1 = vis_faligndata(s10, s11); 569 s2 = vis_faligndata(s20, s21); 570 s3 = vis_faligndata(s30, s31); 571 572 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 573 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 574 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 575 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 576 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 577 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 578 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 579 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 580 581 d00 = vis_fpadd16(d00, d10); 582 d20 = vis_fpadd16(d20, d30); 583 d0 = vis_fpadd16(d0, d00); 584 d0 = vis_fpadd16(d0, d20); 585 d01 = vis_fpadd16(d01, d11); 586 d21 = vis_fpadd16(d21, d31); 587 d1 = vis_fpadd16(d1, d01); 588 d1 = vis_fpadd16(d1, d21); 589 buffd[2 * i] = d0; 590 buffd[2 * i + 1] = d1; 591 } 592 } 593 } 594 595 pk += 4 * m; 596 } 597 } 598 599 /***************************************** 600 ***************************************** 601 ** Final iteration ** 602 ***************************************** 603 *****************************************/ 604 605 jk_size = n; 606 607 if (jk_size >= 6) 608 jk_size = 4; 609 610 if (jk_size == 5) 611 jk_size = 3; 612 613 k0 = karr[ik_last]; 614 k1 = karr[ik_last + m]; 615 k2 = karr[ik_last + 2 * m]; 616 k3 = karr[ik_last + 3 * m]; 617 618 off = ik_last; 619 doff = off / 8; 620 off &= 7; 621 buff0 = buff[0] + doff; 622 buff1 = buff[1] + doff; 623 buff2 = buff[2] + doff; 624 buff3 = buff[3] + doff; 625 vis_write_gsr(gsr_scale + off); 626 627 if (jk_size == 1) { 628 dp = buffe; 629 630 s01 = buff0[0]; 631#pragma pipeloop(0) 632 for (i = 0; i < (xsize + 7) / 8; i++) { 633 s00 = s01; 634 s01 = buff0[i + 1]; 635 s0 = vis_faligndata(s00, s01); 636 637 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 638 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 639 640 d0 = buffd[2 * i]; 641 d1 = buffd[2 * i + 1]; 642 d0 = vis_fpadd16(d0, d00); 643 d1 = vis_fpadd16(d1, d01); 644 645 dd = vis_fpack16_pair(d0, d1); 646 dp[i] = dd; 647 648 buffd[2 * i] = drnd; 649 buffd[2 * i + 1] = drnd; 650 } 651 } 652 else if (jk_size == 2) { 653 dp = buffe; 654 655 s01 = buff0[0]; 656 s11 = buff1[0]; 657#pragma pipeloop(0) 658 for (i = 0; i < (xsize + 7) / 8; i++) { 659 s00 = s01; 660 s10 = s11; 661 s01 = buff0[i + 1]; 662 s11 = buff1[i + 1]; 663 s0 = vis_faligndata(s00, s01); 664 s1 = vis_faligndata(s10, s11); 665 666 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 667 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 668 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 669 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 670 671 d0 = buffd[2 * i]; 672 d1 = buffd[2 * i + 1]; 673 d0 = vis_fpadd16(d0, d00); 674 d0 = vis_fpadd16(d0, d10); 675 d1 = vis_fpadd16(d1, d01); 676 d1 = vis_fpadd16(d1, d11); 677 678 dd = vis_fpack16_pair(d0, d1); 679 dp[i] = dd; 680 681 buffd[2 * i] = drnd; 682 buffd[2 * i + 1] = drnd; 683 } 684 } 685 else if (jk_size == 3) { 686 687 dp = buffe; 688 689 s01 = buff0[0]; 690 s11 = buff1[0]; 691 s21 = buff2[0]; 692#pragma pipeloop(0) 693 for (i = 0; i < (xsize + 7) / 8; i++) { 694 s00 = s01; 695 s10 = s11; 696 s20 = s21; 697 s01 = buff0[i + 1]; 698 s11 = buff1[i + 1]; 699 s21 = buff2[i + 1]; 700 s0 = vis_faligndata(s00, s01); 701 s1 = vis_faligndata(s10, s11); 702 s2 = vis_faligndata(s20, s21); 703 704 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 705 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 706 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 707 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 708 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 709 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 710 711 d0 = buffd[2 * i]; 712 d1 = buffd[2 * i + 1]; 713 d0 = vis_fpadd16(d0, d00); 714 d0 = vis_fpadd16(d0, d10); 715 d0 = vis_fpadd16(d0, d20); 716 d1 = vis_fpadd16(d1, d01); 717 d1 = vis_fpadd16(d1, d11); 718 d1 = vis_fpadd16(d1, d21); 719 720 dd = vis_fpack16_pair(d0, d1); 721 dp[i] = dd; 722 723 buffd[2 * i] = drnd; 724 buffd[2 * i + 1] = drnd; 725 } 726 } 727 else { /* if (jk_size == 4) */ 728 729 dp = buffe; 730 731 s01 = buff0[0]; 732 s11 = buff1[0]; 733 s21 = buff2[0]; 734 s31 = buff3[0]; 735#pragma pipeloop(0) 736 for (i = 0; i < (xsize + 7) / 8; i++) { 737 s00 = s01; 738 s10 = s11; 739 s20 = s21; 740 s30 = s31; 741 s01 = buff0[i + 1]; 742 s11 = buff1[i + 1]; 743 s21 = buff2[i + 1]; 744 s31 = buff3[i + 1]; 745 s0 = vis_faligndata(s00, s01); 746 s1 = vis_faligndata(s10, s11); 747 s2 = vis_faligndata(s20, s21); 748 s3 = vis_faligndata(s30, s31); 749 750 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 751 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 752 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 753 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 754 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 755 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 756 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 757 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 758 759 d0 = buffd[2 * i]; 760 d1 = buffd[2 * i + 1]; 761 d0 = vis_fpadd16(d0, d00); 762 d0 = vis_fpadd16(d0, d10); 763 d0 = vis_fpadd16(d0, d20); 764 d0 = vis_fpadd16(d0, d30); 765 d1 = vis_fpadd16(d1, d01); 766 d1 = vis_fpadd16(d1, d11); 767 d1 = vis_fpadd16(d1, d21); 768 d1 = vis_fpadd16(d1, d31); 769 770 dd = vis_fpack16_pair(d0, d1); 771 dp[i] = dd; 772 773 buffd[2 * i] = drnd; 774 buffd[2 * i + 1] = drnd; 775 } 776 } 777 778 (*p_proc_store) ((mlib_u8 *) buffe, (mlib_u8 *) dl, xsize, testchan); 779 780 sl += sll; 781 dl += dll; 782 783 buff_ind++; 784 785 if (buff_ind >= (n + 1)) 786 buff_ind = 0; 787 } 788 789 testchan <<= 1; 790 } 791 792 mlib_free(pbuff); 793 794 if (buffs != buffs_local) 795 mlib_free(buffs); 796 797 return MLIB_SUCCESS; 798} 799 800/***************************************************************/ 801