1/* 2 * Copyright (c) 2000, 2003, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27 28/* 29 * FUNCTION 30 * Internal functions for mlib_ImageConv* on U8 type 31 * and MLIB_EDGE_DST_NO_WRITE mask 32 */ 33 34#include "vis_proto.h" 35#include "mlib_image.h" 36#include "mlib_ImageCheck.h" 37#include "mlib_ImageCopy.h" 38#include "mlib_ImageConv.h" 39#include "mlib_v_ImageConv.h" 40 41/***************************************************************/ 42#define DTYPE mlib_u8 43 44/***************************************************************/ 45#define NCHAN nchan 46 47/***************************************************************/ 48#define DEF_VARS \ 49 DTYPE *sl, *sp, *dl; \ 50 mlib_s32 hgt = mlib_ImageGetHeight(src); \ 51 mlib_s32 wid = mlib_ImageGetWidth(src); \ 52 mlib_s32 sll = mlib_ImageGetStride(src) / sizeof(DTYPE); \ 53 mlib_s32 dll = mlib_ImageGetStride(dst) / sizeof(DTYPE); \ 54 DTYPE *adr_src = (DTYPE *)mlib_ImageGetData(src); \ 55 DTYPE *adr_dst = (DTYPE *)mlib_ImageGetData(dst); \ 56 mlib_s32 ssize, xsize, dsize, esize, emask, buff_ind = 0; \ 57 mlib_d64 *pbuff, *dp; \ 58 mlib_f32 *karr = (mlib_f32 *)kern; \ 59 mlib_s32 gsr_scale = (31 - scale) << 3; \ 60 mlib_d64 drnd = vis_to_double_dup(mlib_round_8[31 - scale]); \ 61 mlib_s32 i, j, l 62 63/***************************************************************/ 64#define DEF_EXTRA_VARS \ 65 mlib_s32 nchan = mlib_ImageGetChannels(dst) 66 67/***************************************************************/ 68static const mlib_s32 mlib_round_8[16] = { 69 0x00400040, 0x00200020, 0x00100010, 0x00080008, 70 0x00040004, 0x00020002, 0x00010001, 0x00000000, 71 0x00000000, 0x00000000, 0x00000000, 0x00000000, 72 0x00000000, 0x00000000, 0x00000000, 0x00000000 73}; 74 75/***************************************************************/ 76#define MAX_N 11 77 78mlib_status mlib_convMxN_8nw_f(mlib_image *dst, 79 const mlib_image *src, 80 mlib_s32 m, 81 mlib_s32 n, 82 mlib_s32 dm, 83 mlib_s32 dn, 84 const mlib_s32 *kern, 85 mlib_s32 scale) 86{ 87 mlib_d64 *buffs_local[3 * (MAX_N + 1)], **buffs = buffs_local, **buff; 88 mlib_d64 *buff0, *buff1, *buff2, *buff3, *buffn, *buffd, *buffe; 89 mlib_d64 s00, s01, s10, s11, s20, s21, s30, s31, s0, s1, s2, s3; 90 mlib_d64 d00, d01, d10, d11, d20, d21, d30, d31; 91 mlib_d64 dd, d0, d1; 92 mlib_s32 ik, jk, ik_last, jk_size, coff, off, doff; 93 DEF_VARS; 94 DEF_EXTRA_VARS; 95 96 if (n > MAX_N) { 97 buffs = mlib_malloc(3 * (n + 1) * sizeof(mlib_d64 *)); 98 99 if (buffs == NULL) 100 return MLIB_FAILURE; 101 } 102 103 buff = buffs + 2 * (n + 1); 104 105 sl = adr_src; 106 dl = adr_dst + dn * dll + dm * NCHAN; 107 108 ssize = NCHAN * wid; 109 dsize = (ssize + 7) / 8; 110 esize = dsize + 4; 111 pbuff = mlib_malloc((n + 4) * esize * sizeof(mlib_d64)); 112 113 if (pbuff == NULL) { 114 if (buffs != buffs_local) 115 mlib_free(buffs); 116 return MLIB_FAILURE; 117 } 118 119 for (i = 0; i < (n + 1); i++) 120 buffs[i] = pbuff + i * esize; 121 for (i = 0; i < (n + 1); i++) 122 buffs[(n + 1) + i] = buffs[i]; 123 buffd = buffs[n] + esize; 124 buffe = buffd + 2 * esize; 125 126 wid -= (m - 1); 127 hgt -= (n - 1); 128 xsize = ssize - NCHAN * (m - 1); 129 emask = (0xFF00 >> (xsize & 7)) & 0xFF; 130 131 vis_write_gsr(gsr_scale + 7); 132 133 for (l = 0; l < n; l++) { 134 mlib_d64 *buffn = buffs[l]; 135 sp = sl + l * sll; 136 137 if ((mlib_addr) sp & 7) 138 mlib_ImageCopy_na((void *)sp, (void *)buffn, ssize); 139 } 140 141 /* init buffer */ 142#pragma pipeloop(0) 143 for (i = 0; i < (xsize + 7) / 8; i++) { 144 buffd[2 * i] = drnd; 145 buffd[2 * i + 1] = drnd; 146 } 147 148 for (j = 0; j < hgt; j++) { 149 mlib_d64 **buffc = buffs + buff_ind; 150 mlib_f32 *pk = karr, k0, k1, k2, k3; 151 sp = sl + n * sll; 152 153 for (l = 0; l < n; l++) { 154 buff[l] = buffc[l]; 155 } 156 157 buffn = buffc[n]; 158 159 for (l = 0; l < n; l++) { 160 if ((((mlib_addr) (sl + l * sll)) & 7) == 0) 161 buff[l] = (mlib_d64 *) (sl + l * sll); 162 } 163 164 if ((mlib_addr) sp & 7) 165 mlib_ImageCopy_na((void *)sp, (void *)buffn, ssize); 166 167 ik_last = (m - 1); 168 169 for (jk = 0; jk < n; jk += jk_size) { 170 jk_size = n - jk; 171 172 if (jk_size >= 6) 173 jk_size = 4; 174 175 if (jk_size == 5) 176 jk_size = 3; 177 178 coff = 0; 179 180 if (jk_size == 1) { 181 182 for (ik = 0; ik < m; ik++, coff += NCHAN) { 183 if (!jk && ik == ik_last) 184 continue; 185 186 k0 = pk[ik]; 187 188 doff = coff / 8; 189 buff0 = buff[jk] + doff; 190 191 off = coff & 7; 192 vis_write_gsr(gsr_scale + off); 193 194 s01 = buff0[0]; 195#pragma pipeloop(0) 196 for (i = 0; i < (xsize + 7) / 8; i++) { 197 s00 = s01; 198 s01 = buff0[i + 1]; 199 s0 = vis_faligndata(s00, s01); 200 201 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 202 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 203 204 d0 = buffd[2 * i]; 205 d1 = buffd[2 * i + 1]; 206 d0 = vis_fpadd16(d00, d0); 207 d1 = vis_fpadd16(d01, d1); 208 buffd[2 * i] = d0; 209 buffd[2 * i + 1] = d1; 210 } 211 } 212 213 pk += m; 214 } 215 else if (jk_size == 2) { 216 217 for (ik = 0; ik < m; ik++, coff += NCHAN) { 218 if (!jk && ik == ik_last) 219 continue; 220 221 k0 = pk[ik]; 222 k1 = pk[ik + m]; 223 224 doff = coff / 8; 225 buff0 = buff[jk] + doff; 226 buff1 = buff[jk + 1] + doff; 227 228 off = coff & 7; 229 vis_write_gsr(gsr_scale + off); 230 231 s01 = buff0[0]; 232 s11 = buff1[0]; 233#pragma pipeloop(0) 234 for (i = 0; i < (xsize + 7) / 8; i++) { 235 s00 = s01; 236 s10 = s11; 237 s01 = buff0[i + 1]; 238 s11 = buff1[i + 1]; 239 s0 = vis_faligndata(s00, s01); 240 s1 = vis_faligndata(s10, s11); 241 242 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 243 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 244 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 245 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 246 247 d0 = buffd[2 * i]; 248 d1 = buffd[2 * i + 1]; 249 d0 = vis_fpadd16(d00, d0); 250 d0 = vis_fpadd16(d10, d0); 251 d1 = vis_fpadd16(d01, d1); 252 d1 = vis_fpadd16(d11, d1); 253 buffd[2 * i] = d0; 254 buffd[2 * i + 1] = d1; 255 } 256 } 257 258 pk += 2 * m; 259 } 260 else if (jk_size == 3) { 261 262 for (ik = 0; ik < m; ik++, coff += NCHAN) { 263 if (!jk && ik == ik_last) 264 continue; 265 266 k0 = pk[ik]; 267 k1 = pk[ik + m]; 268 k2 = pk[ik + 2 * m]; 269 270 doff = coff / 8; 271 buff0 = buff[jk] + doff; 272 buff1 = buff[jk + 1] + doff; 273 buff2 = buff[jk + 2] + doff; 274 275 off = coff & 7; 276 vis_write_gsr(gsr_scale + off); 277 278 if (off == 0) { 279#pragma pipeloop(0) 280 for (i = 0; i < (xsize + 7) / 8; i++) { 281 d0 = buffd[2 * i]; 282 d1 = buffd[2 * i + 1]; 283 284 s0 = buff0[i]; 285 s1 = buff1[i]; 286 s2 = buff2[i]; 287 288 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 289 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 290 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 291 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 292 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 293 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 294 295 d00 = vis_fpadd16(d00, d10); 296 d0 = vis_fpadd16(d20, d0); 297 d0 = vis_fpadd16(d00, d0); 298 d01 = vis_fpadd16(d01, d11); 299 d1 = vis_fpadd16(d21, d1); 300 d1 = vis_fpadd16(d01, d1); 301 buffd[2 * i] = d0; 302 buffd[2 * i + 1] = d1; 303 } 304 } 305 else if (off == 4) { 306 s01 = buff0[0]; 307 s11 = buff1[0]; 308 s21 = buff2[0]; 309#pragma pipeloop(0) 310 for (i = 0; i < (xsize + 7) / 8; i++) { 311 d0 = buffd[2 * i]; 312 d1 = buffd[2 * i + 1]; 313 314 s00 = s01; 315 s10 = s11; 316 s20 = s21; 317 s01 = buff0[i + 1]; 318 s11 = buff1[i + 1]; 319 s21 = buff2[i + 1]; 320 321 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 322 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 323 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 324 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 325 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 326 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 327 328 d00 = vis_fpadd16(d00, d10); 329 d0 = vis_fpadd16(d20, d0); 330 d0 = vis_fpadd16(d00, d0); 331 d01 = vis_fpadd16(d01, d11); 332 d1 = vis_fpadd16(d21, d1); 333 d1 = vis_fpadd16(d01, d1); 334 buffd[2 * i] = d0; 335 buffd[2 * i + 1] = d1; 336 } 337 } 338 else { 339 s01 = buff0[0]; 340 s11 = buff1[0]; 341 s21 = buff2[0]; 342#pragma pipeloop(0) 343 for (i = 0; i < (xsize + 7) / 8; i++) { 344 d0 = buffd[2 * i]; 345 d1 = buffd[2 * i + 1]; 346 347 s00 = s01; 348 s10 = s11; 349 s20 = s21; 350 s01 = buff0[i + 1]; 351 s11 = buff1[i + 1]; 352 s21 = buff2[i + 1]; 353 s0 = vis_faligndata(s00, s01); 354 s1 = vis_faligndata(s10, s11); 355 s2 = vis_faligndata(s20, s21); 356 357 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 358 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 359 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 360 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 361 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 362 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 363 364 d00 = vis_fpadd16(d00, d10); 365 d0 = vis_fpadd16(d20, d0); 366 d0 = vis_fpadd16(d00, d0); 367 d01 = vis_fpadd16(d01, d11); 368 d1 = vis_fpadd16(d21, d1); 369 d1 = vis_fpadd16(d01, d1); 370 buffd[2 * i] = d0; 371 buffd[2 * i + 1] = d1; 372 } 373 } 374 } 375 376 pk += 3 * m; 377 } 378 else { /* jk_size == 4 */ 379 380 for (ik = 0; ik < m; ik++, coff += NCHAN) { 381 if (!jk && ik == ik_last) 382 continue; 383 384 k0 = pk[ik]; 385 k1 = pk[ik + m]; 386 k2 = pk[ik + 2 * m]; 387 k3 = pk[ik + 3 * m]; 388 389 doff = coff / 8; 390 buff0 = buff[jk] + doff; 391 buff1 = buff[jk + 1] + doff; 392 buff2 = buff[jk + 2] + doff; 393 buff3 = buff[jk + 3] + doff; 394 395 off = coff & 7; 396 vis_write_gsr(gsr_scale + off); 397 398 if (off == 0) { 399 400#pragma pipeloop(0) 401 for (i = 0; i < (xsize + 7) / 8; i++) { 402 d0 = buffd[2 * i]; 403 d1 = buffd[2 * i + 1]; 404 405 s0 = buff0[i]; 406 s1 = buff1[i]; 407 s2 = buff2[i]; 408 s3 = buff3[i]; 409 410 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 411 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 412 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 413 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 414 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 415 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 416 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 417 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 418 419 d00 = vis_fpadd16(d00, d10); 420 d20 = vis_fpadd16(d20, d30); 421 d0 = vis_fpadd16(d0, d00); 422 d0 = vis_fpadd16(d0, d20); 423 d01 = vis_fpadd16(d01, d11); 424 d21 = vis_fpadd16(d21, d31); 425 d1 = vis_fpadd16(d1, d01); 426 d1 = vis_fpadd16(d1, d21); 427 buffd[2 * i] = d0; 428 buffd[2 * i + 1] = d1; 429 } 430 } 431 else if (off == 4) { 432 433 s01 = buff0[0]; 434 s11 = buff1[0]; 435 s21 = buff2[0]; 436 s31 = buff3[0]; 437#pragma pipeloop(0) 438 for (i = 0; i < (xsize + 7) / 8; i++) { 439 d0 = buffd[2 * i]; 440 d1 = buffd[2 * i + 1]; 441 442 s00 = s01; 443 s10 = s11; 444 s20 = s21; 445 s30 = s31; 446 s01 = buff0[i + 1]; 447 s11 = buff1[i + 1]; 448 s21 = buff2[i + 1]; 449 s31 = buff3[i + 1]; 450 451 d00 = vis_fmul8x16au(vis_read_lo(s00), k0); 452 d01 = vis_fmul8x16au(vis_read_hi(s01), k0); 453 d10 = vis_fmul8x16au(vis_read_lo(s10), k1); 454 d11 = vis_fmul8x16au(vis_read_hi(s11), k1); 455 d20 = vis_fmul8x16au(vis_read_lo(s20), k2); 456 d21 = vis_fmul8x16au(vis_read_hi(s21), k2); 457 d30 = vis_fmul8x16au(vis_read_lo(s30), k3); 458 d31 = vis_fmul8x16au(vis_read_hi(s31), k3); 459 460 d00 = vis_fpadd16(d00, d10); 461 d20 = vis_fpadd16(d20, d30); 462 d0 = vis_fpadd16(d0, d00); 463 d0 = vis_fpadd16(d0, d20); 464 d01 = vis_fpadd16(d01, d11); 465 d21 = vis_fpadd16(d21, d31); 466 d1 = vis_fpadd16(d1, d01); 467 d1 = vis_fpadd16(d1, d21); 468 buffd[2 * i] = d0; 469 buffd[2 * i + 1] = d1; 470 } 471 } 472 else { 473 474 s01 = buff0[0]; 475 s11 = buff1[0]; 476 s21 = buff2[0]; 477 s31 = buff3[0]; 478#pragma pipeloop(0) 479 for (i = 0; i < (xsize + 7) / 8; i++) { 480 d0 = buffd[2 * i]; 481 d1 = buffd[2 * i + 1]; 482 483 s00 = s01; 484 s10 = s11; 485 s20 = s21; 486 s30 = s31; 487 s01 = buff0[i + 1]; 488 s11 = buff1[i + 1]; 489 s21 = buff2[i + 1]; 490 s31 = buff3[i + 1]; 491 s0 = vis_faligndata(s00, s01); 492 s1 = vis_faligndata(s10, s11); 493 s2 = vis_faligndata(s20, s21); 494 s3 = vis_faligndata(s30, s31); 495 496 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 497 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 498 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 499 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 500 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 501 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 502 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 503 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 504 505 d00 = vis_fpadd16(d00, d10); 506 d20 = vis_fpadd16(d20, d30); 507 d0 = vis_fpadd16(d0, d00); 508 d0 = vis_fpadd16(d0, d20); 509 d01 = vis_fpadd16(d01, d11); 510 d21 = vis_fpadd16(d21, d31); 511 d1 = vis_fpadd16(d1, d01); 512 d1 = vis_fpadd16(d1, d21); 513 buffd[2 * i] = d0; 514 buffd[2 * i + 1] = d1; 515 } 516 } 517 } 518 519 pk += 4 * m; 520 } 521 } 522 523 /***************************************** 524 ***************************************** 525 ** Final iteration ** 526 ***************************************** 527 *****************************************/ 528 529 jk_size = n; 530 531 if (jk_size >= 6) 532 jk_size = 4; 533 534 if (jk_size == 5) 535 jk_size = 3; 536 537 k0 = karr[ik_last]; 538 k1 = karr[ik_last + m]; 539 k2 = karr[ik_last + 2 * m]; 540 k3 = karr[ik_last + 3 * m]; 541 542 off = ik_last * NCHAN; 543 doff = off / 8; 544 off &= 7; 545 buff0 = buff[0] + doff; 546 buff1 = buff[1] + doff; 547 buff2 = buff[2] + doff; 548 buff3 = buff[3] + doff; 549 vis_write_gsr(gsr_scale + off); 550 551 if (jk_size == 1) { 552 dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl; 553 554 s01 = buff0[0]; 555#pragma pipeloop(0) 556 for (i = 0; i < xsize / 8; i++) { 557 s00 = s01; 558 s01 = buff0[i + 1]; 559 s0 = vis_faligndata(s00, s01); 560 561 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 562 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 563 564 d0 = buffd[2 * i]; 565 d1 = buffd[2 * i + 1]; 566 d0 = vis_fpadd16(d0, d00); 567 d1 = vis_fpadd16(d1, d01); 568 569 dd = vis_fpack16_pair(d0, d1); 570 dp[i] = dd; 571 572 buffd[2 * i] = drnd; 573 buffd[2 * i + 1] = drnd; 574 } 575 576 if (emask) { 577 s00 = s01; 578 s01 = buff0[i + 1]; 579 s0 = vis_faligndata(s00, s01); 580 581 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 582 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 583 584 d0 = buffd[2 * i]; 585 d1 = buffd[2 * i + 1]; 586 d0 = vis_fpadd16(d0, d00); 587 d1 = vis_fpadd16(d1, d01); 588 589 dd = vis_fpack16_pair(d0, d1); 590 vis_pst_8(dd, dp + i, emask); 591 592 buffd[2 * i] = drnd; 593 buffd[2 * i + 1] = drnd; 594 } 595 596 if ((mlib_u8 *) dp != dl) 597 mlib_ImageCopy_na((void *)buffe, dl, xsize); 598 } 599 else if (jk_size == 2) { 600 dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl; 601 602 s01 = buff0[0]; 603 s11 = buff1[0]; 604#pragma pipeloop(0) 605 for (i = 0; i < xsize / 8; i++) { 606 s00 = s01; 607 s10 = s11; 608 s01 = buff0[i + 1]; 609 s11 = buff1[i + 1]; 610 s0 = vis_faligndata(s00, s01); 611 s1 = vis_faligndata(s10, s11); 612 613 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 614 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 615 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 616 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 617 618 d0 = buffd[2 * i]; 619 d1 = buffd[2 * i + 1]; 620 d0 = vis_fpadd16(d0, d00); 621 d0 = vis_fpadd16(d0, d10); 622 d1 = vis_fpadd16(d1, d01); 623 d1 = vis_fpadd16(d1, d11); 624 625 dd = vis_fpack16_pair(d0, d1); 626 dp[i] = dd; 627 628 buffd[2 * i] = drnd; 629 buffd[2 * i + 1] = drnd; 630 } 631 632 if (emask) { 633 s00 = s01; 634 s10 = s11; 635 s01 = buff0[i + 1]; 636 s11 = buff1[i + 1]; 637 s0 = vis_faligndata(s00, s01); 638 s1 = vis_faligndata(s10, s11); 639 640 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 641 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 642 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 643 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 644 645 d0 = buffd[2 * i]; 646 d1 = buffd[2 * i + 1]; 647 d0 = vis_fpadd16(d0, d00); 648 d0 = vis_fpadd16(d0, d10); 649 d1 = vis_fpadd16(d1, d01); 650 d1 = vis_fpadd16(d1, d11); 651 652 dd = vis_fpack16_pair(d0, d1); 653 vis_pst_8(dd, dp + i, emask); 654 655 buffd[2 * i] = drnd; 656 buffd[2 * i + 1] = drnd; 657 } 658 659 if ((mlib_u8 *) dp != dl) 660 mlib_ImageCopy_na((void *)buffe, dl, xsize); 661 } 662 else if (jk_size == 3) { 663 664 dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl; 665 666 s01 = buff0[0]; 667 s11 = buff1[0]; 668 s21 = buff2[0]; 669#pragma pipeloop(0) 670 for (i = 0; i < xsize / 8; i++) { 671 s00 = s01; 672 s10 = s11; 673 s20 = s21; 674 s01 = buff0[i + 1]; 675 s11 = buff1[i + 1]; 676 s21 = buff2[i + 1]; 677 s0 = vis_faligndata(s00, s01); 678 s1 = vis_faligndata(s10, s11); 679 s2 = vis_faligndata(s20, s21); 680 681 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 682 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 683 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 684 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 685 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 686 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 687 688 d0 = buffd[2 * i]; 689 d1 = buffd[2 * i + 1]; 690 d0 = vis_fpadd16(d0, d00); 691 d0 = vis_fpadd16(d0, d10); 692 d0 = vis_fpadd16(d0, d20); 693 d1 = vis_fpadd16(d1, d01); 694 d1 = vis_fpadd16(d1, d11); 695 d1 = vis_fpadd16(d1, d21); 696 697 dd = vis_fpack16_pair(d0, d1); 698 dp[i] = dd; 699 700 buffd[2 * i] = drnd; 701 buffd[2 * i + 1] = drnd; 702 } 703 704 if (emask) { 705 s00 = s01; 706 s10 = s11; 707 s20 = s21; 708 s01 = buff0[i + 1]; 709 s11 = buff1[i + 1]; 710 s21 = buff2[i + 1]; 711 s0 = vis_faligndata(s00, s01); 712 s1 = vis_faligndata(s10, s11); 713 s2 = vis_faligndata(s20, s21); 714 715 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 716 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 717 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 718 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 719 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 720 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 721 722 d0 = buffd[2 * i]; 723 d1 = buffd[2 * i + 1]; 724 d0 = vis_fpadd16(d0, d00); 725 d0 = vis_fpadd16(d0, d10); 726 d0 = vis_fpadd16(d0, d20); 727 d1 = vis_fpadd16(d1, d01); 728 d1 = vis_fpadd16(d1, d11); 729 d1 = vis_fpadd16(d1, d21); 730 731 dd = vis_fpack16_pair(d0, d1); 732 vis_pst_8(dd, dp + i, emask); 733 734 buffd[2 * i] = drnd; 735 buffd[2 * i + 1] = drnd; 736 } 737 738 if ((mlib_u8 *) dp != dl) 739 mlib_ImageCopy_na((void *)buffe, dl, xsize); 740 } 741 else { /* if (jk_size == 4) */ 742 743 dp = ((mlib_addr) dl & 7) ? buffe : (mlib_d64 *) dl; 744 745 s01 = buff0[0]; 746 s11 = buff1[0]; 747 s21 = buff2[0]; 748 s31 = buff3[0]; 749#pragma pipeloop(0) 750 for (i = 0; i < xsize / 8; i++) { 751 s00 = s01; 752 s10 = s11; 753 s20 = s21; 754 s30 = s31; 755 s01 = buff0[i + 1]; 756 s11 = buff1[i + 1]; 757 s21 = buff2[i + 1]; 758 s31 = buff3[i + 1]; 759 s0 = vis_faligndata(s00, s01); 760 s1 = vis_faligndata(s10, s11); 761 s2 = vis_faligndata(s20, s21); 762 s3 = vis_faligndata(s30, s31); 763 764 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 765 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 766 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 767 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 768 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 769 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 770 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 771 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 772 773 d0 = buffd[2 * i]; 774 d1 = buffd[2 * i + 1]; 775 d0 = vis_fpadd16(d0, d00); 776 d0 = vis_fpadd16(d0, d10); 777 d0 = vis_fpadd16(d0, d20); 778 d0 = vis_fpadd16(d0, d30); 779 d1 = vis_fpadd16(d1, d01); 780 d1 = vis_fpadd16(d1, d11); 781 d1 = vis_fpadd16(d1, d21); 782 d1 = vis_fpadd16(d1, d31); 783 784 dd = vis_fpack16_pair(d0, d1); 785 dp[i] = dd; 786 787 buffd[2 * i] = drnd; 788 buffd[2 * i + 1] = drnd; 789 } 790 791 if (emask) { 792 s00 = s01; 793 s10 = s11; 794 s20 = s21; 795 s30 = s31; 796 s01 = buff0[i + 1]; 797 s11 = buff1[i + 1]; 798 s21 = buff2[i + 1]; 799 s31 = buff3[i + 1]; 800 s0 = vis_faligndata(s00, s01); 801 s1 = vis_faligndata(s10, s11); 802 s2 = vis_faligndata(s20, s21); 803 s3 = vis_faligndata(s30, s31); 804 805 d00 = vis_fmul8x16au(vis_read_hi(s0), k0); 806 d01 = vis_fmul8x16au(vis_read_lo(s0), k0); 807 d10 = vis_fmul8x16au(vis_read_hi(s1), k1); 808 d11 = vis_fmul8x16au(vis_read_lo(s1), k1); 809 d20 = vis_fmul8x16au(vis_read_hi(s2), k2); 810 d21 = vis_fmul8x16au(vis_read_lo(s2), k2); 811 d30 = vis_fmul8x16au(vis_read_hi(s3), k3); 812 d31 = vis_fmul8x16au(vis_read_lo(s3), k3); 813 814 d0 = buffd[2 * i]; 815 d1 = buffd[2 * i + 1]; 816 d0 = vis_fpadd16(d0, d00); 817 d0 = vis_fpadd16(d0, d10); 818 d0 = vis_fpadd16(d0, d20); 819 d0 = vis_fpadd16(d0, d30); 820 d1 = vis_fpadd16(d1, d01); 821 d1 = vis_fpadd16(d1, d11); 822 d1 = vis_fpadd16(d1, d21); 823 d1 = vis_fpadd16(d1, d31); 824 825 dd = vis_fpack16_pair(d0, d1); 826 vis_pst_8(dd, dp + i, emask); 827 828 buffd[2 * i] = drnd; 829 buffd[2 * i + 1] = drnd; 830 } 831 832 if ((mlib_u8 *) dp != dl) 833 mlib_ImageCopy_na((void *)buffe, dl, xsize); 834 } 835 836 sl += sll; 837 dl += dll; 838 839 buff_ind++; 840 841 if (buff_ind >= (n + 1)) 842 buff_ind = 0; 843 } 844 845 mlib_free(pbuff); 846 847 if (buffs != buffs_local) 848 mlib_free(buffs); 849 850 return MLIB_SUCCESS; 851} 852 853/***************************************************************/ 854