1/* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 27/* 28 * FUNCTION 29 * Internal functions for mlib_ImageConv* on U8/S16/U16 types and 30 * MLIB_EDGE_DST_NO_WRITE mask 31 */ 32 33#include "mlib_image.h" 34#include "mlib_ImageConv.h" 35#include "mlib_c_ImageConv.h" 36 37/* 38 This define switches between functions of different data types 39*/ 40#define IMG_TYPE 1 41 42/***************************************************************/ 43#if IMG_TYPE == 1 44 45#define DTYPE mlib_u8 46#define CONV_FUNC(KERN) mlib_c_conv##KERN##nw_u8 47#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u8 48#define DSCALE (1 << 24) 49#define FROM_S32(x) (((x) >> 24) ^ 128) 50#define S64TOS32(x) (x) 51#define SAT_OFF -(1u << 31) 52 53#elif IMG_TYPE == 2 54 55#define DTYPE mlib_s16 56#define CONV_FUNC(KERN) mlib_conv##KERN##nw_s16 57#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_s16 58#define DSCALE 65536.0 59#define FROM_S32(x) ((x) >> 16) 60#define S64TOS32(x) ((x) & 0xffffffff) 61#define SAT_OFF 62 63#elif IMG_TYPE == 3 64 65#define DTYPE mlib_u16 66#define CONV_FUNC(KERN) mlib_conv##KERN##nw_u16 67#define CONV_FUNC_I(KERN) mlib_i_conv##KERN##nw_u16 68#define DSCALE 65536.0 69#define FROM_S32(x) (((x) >> 16) ^ 0x8000) 70#define S64TOS32(x) (x) 71#define SAT_OFF -(1u << 31) 72 73#endif /* IMG_TYPE == 1 */ 74 75/***************************************************************/ 76#define BUFF_SIZE 1600 77 78#define CACHE_SIZE (64*1024) 79 80/***************************************************************/ 81#define FTYPE mlib_d64 82 83#ifndef MLIB_USE_FTOI_CLAMPING 84 85#define CLAMP_S32(x) \ 86 (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x))) 87 88#else 89 90#define CLAMP_S32(x) ((mlib_s32)(x)) 91 92#endif /* MLIB_USE_FTOI_CLAMPING */ 93 94/***************************************************************/ 95#define D2I(x) CLAMP_S32((x) SAT_OFF) 96 97/***************************************************************/ 98#ifdef _LITTLE_ENDIAN 99 100#define STORE2(res0, res1) \ 101 dp[0 ] = res1; \ 102 dp[chan1] = res0 103 104#else 105 106#define STORE2(res0, res1) \ 107 dp[0 ] = res0; \ 108 dp[chan1] = res1 109 110#endif /* _LITTLE_ENDIAN */ 111 112/***************************************************************/ 113#ifdef _NO_LONGLONG 114 115#define LOAD_BUFF(buff) \ 116 buff[i ] = sp[0]; \ 117 buff[i + 1] = sp[chan1] 118 119#else /* _NO_LONGLONG */ 120 121#ifdef _LITTLE_ENDIAN 122 123#define LOAD_BUFF(buff) \ 124 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0]) 125 126#else /* _LITTLE_ENDIAN */ 127 128#define LOAD_BUFF(buff) \ 129 *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1]) 130 131#endif /* _LITTLE_ENDIAN */ 132#endif /* _NO_LONGLONG */ 133 134/***************************************************************/ 135typedef union { 136 mlib_d64 d64; 137 struct { 138 mlib_s32 i0; 139 mlib_s32 i1; 140 } i32s; 141 struct { 142 mlib_s32 f0; 143 mlib_s32 f1; 144 } f32s; 145} d64_2x32; 146 147/***************************************************************/ 148#define DEF_VARS(type) \ 149 type *adr_src, *sl, *sp = NULL; \ 150 type *adr_dst, *dl, *dp = NULL; \ 151 FTYPE *pbuff = buff; \ 152 mlib_s32 wid, hgt, sll, dll; \ 153 mlib_s32 nchannel, chan1; \ 154 mlib_s32 i, j, c 155 156/***************************************************************/ 157#define GET_SRC_DST_PARAMETERS(type) \ 158 hgt = mlib_ImageGetHeight(src); \ 159 wid = mlib_ImageGetWidth(src); \ 160 nchannel = mlib_ImageGetChannels(src); \ 161 sll = mlib_ImageGetStride(src) / sizeof(type); \ 162 dll = mlib_ImageGetStride(dst) / sizeof(type); \ 163 adr_src = (type *)mlib_ImageGetData(src); \ 164 adr_dst = (type *)mlib_ImageGetData(dst) 165 166/***************************************************************/ 167#ifndef __sparc 168 169#if IMG_TYPE == 1 170 171/* Test for the presence of any "1" bit in bits 172 8 to 31 of val. If present, then val is either 173 negative or >255. If over/underflows of 8 bits 174 are uncommon, then this technique can be a win, 175 since only a single test, rather than two, is 176 necessary to determine if clamping is needed. 177 On the other hand, if over/underflows are common, 178 it adds an extra test. 179*/ 180#define CLAMP_STORE(dst, val) \ 181 if (val & 0xffffff00) { \ 182 if (val < MLIB_U8_MIN) \ 183 dst = MLIB_U8_MIN; \ 184 else \ 185 dst = MLIB_U8_MAX; \ 186 } else { \ 187 dst = (mlib_u8)val; \ 188 } 189 190#elif IMG_TYPE == 2 191 192#define CLAMP_STORE(dst, val) \ 193 if (val >= MLIB_S16_MAX) \ 194 dst = MLIB_S16_MAX; \ 195 else if (val <= MLIB_S16_MIN) \ 196 dst = MLIB_S16_MIN; \ 197 else \ 198 dst = (mlib_s16)val 199 200#elif IMG_TYPE == 3 201 202#define CLAMP_STORE(dst, val) \ 203 if (val >= MLIB_U16_MAX) \ 204 dst = MLIB_U16_MAX; \ 205 else if (val <= MLIB_U16_MIN) \ 206 dst = MLIB_U16_MIN; \ 207 else \ 208 dst = (mlib_u16)val 209 210#endif /* IMG_TYPE == 1 */ 211#endif /* __sparc */ 212 213/***************************************************************/ 214#define MAX_KER 7 215#define MAX_N 15 216 217static mlib_status mlib_ImageConv1xN(mlib_image *dst, 218 const mlib_image *src, 219 const mlib_d64 *k, 220 mlib_s32 n, 221 mlib_s32 dn, 222 mlib_s32 cmask) 223{ 224 FTYPE buff[BUFF_SIZE]; 225 mlib_s32 off, kh; 226 mlib_s32 d0, d1; 227 const FTYPE *pk; 228 FTYPE k0, k1, k2, k3; 229 FTYPE p0, p1, p2, p3, p4; 230 DEF_VARS(DTYPE); 231 DTYPE *sl_c, *dl_c, *sl0; 232 mlib_s32 l, hsize, max_hsize; 233 GET_SRC_DST_PARAMETERS(DTYPE); 234 235 hgt -= (n - 1); 236 adr_dst += dn*dll; 237 238 max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll; 239 240 if (!max_hsize) max_hsize = 1; 241 242 if (max_hsize > BUFF_SIZE) { 243 pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize); 244 } 245 246 chan1 = nchannel; 247 248 sl_c = adr_src; 249 dl_c = adr_dst; 250 251 for (l = 0; l < hgt; l += hsize) { 252 hsize = hgt - l; 253 254 if (hsize > max_hsize) hsize = max_hsize; 255 256 for (c = 0; c < nchannel; c++) { 257 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 258 259 sl = sl_c + c; 260 dl = dl_c + c; 261 262#ifdef __SUNPRO_C 263#pragma pipeloop(0) 264#endif /* __SUNPRO_C */ 265 for (j = 0; j < hsize; j++) pbuff[j] = 0.0; 266 267 for (i = 0; i < wid; i++) { 268 sl0 = sl; 269 270 for (off = 0; off < (n - 4); off += 4) { 271 pk = k + off; 272 sp = sl0; 273 274 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 275 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; 276 sp += 3*sll; 277 278#ifdef __SUNPRO_C 279#pragma pipeloop(0) 280#endif /* __SUNPRO_C */ 281 for (j = 0; j < hsize; j += 2) { 282 p0 = p2; p1 = p3; p2 = p4; 283 p3 = sp[0]; 284 p4 = sp[sll]; 285 286 pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 287 pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 288 289 sp += 2*sll; 290 } 291 292 sl0 += 4*sll; 293 } 294 295 pk = k + off; 296 sp = sl0; 297 298 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 299 p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; 300 301 dp = dl; 302 kh = n - off; 303 304 if (kh == 4) { 305 sp += 3*sll; 306 307#ifdef __SUNPRO_C 308#pragma pipeloop(0) 309#endif /* __SUNPRO_C */ 310 for (j = 0; j <= (hsize - 2); j += 2) { 311 p0 = p2; p1 = p3; p2 = p4; 312 p3 = sp[0]; 313 p4 = sp[sll]; 314 315 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); 316 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]); 317 318 dp[0 ] = FROM_S32(d0); 319 dp[dll] = FROM_S32(d1); 320 321 pbuff[j] = 0; 322 pbuff[j + 1] = 0; 323 324 sp += 2*sll; 325 dp += 2*dll; 326 } 327 328 if (j < hsize) { 329 p0 = p2; p1 = p3; p2 = p4; 330 p3 = sp[0]; 331 332 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); 333 334 pbuff[j] = 0; 335 336 dp[0] = FROM_S32(d0); 337 } 338 339 } else if (kh == 3) { 340 sp += 2*sll; 341 342#ifdef __SUNPRO_C 343#pragma pipeloop(0) 344#endif /* __SUNPRO_C */ 345 for (j = 0; j <= (hsize - 2); j += 2) { 346 p0 = p2; p1 = p3; 347 p2 = sp[0]; 348 p3 = sp[sll]; 349 350 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); 351 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]); 352 353 dp[0 ] = FROM_S32(d0); 354 dp[dll] = FROM_S32(d1); 355 356 pbuff[j] = 0; 357 pbuff[j + 1] = 0; 358 359 sp += 2*sll; 360 dp += 2*dll; 361 } 362 363 if (j < hsize) { 364 p0 = p2; p1 = p3; 365 p2 = sp[0]; 366 367 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); 368 369 pbuff[j] = 0; 370 371 dp[0] = FROM_S32(d0); 372 } 373 374 } else if (kh == 2) { 375 sp += sll; 376 377#ifdef __SUNPRO_C 378#pragma pipeloop(0) 379#endif /* __SUNPRO_C */ 380 for (j = 0; j <= (hsize - 2); j += 2) { 381 p0 = p2; 382 p1 = sp[0]; 383 p2 = sp[sll]; 384 385 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); 386 d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]); 387 388 dp[0 ] = FROM_S32(d0); 389 dp[dll] = FROM_S32(d1); 390 391 pbuff[j] = 0; 392 pbuff[j + 1] = 0; 393 394 sp += 2*sll; 395 dp += 2*dll; 396 } 397 398 if (j < hsize) { 399 p0 = p2; 400 p1 = sp[0]; 401 402 d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); 403 404 pbuff[j] = 0; 405 406 dp[0] = FROM_S32(d0); 407 } 408 409 } else /* if (kh == 1) */ { 410#ifdef __SUNPRO_C 411#pragma pipeloop(0) 412#endif /* __SUNPRO_C */ 413 for (j = 0; j < hsize; j++) { 414 p0 = sp[0]; 415 416 d0 = D2I(p0*k0 + pbuff[j]); 417 418 dp[0] = FROM_S32(d0); 419 420 pbuff[j] = 0; 421 422 sp += sll; 423 dp += dll; 424 } 425 } 426 427 sl += chan1; 428 dl += chan1; 429 } 430 } 431 432 sl_c += max_hsize*sll; 433 dl_c += max_hsize*dll; 434 } 435 436 if (pbuff != buff) mlib_free(pbuff); 437 438 return MLIB_SUCCESS; 439} 440 441/***************************************************************/ 442mlib_status CONV_FUNC(MxN)(mlib_image *dst, 443 const mlib_image *src, 444 const mlib_s32 *kernel, 445 mlib_s32 m, 446 mlib_s32 n, 447 mlib_s32 dm, 448 mlib_s32 dn, 449 mlib_s32 scale, 450 mlib_s32 cmask) 451{ 452 FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)]; 453 FTYPE **buffs = buffs_arr, *buffd; 454 FTYPE akernel[256], *k = akernel, fscale = DSCALE; 455 mlib_s32 mn, l, off, kw, bsize, buff_ind; 456 mlib_s32 d0, d1; 457 FTYPE k0, k1, k2, k3, k4, k5, k6; 458 FTYPE p0, p1, p2, p3, p4, p5, p6, p7; 459 d64_2x32 dd; 460 DEF_VARS(DTYPE); 461 mlib_s32 chan2; 462 mlib_s32 *buffo, *buffi; 463 mlib_status status = MLIB_SUCCESS; 464 465 GET_SRC_DST_PARAMETERS(DTYPE); 466 467 if (scale > 30) { 468 fscale *= 1.0/(1 << 30); 469 scale -= 30; 470 } 471 472 fscale /= (1 << scale); 473 474 mn = m*n; 475 476 if (mn > 256) { 477 k = mlib_malloc(mn*sizeof(mlib_d64)); 478 479 if (k == NULL) return MLIB_FAILURE; 480 } 481 482 for (i = 0; i < mn; i++) { 483 k[i] = kernel[i]*fscale; 484 } 485 486 if (m == 1) { 487 status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask); 488 FREE_AND_RETURN_STATUS; 489 } 490 491 bsize = (n + 3)*wid; 492 493 if ((bsize > BUFF_SIZE) || (n > MAX_N)) { 494 pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1)); 495 496 if (pbuff == NULL) { 497 status = MLIB_FAILURE; 498 FREE_AND_RETURN_STATUS; 499 } 500 buffs = (FTYPE **)(pbuff + bsize); 501 } 502 503 for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid; 504 for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l]; 505 buffd = buffs[n] + wid; 506 buffo = (mlib_s32*)(buffd + wid); 507 buffi = buffo + (wid &~ 1); 508 509 chan1 = nchannel; 510 chan2 = chan1 + chan1; 511 512 wid -= (m - 1); 513 hgt -= (n - 1); 514 adr_dst += dn*dll + dm*nchannel; 515 516 for (c = 0; c < nchannel; c++) { 517 if (!(cmask & (1 << (chan1 - 1 - c)))) continue; 518 519 sl = adr_src + c; 520 dl = adr_dst + c; 521 522 for (l = 0; l < n; l++) { 523 FTYPE *buff = buffs[l]; 524 525#ifdef __SUNPRO_C 526#pragma pipeloop(0) 527#endif /* __SUNPRO_C */ 528 for (i = 0; i < wid + (m - 1); i++) { 529 buff[i] = (FTYPE)sl[i*chan1]; 530 } 531 532 sl += sll; 533 } 534 535 buff_ind = 0; 536 537#ifdef __SUNPRO_C 538#pragma pipeloop(0) 539#endif /* __SUNPRO_C */ 540 for (i = 0; i < wid; i++) buffd[i] = 0.0; 541 542 for (j = 0; j < hgt; j++) { 543 FTYPE **buffc = buffs + buff_ind; 544 FTYPE *buffn = buffc[n]; 545 FTYPE *pk = k; 546 547 for (l = 0; l < n; l++) { 548 FTYPE *buff_l = buffc[l]; 549 550 for (off = 0; off < m;) { 551 FTYPE *buff = buff_l + off; 552 553 kw = m - off; 554 555 if (kw > 2*MAX_KER) kw = MAX_KER; else 556 if (kw > MAX_KER) kw = kw/2; 557 off += kw; 558 559 sp = sl; 560 dp = dl; 561 562 p2 = buff[0]; p3 = buff[1]; p4 = buff[2]; 563 p5 = buff[3]; p6 = buff[4]; p7 = buff[5]; 564 565 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 566 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 567 pk += kw; 568 569 if (kw == 7) { 570 571 if (l < (n - 1) || off < m) { 572#ifdef __SUNPRO_C 573#pragma pipeloop(0) 574#endif /* __SUNPRO_C */ 575 for (i = 0; i <= (wid - 2); i += 2) { 576 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 577 578 p6 = buff[i + 6]; p7 = buff[i + 7]; 579 580 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 581 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 582 } 583 584 } else { 585#ifdef __SUNPRO_C 586#pragma pipeloop(0) 587#endif /* __SUNPRO_C */ 588 for (i = 0; i <= (wid - 2); i += 2) { 589 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 590 591 p6 = buff[i + 6]; p7 = buff[i + 7]; 592 593 LOAD_BUFF(buffi); 594 595 dd.d64 = *(FTYPE *)(buffi + i); 596 buffn[i ] = (FTYPE)dd.i32s.i0; 597 buffn[i + 1] = (FTYPE)dd.i32s.i1; 598 599 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 600 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 601 602 dp[0 ] = FROM_S32(d0); 603 dp[chan1] = FROM_S32(d1); 604 605 buffd[i ] = 0.0; 606 buffd[i + 1] = 0.0; 607 608 sp += chan2; 609 dp += chan2; 610 } 611 } 612 613 } else if (kw == 6) { 614 615 if (l < (n - 1) || off < m) { 616#ifdef __SUNPRO_C 617#pragma pipeloop(0) 618#endif /* __SUNPRO_C */ 619 for (i = 0; i <= (wid - 2); i += 2) { 620 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 621 622 p5 = buff[i + 5]; p6 = buff[i + 6]; 623 624 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 625 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 626 } 627 628 } else { 629#ifdef __SUNPRO_C 630#pragma pipeloop(0) 631#endif /* __SUNPRO_C */ 632 for (i = 0; i <= (wid - 2); i += 2) { 633 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 634 635 p5 = buff[i + 5]; p6 = buff[i + 6]; 636 637 buffn[i ] = (FTYPE)sp[0]; 638 buffn[i + 1] = (FTYPE)sp[chan1]; 639 640 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 641 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 642 643 dp[0 ] = FROM_S32(d0); 644 dp[chan1] = FROM_S32(d1); 645 646 buffd[i ] = 0.0; 647 buffd[i + 1] = 0.0; 648 649 sp += chan2; 650 dp += chan2; 651 } 652 } 653 654 } else if (kw == 5) { 655 656 if (l < (n - 1) || off < m) { 657#ifdef __SUNPRO_C 658#pragma pipeloop(0) 659#endif /* __SUNPRO_C */ 660 for (i = 0; i <= (wid - 2); i += 2) { 661 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 662 663 p4 = buff[i + 4]; p5 = buff[i + 5]; 664 665 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 666 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 667 } 668 669 } else { 670#ifdef __SUNPRO_C 671#pragma pipeloop(0) 672#endif /* __SUNPRO_C */ 673 for (i = 0; i <= (wid - 2); i += 2) { 674 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 675 676 p4 = buff[i + 4]; p5 = buff[i + 5]; 677 678 buffn[i ] = (FTYPE)sp[0]; 679 buffn[i + 1] = (FTYPE)sp[chan1]; 680 681 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 682 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 683 684 dp[0 ] = FROM_S32(d0); 685 dp[chan1] = FROM_S32(d1); 686 687 buffd[i ] = 0.0; 688 buffd[i + 1] = 0.0; 689 690 sp += chan2; 691 dp += chan2; 692 } 693 } 694 695 } else if (kw == 4) { 696 697 if (l < (n - 1) || off < m) { 698#ifdef __SUNPRO_C 699#pragma pipeloop(0) 700#endif /* __SUNPRO_C */ 701 for (i = 0; i <= (wid - 2); i += 2) { 702 p0 = p2; p1 = p3; p2 = p4; 703 704 p3 = buff[i + 3]; p4 = buff[i + 4]; 705 706 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 707 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 708 } 709 710 } else { 711#ifdef __SUNPRO_C 712#pragma pipeloop(0) 713#endif /* __SUNPRO_C */ 714 for (i = 0; i <= (wid - 2); i += 2) { 715 p0 = p2; p1 = p3; p2 = p4; 716 717 p3 = buff[i + 3]; p4 = buff[i + 4]; 718 719 buffn[i ] = (FTYPE)sp[0]; 720 buffn[i + 1] = (FTYPE)sp[chan1]; 721 722 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 723 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 724 725 dp[0 ] = FROM_S32(d0); 726 dp[chan1] = FROM_S32(d1); 727 728 buffd[i ] = 0.0; 729 buffd[i + 1] = 0.0; 730 731 sp += chan2; 732 dp += chan2; 733 } 734 } 735 736 } else if (kw == 3) { 737 738 if (l < (n - 1) || off < m) { 739#ifdef __SUNPRO_C 740#pragma pipeloop(0) 741#endif /* __SUNPRO_C */ 742 for (i = 0; i <= (wid - 2); i += 2) { 743 p0 = p2; p1 = p3; 744 745 p2 = buff[i + 2]; p3 = buff[i + 3]; 746 747 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 748 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 749 } 750 751 } else { 752#ifdef __SUNPRO_C 753#pragma pipeloop(0) 754#endif /* __SUNPRO_C */ 755 for (i = 0; i <= (wid - 2); i += 2) { 756 p0 = p2; p1 = p3; 757 758 p2 = buff[i + 2]; p3 = buff[i + 3]; 759 760 buffn[i ] = (FTYPE)sp[0]; 761 buffn[i + 1] = (FTYPE)sp[chan1]; 762 763 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 764 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 765 766 dp[0 ] = FROM_S32(d0); 767 dp[chan1] = FROM_S32(d1); 768 769 buffd[i ] = 0.0; 770 buffd[i + 1] = 0.0; 771 772 sp += chan2; 773 dp += chan2; 774 } 775 } 776 777 } else /*if (kw == 2)*/ { 778 779 if (l < (n - 1) || off < m) { 780#ifdef __SUNPRO_C 781#pragma pipeloop(0) 782#endif /* __SUNPRO_C */ 783 for (i = 0; i <= (wid - 2); i += 2) { 784 p0 = p2; 785 786 p1 = buff[i + 1]; p2 = buff[i + 2]; 787 788 buffd[i ] += p0*k0 + p1*k1; 789 buffd[i + 1] += p1*k0 + p2*k1; 790 } 791 792 } else { 793#ifdef __SUNPRO_C 794#pragma pipeloop(0) 795#endif /* __SUNPRO_C */ 796 for (i = 0; i <= (wid - 2); i += 2) { 797 p0 = p2; 798 799 p1 = buff[i + 1]; p2 = buff[i + 2]; 800 801 buffn[i ] = (FTYPE)sp[0]; 802 buffn[i + 1] = (FTYPE)sp[chan1]; 803 804 d0 = D2I(p0*k0 + p1*k1 + buffd[i ]); 805 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]); 806 807 dp[0 ] = FROM_S32(d0); 808 dp[chan1] = FROM_S32(d1); 809 810 buffd[i ] = 0.0; 811 buffd[i + 1] = 0.0; 812 813 sp += chan2; 814 dp += chan2; 815 } 816 } 817 } 818 } 819 } 820 821 /* last pixels */ 822 for (; i < wid; i++) { 823 FTYPE *pk = k, s = 0; 824 mlib_s32 x, d0; 825 826 for (l = 0; l < n; l++) { 827 FTYPE *buff = buffc[l] + i; 828 829 for (x = 0; x < m; x++) s += buff[x] * (*pk++); 830 } 831 832 d0 = D2I(s); 833 dp[0] = FROM_S32(d0); 834 835 buffn[i] = (FTYPE)sp[0]; 836 837 sp += chan1; 838 dp += chan1; 839 } 840 841 for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1]; 842 843 /* next line */ 844 sl += sll; 845 dl += dll; 846 847 buff_ind++; 848 849 if (buff_ind >= n + 1) buff_ind = 0; 850 } 851 } 852 853 FREE_AND_RETURN_STATUS; 854} 855 856/***************************************************************/ 857#ifndef __sparc /* for x86, using integer multiplies is faster */ 858 859#define STORE_RES(res, x) \ 860 x >>= shift2; \ 861 CLAMP_STORE(res, x) 862 863mlib_status CONV_FUNC_I(MxN)(mlib_image *dst, 864 const mlib_image *src, 865 const mlib_s32 *kernel, 866 mlib_s32 m, 867 mlib_s32 n, 868 mlib_s32 dm, 869 mlib_s32 dn, 870 mlib_s32 scale, 871 mlib_s32 cmask) 872{ 873 mlib_s32 buff[BUFF_SIZE], *buffd = buff; 874 mlib_s32 l, off, kw; 875 mlib_s32 d0, d1, shift1, shift2; 876 mlib_s32 k0, k1, k2, k3, k4, k5, k6; 877 mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7; 878 DTYPE *adr_src, *sl, *sp = NULL; 879 DTYPE *adr_dst, *dl, *dp = NULL; 880 mlib_s32 wid, hgt, sll, dll; 881 mlib_s32 nchannel, chan1; 882 mlib_s32 i, j, c; 883 mlib_s32 chan2; 884 mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl; 885 GET_SRC_DST_PARAMETERS(DTYPE); 886 887#if IMG_TYPE != 1 888 shift1 = 16; 889#else 890 shift1 = 8; 891#endif /* IMG_TYPE != 1 */ 892 shift2 = scale - shift1; 893 894 chan1 = nchannel; 895 chan2 = chan1 + chan1; 896 897 wid -= (m - 1); 898 hgt -= (n - 1); 899 adr_dst += dn*dll + dm*nchannel; 900 901 if (wid > BUFF_SIZE) { 902 buffd = mlib_malloc(sizeof(mlib_s32)*wid); 903 904 if (buffd == NULL) return MLIB_FAILURE; 905 } 906 907 if (m*n > MAX_N*MAX_N) { 908 k = mlib_malloc(sizeof(mlib_s32)*(m*n)); 909 910 if (k == NULL) { 911 if (buffd != buff) mlib_free(buffd); 912 return MLIB_FAILURE; 913 } 914 } 915 916 for (i = 0; i < m*n; i++) { 917 k[i] = kernel[i] >> shift1; 918 } 919 920 for (c = 0; c < nchannel; c++) { 921 if (!(cmask & (1 << (nchannel - 1 - c)))) continue; 922 923 sl = adr_src + c; 924 dl = adr_dst + c; 925 926#ifdef __SUNPRO_C 927#pragma pipeloop(0) 928#endif /* __SUNPRO_C */ 929 for (i = 0; i < wid; i++) buffd[i] = 0; 930 931 for (j = 0; j < hgt; j++) { 932 mlib_s32 *pk = k; 933 934 for (l = 0; l < n; l++) { 935 DTYPE *sp0 = sl + l*sll; 936 937 for (off = 0; off < m;) { 938 sp = sp0 + off*chan1; 939 dp = dl; 940 941 kw = m - off; 942 943 if (kw > 2*MAX_KER) kw = MAX_KER; else 944 if (kw > MAX_KER) kw = kw/2; 945 off += kw; 946 947 p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2]; 948 p5 = sp[chan2 + chan1]; p6 = sp[chan2 + chan2]; p7 = sp[5*chan1]; 949 950 k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; 951 k4 = pk[4]; k5 = pk[5]; k6 = pk[6]; 952 pk += kw; 953 954 sp += (kw - 1)*chan1; 955 956 if (kw == 7) { 957 958 if (l < (n - 1) || off < m) { 959#ifdef __SUNPRO_C 960#pragma pipeloop(0) 961#endif /* __SUNPRO_C */ 962 for (i = 0; i <= (wid - 2); i += 2) { 963 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 964 p6 = sp[0]; 965 p7 = sp[chan1]; 966 967 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6; 968 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6; 969 970 sp += chan2; 971 } 972 973 } else { 974#ifdef __SUNPRO_C 975#pragma pipeloop(0) 976#endif /* __SUNPRO_C */ 977 for (i = 0; i <= (wid - 2); i += 2) { 978 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7; 979 p6 = sp[0]; 980 p7 = sp[chan1]; 981 982 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i ]); 983 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]); 984 985 STORE_RES(dp[0 ], d0); 986 STORE_RES(dp[chan1], d1); 987 988 buffd[i ] = 0; 989 buffd[i + 1] = 0; 990 991 sp += chan2; 992 dp += chan2; 993 } 994 } 995 996 } else if (kw == 6) { 997 998 if (l < (n - 1) || off < m) { 999#ifdef __SUNPRO_C 1000#pragma pipeloop(0) 1001#endif /* __SUNPRO_C */ 1002 for (i = 0; i <= (wid - 2); i += 2) { 1003 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 1004 p5 = sp[0]; 1005 p6 = sp[chan1]; 1006 1007 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5; 1008 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5; 1009 1010 sp += chan2; 1011 } 1012 1013 } else { 1014#ifdef __SUNPRO_C 1015#pragma pipeloop(0) 1016#endif /* __SUNPRO_C */ 1017 for (i = 0; i <= (wid - 2); i += 2) { 1018 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; 1019 p5 = sp[0]; 1020 p6 = sp[chan1]; 1021 1022 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i ]); 1023 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]); 1024 1025 STORE_RES(dp[0 ], d0); 1026 STORE_RES(dp[chan1], d1); 1027 1028 buffd[i ] = 0; 1029 buffd[i + 1] = 0; 1030 1031 sp += chan2; 1032 dp += chan2; 1033 } 1034 } 1035 1036 } else if (kw == 5) { 1037 1038 if (l < (n - 1) || off < m) { 1039#ifdef __SUNPRO_C 1040#pragma pipeloop(0) 1041#endif /* __SUNPRO_C */ 1042 for (i = 0; i <= (wid - 2); i += 2) { 1043 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 1044 p4 = sp[0]; 1045 p5 = sp[chan1]; 1046 1047 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4; 1048 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4; 1049 1050 sp += chan2; 1051 } 1052 1053 } else { 1054#ifdef __SUNPRO_C 1055#pragma pipeloop(0) 1056#endif /* __SUNPRO_C */ 1057 for (i = 0; i <= (wid - 2); i += 2) { 1058 p0 = p2; p1 = p3; p2 = p4; p3 = p5; 1059 p4 = sp[0]; 1060 p5 = sp[chan1]; 1061 1062 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i ]); 1063 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]); 1064 1065 STORE_RES(dp[0 ], d0); 1066 STORE_RES(dp[chan1], d1); 1067 1068 buffd[i ] = 0; 1069 buffd[i + 1] = 0; 1070 1071 sp += chan2; 1072 dp += chan2; 1073 } 1074 } 1075 1076 } else if (kw == 4) { 1077 1078 if (l < (n - 1) || off < m) { 1079#ifdef __SUNPRO_C 1080#pragma pipeloop(0) 1081#endif /* __SUNPRO_C */ 1082 for (i = 0; i <= (wid - 2); i += 2) { 1083 p0 = p2; p1 = p3; p2 = p4; 1084 p3 = sp[0]; 1085 p4 = sp[chan1]; 1086 1087 buffd[i ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; 1088 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; 1089 1090 sp += chan2; 1091 } 1092 1093 } else { 1094#ifdef __SUNPRO_C 1095#pragma pipeloop(0) 1096#endif /* __SUNPRO_C */ 1097 for (i = 0; i <= (wid - 2); i += 2) { 1098 p0 = p2; p1 = p3; p2 = p4; 1099 p3 = sp[0]; 1100 p4 = sp[chan1]; 1101 1102 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i ]); 1103 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]); 1104 1105 STORE_RES(dp[0 ], d0); 1106 STORE_RES(dp[chan1], d1); 1107 1108 buffd[i ] = 0; 1109 buffd[i + 1] = 0; 1110 1111 sp += chan2; 1112 dp += chan2; 1113 } 1114 } 1115 1116 } else if (kw == 3) { 1117 1118 if (l < (n - 1) || off < m) { 1119#ifdef __SUNPRO_C 1120#pragma pipeloop(0) 1121#endif /* __SUNPRO_C */ 1122 for (i = 0; i <= (wid - 2); i += 2) { 1123 p0 = p2; p1 = p3; 1124 p2 = sp[0]; 1125 p3 = sp[chan1]; 1126 1127 buffd[i ] += p0*k0 + p1*k1 + p2*k2; 1128 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2; 1129 1130 sp += chan2; 1131 } 1132 1133 } else { 1134#ifdef __SUNPRO_C 1135#pragma pipeloop(0) 1136#endif /* __SUNPRO_C */ 1137 for (i = 0; i <= (wid - 2); i += 2) { 1138 p0 = p2; p1 = p3; 1139 p2 = sp[0]; 1140 p3 = sp[chan1]; 1141 1142 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i ]); 1143 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]); 1144 1145 STORE_RES(dp[0 ], d0); 1146 STORE_RES(dp[chan1], d1); 1147 1148 buffd[i ] = 0; 1149 buffd[i + 1] = 0; 1150 1151 sp += chan2; 1152 dp += chan2; 1153 } 1154 } 1155 1156 } else if (kw == 2) { 1157 1158 if (l < (n - 1) || off < m) { 1159#ifdef __SUNPRO_C 1160#pragma pipeloop(0) 1161#endif /* __SUNPRO_C */ 1162 for (i = 0; i <= (wid - 2); i += 2) { 1163 p0 = p2; 1164 p1 = sp[0]; 1165 p2 = sp[chan1]; 1166 1167 buffd[i ] += p0*k0 + p1*k1; 1168 buffd[i + 1] += p1*k0 + p2*k1; 1169 1170 sp += chan2; 1171 } 1172 1173 } else { 1174#ifdef __SUNPRO_C 1175#pragma pipeloop(0) 1176#endif /* __SUNPRO_C */ 1177 for (i = 0; i <= (wid - 2); i += 2) { 1178 p0 = p2; 1179 p1 = sp[0]; 1180 p2 = sp[chan1]; 1181 1182 d0 = (p0*k0 + p1*k1 + buffd[i ]); 1183 d1 = (p1*k0 + p2*k1 + buffd[i + 1]); 1184 1185 STORE_RES(dp[0 ], d0); 1186 STORE_RES(dp[chan1], d1); 1187 1188 buffd[i ] = 0; 1189 buffd[i + 1] = 0; 1190 1191 sp += chan2; 1192 dp += chan2; 1193 } 1194 } 1195 1196 } else /*if (kw == 1)*/ { 1197 1198 if (l < (n - 1) || off < m) { 1199#ifdef __SUNPRO_C 1200#pragma pipeloop(0) 1201#endif /* __SUNPRO_C */ 1202 for (i = 0; i <= (wid - 2); i += 2) { 1203 p0 = sp[0]; 1204 p1 = sp[chan1]; 1205 1206 buffd[i ] += p0*k0; 1207 buffd[i + 1] += p1*k0; 1208 1209 sp += chan2; 1210 } 1211 1212 } else { 1213#ifdef __SUNPRO_C 1214#pragma pipeloop(0) 1215#endif /* __SUNPRO_C */ 1216 for (i = 0; i <= (wid - 2); i += 2) { 1217 p0 = sp[0]; 1218 p1 = sp[chan1]; 1219 1220 d0 = (p0*k0 + buffd[i ]); 1221 d1 = (p1*k0 + buffd[i + 1]); 1222 1223 STORE_RES(dp[0 ], d0); 1224 STORE_RES(dp[chan1], d1); 1225 1226 buffd[i ] = 0; 1227 buffd[i + 1] = 0; 1228 1229 sp += chan2; 1230 dp += chan2; 1231 } 1232 } 1233 } 1234 } 1235 } 1236 1237 /* last pixels */ 1238 for (; i < wid; i++) { 1239 mlib_s32 *pk = k, s = 0; 1240 mlib_s32 x; 1241 1242 for (l = 0; l < n; l++) { 1243 sp = sl + l*sll + i*chan1; 1244 1245 for (x = 0; x < m; x++) { 1246 s += sp[0] * pk[0]; 1247 sp += chan1; 1248 pk ++; 1249 } 1250 } 1251 1252 STORE_RES(dp[0], s); 1253 1254 sp += chan1; 1255 dp += chan1; 1256 } 1257 1258 sl += sll; 1259 dl += dll; 1260 } 1261 } 1262 1263 if (buffd != buff) mlib_free(buffd); 1264 if (k != k_locl) mlib_free(k); 1265 1266 return MLIB_SUCCESS; 1267} 1268 1269/***************************************************************/ 1270#endif /* __sparc ( for x86, using integer multiplies is faster ) */ 1271 1272/***************************************************************/ 1273