/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng 1.0.8 - July 24, 2000
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 */

/* $Id: pngvcrd.c 14574 2005-10-29 16:27:43Z bonefish $ */

#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

/*
   One of these might need to be defined.
#define DISABLE_PNGVCRD_COMBINE
#define DISABLE_PNGVCRD_INTERLACE
*/

/* Cached result of mmxsupport(): 2 = not yet probed, 1 = MMX present,
 * 0 = MMX absent.  Probed lazily the first time it is needed. */
static int mmx_supported=2;

void /* PRIVATE */
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
   png_bytep row, png_bytep prev_row, int filter);

/* Runtime probe for MMX support.
 *
 * Step 1: verify the CPUID instruction exists by attempting to toggle
 *         the ID bit (bit 21) of EFLAGS; if the bit cannot be changed,
 *         CPUID is unavailable and MMX is assumed absent.
 * Step 2: execute CPUID function 0 to confirm function 1 is available,
 *         then CPUID function 1 and test the MMX feature flag in EDX
 *         (mask 0x00800000, i.e. bit 23).
 *
 * Returns 1 if MMX is supported, 0 otherwise.  CPUID is emitted as raw
 * opcode bytes (0x0f 0xa2) because older MSVC assemblers do not know
 * the mnemonic. */
static int mmxsupport()
{
  int mmx_supported_local = 0;
  _asm {
    push ebx          //CPUID will trash these
    push ecx
    push edx
    pushfd            //Save Eflag to stack
    pop eax           //Get Eflag from stack into eax
    mov ecx, eax      //Make another copy of Eflag in ecx
    xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
    push eax          //Save modified Eflag back to stack

    popfd             //Restored modified value back to Eflag reg
    pushfd            //Save Eflag to stack
    pop eax           //Get Eflag from stack
    xor eax, ecx      //Compare the new Eflag with the original Eflag
    jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
                      //skip following instructions and jump to
                      //NOT_SUPPORTED label

    xor eax, eax      //Set eax to zero

    _asm _emit 0x0f   //CPUID instruction (two bytes opcode)
    _asm _emit 0xa2

    cmp eax, 1        //make sure eax returned a non-zero value, i.e.
    jl NOT_SUPPORTED  //CPUID function 1 is available; if not, no MMX

    xor eax, eax      //set eax to zero
    inc eax           //Now increment eax to 1.  This instruction is
                      //faster than the instruction "mov eax, 1"

    _asm _emit 0x0f   //CPUID instruction
    _asm _emit 0xa2

    and edx, 0x00800000 //mask out all bits but the MMX flag (EDX bit 23)
    cmp edx, 0          //zero means MMX is NOT supported,
    jz NOT_SUPPORTED    //so leave the return value at 0

    mov mmx_supported_local, 1 //set return value to 1

NOT_SUPPORTED:
    mov eax, mmx_supported_local //move return value to eax
    pop edx           //CPUID trashed these
    pop ecx
    pop ebx
  }

  //mmx_supported_local=0; // test code for force don't support MMX
  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

  return mmx_supported_local;
}

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask.
 */

/* Use this routine for x86 platform - uses faster MMX routine if machine
   supports MMX */

void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   /* Number of output pixels advanced per input pixel, per Adam7 pass. */
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif
#ifdef DISABLE_PNGVCRD_COMBINE
   int save_mmx_supported = mmx_supported;
#endif

   png_debug(1,"in png_combine_row_asm\n");

#ifdef DISABLE_PNGVCRD_COMBINE
   if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
       mmx_supported = 0;
   else
#endif
       if (mmx_supported == 2)
           mmx_supported = mmxsupport();

   if (mask == 0xff)
   {
      /* All pixels selected: a straight copy of the packed row suffices. */
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:  /* 1 bit per pixel, 8 pixels per byte (no MMX path) */
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;       /* rotating copy of the 8-bit pixel mask */
            int shift;   /* current bit position within *sp / *dp */
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
                s_start = 0;
                s_end = 7;
                s_inc = 1;
            }
            else
#endif
            {
                s_start = 7;
                s_end = 0;
                s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  /* Extract the source bit, clear the corresponding
                   * destination bit, and merge the source bit in. */
                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:  /* 2 bits per pixel, 4 pixels per byte (no MMX path) */
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;       /* rotating copy of the 8-bit pixel mask */
            int shift;   /* current bit position within *sp / *dp */
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  /* Copy the selected 2-bit field from source to dest. */
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:  /* 4 bits per pixel, 2 pixels per byte (no MMX path) */
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;       /* rotating copy of the 8-bit pixel mask */
            int shift;   /* current bit position within *sp / *dp */
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  /* Copy the selected nibble from source to dest. */
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 8:  /* 1 byte per pixel */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            /* One selector bit per byte lane, high lane first: after
             * expansion against ~mask, each byte of mm0 becomes 0xFF
             * for lanes whose mask bit is set (copy source byte) and
             * 0x00 for lanes to be left untouched.  Must be a local
             * so the inline assembler can address it. */
            __int64 mask0=0x0102040810204080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;  //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost

               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0

                  pand       mm0,mm7       //nonzero if keep byte
                  pcmpeqb    mm0,mm6       //zeros->1s, v versa

                  mov        ecx,len       //load length of line (pixels)
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0         //lcr
                  je         mainloop8end

mainloop8:
                  /* dest = (src & mm0) | (dest & ~mm0), 8 pixels at a time */
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm4,mm6
                  movq       [ebx],mm4

                  add        esi,8         //inc by 8 bytes processed
                  add        ebx,8
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop8
mainloop8end:

                  /* Handle the 1-7 leftover pixels one at a time, shifting
                   * the mask bits out through the carry flag. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end8

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte

secondloop8:
                  sal        edx,1         //move high bit to CF
                  jnc        skip8         //if CF = 0
                  mov        al,[esi]
                  mov        [ebx],al
skip8:
                  inc        esi
                  inc        ebx

                  dec        ecx
                  jnz        secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 8 bpp

         case 16:  /* 2 bytes per pixel */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            /* Each mask bit is duplicated across the 2 byte lanes of
             * its pixel; mask0 covers pixels 0-3, mask1 pixels 4-7. */
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1

                  pand       mm0,mm7
                  pand       mm1,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0         //lcr
                  jz         mainloop16end

mainloop16:
                  /* Merge 8 pixels (16 bytes) per iteration. */
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  add        esi,16        //inc by 16 bytes processed
                  add        ebx,16
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop16

mainloop16end:
                  /* Leftover 1-7 pixels, 2 bytes each. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end16

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop16:
                  sal        edx,1         //move high bit to CF
                  jnc        skip16        //if CF = 0
                  mov        ax,[esi]
                  mov        [ebx],ax
skip16:
                  add        esi,2
                  add        ebx,2

                  dec        ecx
                  jnz        secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 16 bpp

         case 24:  /* 3 bytes per pixel */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            /* Each mask bit duplicated across the 3 byte lanes of its
             * pixel; 8 pixels span 24 bytes = three quadwords. */
            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest
                  cmp        ecx,0
                  jz         mainloop24end

mainloop24:
                  /* Merge 8 pixels (24 bytes) per iteration. */
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4


                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  add        esi,24        //inc by 24 bytes processed
                  add        ebx,24
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop24

mainloop24end:
                  /* Leftover 1-7 pixels: copy 2 bytes + 1 byte each. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end24

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop24:
                  sal        edx,1         //move high bit to CF
                  jnc        skip24        //if CF = 0
                  mov        ax,[esi]
                  mov        [ebx],ax
                  xor        eax,eax
                  mov        al,[esi+2]
                  mov        [ebx+2],al
skip24:
                  add        esi,3
                  add        ebx,3

                  dec        ecx
                  jnz        secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 24 bpp

         case 32:  /* 4 bytes per pixel */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            /* Each mask bit duplicated across the 4 byte lanes of its
             * pixel; 8 pixels span 32 bytes = four quadwords. */
            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

            if (mmx_supported)
            {
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest

                  cmp        ecx,0         //lcr
                  jz         mainloop32end

mainloop32:
                  /* Merge 8 pixels (32 bytes) per iteration. */
                  movq       mm4,[esi]
                  pand       mm4,mm0
                  movq       mm6,mm0
                  movq       mm7,[ebx]
                  pandn      mm6,mm7
                  por        mm4,mm6
                  movq       [ebx],mm4

                  movq       mm5,[esi+8]
                  pand       mm5,mm1
                  movq       mm7,mm1
                  movq       mm6,[ebx+8]
                  pandn      mm7,mm6
                  por        mm5,mm7
                  movq       [ebx+8],mm5

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm4,mm2
                  movq       mm7,[ebx+16]
                  pandn      mm4,mm7
                  por        mm6,mm4
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm5,mm3
                  movq       mm4,[ebx+24]
                  pandn      mm5,mm4
                  por        mm7,mm5
                  movq       [ebx+24],mm7

                  add        esi,32        //inc by 32 bytes processed
                  add        ebx,32
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop32

mainloop32end:
                  /* Leftover 1-7 pixels, one dword each. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end32

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte
secondloop32:
                  sal        edx,1         //move high bit to CF
                  jnc        skip32        //if CF = 0
                  mov        eax,[esi]
                  mov        [ebx],eax
skip32:
                  add        esi,4
                  add        ebx,4

                  dec        ecx
                  jnz        secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 32 bpp

         case 48:  /* 6 bytes per pixel */
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            /* Each mask bit duplicated across the 6 byte lanes of its
             * pixel; 8 pixels span 48 bytes = six quadwords. */
            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

            if (mmx_supported)
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd       mm7, unmask   //load bit pattern
                  psubb      mm6,mm6       //zero mm6
                  punpcklbw  mm7,mm7
                  punpcklwd  mm7,mm7
                  punpckldq  mm7,mm7       //fill register with 8 masks

                  movq       mm0,mask0
                  movq       mm1,mask1
                  movq       mm2,mask2
                  movq       mm3,mask3
                  movq       mm4,mask4
                  movq       mm5,mask5

                  pand       mm0,mm7
                  pand       mm1,mm7
                  pand       mm2,mm7
                  pand       mm3,mm7
                  pand       mm4,mm7
                  pand       mm5,mm7

                  pcmpeqb    mm0,mm6
                  pcmpeqb    mm1,mm6
                  pcmpeqb    mm2,mm6
                  pcmpeqb    mm3,mm6
                  pcmpeqb    mm4,mm6
                  pcmpeqb    mm5,mm6

                  mov        ecx,len       //load length of line
                  mov        esi,srcptr    //load source
                  mov        ebx,dstptr    //load dest

                  cmp        ecx,0
                  jz         mainloop48end

mainloop48:
                  /* Merge 8 pixels (48 bytes) per iteration. */
                  movq       mm7,[esi]
                  pand       mm7,mm0
                  movq       mm6,mm0
                  pandn      mm6,[ebx]
                  por        mm7,mm6
                  movq       [ebx],mm7

                  movq       mm6,[esi+8]
                  pand       mm6,mm1
                  movq       mm7,mm1
                  pandn      mm7,[ebx+8]
                  por        mm6,mm7
                  movq       [ebx+8],mm6

                  movq       mm6,[esi+16]
                  pand       mm6,mm2
                  movq       mm7,mm2
                  pandn      mm7,[ebx+16]
                  por        mm6,mm7
                  movq       [ebx+16],mm6

                  movq       mm7,[esi+24]
                  pand       mm7,mm3
                  movq       mm6,mm3
                  pandn      mm6,[ebx+24]
                  por        mm7,mm6
                  movq       [ebx+24],mm7

                  movq       mm6,[esi+32]
                  pand       mm6,mm4
                  movq       mm7,mm4
                  pandn      mm7,[ebx+32]
                  por        mm6,mm7
                  movq       [ebx+32],mm6

                  movq       mm7,[esi+40]
                  pand       mm7,mm5
                  movq       mm6,mm5
                  pandn      mm6,[ebx+40]
                  por        mm7,mm6
                  movq       [ebx+40],mm7

                  add        esi,48        //inc by 48 bytes processed
                  add        ebx,48
                  sub        ecx,8         //dec by 8 pixels processed

                  ja         mainloop48
mainloop48end:

                  /* Leftover 1-7 pixels.  NOTE: only the first 4 of the
                   * 6 bytes of each leftover pixel are copied here; the
                   * pointers still advance by 4, not 6, per iteration. */
                  mov        ecx,diff
                  cmp        ecx,0
                  jz         end48

                  mov        edx,mask
                  sal        edx,24        //make low byte the high byte

secondloop48:
                  sal        edx,1         //move high bit to CF
                  jnc        skip48        //if CF = 0
                  mov        eax,[esi]
                  mov        [ebx],eax
skip48:
                  add        esi,4
                  add        ebx,4

                  dec        ecx
                  jnz        secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 48 bpp

         default:  /* any other whole-byte depth: plain C copy loop */
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

#ifdef DISABLE_PNGVCRD_COMBINE
   mmx_supported = save_mmx_supported;
#endif

} /* end png_combine_row() */


#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
976png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass, 977 png_uint_32 transformations) 978{ 979#ifdef PNG_USE_LOCAL_ARRAYS 980 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1}; 981#endif 982#ifdef DISABLE_PNGVCRD_INTERLACE 983 int save_mmx_supported = mmx_supported; 984#endif 985 986 png_debug(1,"in png_do_read_interlace\n"); 987 988#ifdef DISABLE_PNGVCRD_INTERLACE 989 /* In libpng versions 1.0.3a through 1.0.4d, 990 * a sign error in the post-MMX cleanup code for each pixel_depth resulted 991 * in bad pixels at the beginning of some rows of some images, and also 992 * (due to out-of-range memory reads and writes) caused heap corruption 993 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e, 994 * and the code appears to work completely correctly, so it is enabled 995 * by default. 996 */ 997 if (1) /* all passes caused a heap problem in the old code */ 998 mmx_supported = 0; 999 else 1000#endif 1001 if (mmx_supported == 2) 1002 mmx_supported = mmxsupport(); 1003 1004 if (row != NULL && row_info != NULL) 1005 { 1006 png_uint_32 final_width; 1007 1008 final_width = row_info->width * png_pass_inc[pass]; 1009 1010 switch (row_info->pixel_depth) 1011 { 1012 case 1: 1013 { 1014 png_bytep sp, dp; 1015 int sshift, dshift; 1016 int s_start, s_end, s_inc; 1017 png_byte v; 1018 png_uint_32 i; 1019 int j; 1020 1021 sp = row + (png_size_t)((row_info->width - 1) >> 3); 1022 dp = row + (png_size_t)((final_width - 1) >> 3); 1023#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1024 if (transformations & PNG_PACKSWAP) 1025 { 1026 sshift = (int)((row_info->width + 7) & 7); 1027 dshift = (int)((final_width + 7) & 7); 1028 s_start = 7; 1029 s_end = 0; 1030 s_inc = -1; 1031 } 1032 else 1033#endif 1034 { 1035 sshift = 7 - (int)((row_info->width + 7) & 7); 1036 dshift = 7 - (int)((final_width + 7) & 7); 1037 s_start = 0; 1038 s_end = 7; 1039 s_inc = 1; 1040 } 1041 1042 for (i = row_info->width; i; i--) 1043 { 1044 v = (png_byte)((*sp >> sshift) & 0x1); 1045 
for (j = 0; j < png_pass_inc[pass]; j++) 1046 { 1047 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); 1048 *dp |= (png_byte)(v << dshift); 1049 if (dshift == s_end) 1050 { 1051 dshift = s_start; 1052 dp--; 1053 } 1054 else 1055 dshift += s_inc; 1056 } 1057 if (sshift == s_end) 1058 { 1059 sshift = s_start; 1060 sp--; 1061 } 1062 else 1063 sshift += s_inc; 1064 } 1065 break; 1066 } 1067 1068 case 2: 1069 { 1070 png_bytep sp, dp; 1071 int sshift, dshift; 1072 int s_start, s_end, s_inc; 1073 png_uint_32 i; 1074 1075 sp = row + (png_size_t)((row_info->width - 1) >> 2); 1076 dp = row + (png_size_t)((final_width - 1) >> 2); 1077#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1078 if (transformations & PNG_PACKSWAP) 1079 { 1080 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1); 1081 dshift = (png_size_t)(((final_width + 3) & 3) << 1); 1082 s_start = 6; 1083 s_end = 0; 1084 s_inc = -2; 1085 } 1086 else 1087#endif 1088 { 1089 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1); 1090 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1); 1091 s_start = 0; 1092 s_end = 6; 1093 s_inc = 2; 1094 } 1095 1096 for (i = row_info->width; i; i--) 1097 { 1098 png_byte v; 1099 int j; 1100 1101 v = (png_byte)((*sp >> sshift) & 0x3); 1102 for (j = 0; j < png_pass_inc[pass]; j++) 1103 { 1104 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); 1105 *dp |= (png_byte)(v << dshift); 1106 if (dshift == s_end) 1107 { 1108 dshift = s_start; 1109 dp--; 1110 } 1111 else 1112 dshift += s_inc; 1113 } 1114 if (sshift == s_end) 1115 { 1116 sshift = s_start; 1117 sp--; 1118 } 1119 else 1120 sshift += s_inc; 1121 } 1122 break; 1123 } 1124 1125 case 4: 1126 { 1127 png_bytep sp, dp; 1128 int sshift, dshift; 1129 int s_start, s_end, s_inc; 1130 png_uint_32 i; 1131 1132 sp = row + (png_size_t)((row_info->width - 1) >> 1); 1133 dp = row + (png_size_t)((final_width - 1) >> 1); 1134#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1135 if (transformations & PNG_PACKSWAP) 1136 { 1137 sshift = 
(png_size_t)(((row_info->width + 1) & 1) << 2); 1138 dshift = (png_size_t)(((final_width + 1) & 1) << 2); 1139 s_start = 4; 1140 s_end = 0; 1141 s_inc = -4; 1142 } 1143 else 1144#endif 1145 { 1146 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2); 1147 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2); 1148 s_start = 0; 1149 s_end = 4; 1150 s_inc = 4; 1151 } 1152 1153 for (i = row_info->width; i; i--) 1154 { 1155 png_byte v; 1156 int j; 1157 1158 v = (png_byte)((*sp >> sshift) & 0xf); 1159 for (j = 0; j < png_pass_inc[pass]; j++) 1160 { 1161 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); 1162 *dp |= (png_byte)(v << dshift); 1163 if (dshift == s_end) 1164 { 1165 dshift = s_start; 1166 dp--; 1167 } 1168 else 1169 dshift += s_inc; 1170 } 1171 if (sshift == s_end) 1172 { 1173 sshift = s_start; 1174 sp--; 1175 } 1176 else 1177 sshift += s_inc; 1178 } 1179 break; 1180 } 1181 1182 default: // This is the place where the routine is modified 1183 { 1184 __int64 const4 = 0x0000000000FFFFFF; 1185 // __int64 const5 = 0x000000FFFFFF0000; // unused... 
1186 __int64 const6 = 0x00000000000000FF; 1187 png_bytep sptr, dp; 1188 png_uint_32 i; 1189 png_size_t pixel_bytes; 1190 int width = row_info->width; 1191 1192 pixel_bytes = (row_info->pixel_depth >> 3); 1193 1194 sptr = row + (width - 1) * pixel_bytes; 1195 dp = row + (final_width - 1) * pixel_bytes; 1196 // New code by Nirav Chhatrapati - Intel Corporation 1197 // sign fix by GRR 1198 // NOTE: there is NO MMX code for 48-bit and 64-bit images 1199 1200 if (mmx_supported) // use MMX routine if machine supports it 1201 { 1202 if (pixel_bytes == 3) 1203 { 1204 if (((pass == 0) || (pass == 1)) && width) 1205 { 1206 _asm 1207 { 1208 mov esi, sptr 1209 mov edi, dp 1210 mov ecx, width 1211 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes 1212loop_pass0: 1213 movd mm0, [esi] ; X X X X X v2 v1 v0 1214 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0 1215 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0 1216 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0 1217 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0 1218 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0 1219 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1 1220 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0 1221 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1 1222 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1 1223 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0 1224 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1 1225 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2 1226 movq [edi+16] , mm4 1227 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0 1228 movq [edi+8] , mm3 1229 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0 1230 sub esi, 3 1231 movq [edi], mm0 1232 sub edi, 24 1233 //sub esi, 3 1234 dec ecx 1235 jnz loop_pass0 1236 EMMS 1237 } 1238 } 1239 else if (((pass == 2) || (pass == 3)) && width) 1240 { 1241 _asm 1242 { 1243 mov esi, sptr 1244 mov edi, dp 1245 mov ecx, width 1246 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes 1247loop_pass2: 1248 movd mm0, [esi] ; X X X X X v2 v1 v0 1249 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0 1250 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0 1251 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0 1252 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0 1253 
psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0 1254 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1 1255 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0 1256 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1 1257 movq [edi+4], mm0 ; move to memory 1258 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0 1259 movd [edi], mm0 ; move to memory 1260 sub esi, 3 1261 sub edi, 12 1262 dec ecx 1263 jnz loop_pass2 1264 EMMS 1265 } 1266 } 1267 else if (width) /* && ((pass == 4) || (pass == 5)) */ 1268 { 1269 int width_mmx = ((width >> 1) << 1) - 8; 1270 if (width_mmx < 0) 1271 width_mmx = 0; 1272 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes 1273 if (width_mmx) 1274 { 1275 _asm 1276 { 1277 mov esi, sptr 1278 mov edi, dp 1279 mov ecx, width_mmx 1280 sub esi, 3 1281 sub edi, 9 1282loop_pass4: 1283 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3 1284 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3 1285 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3 1286 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0 1287 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3 1288 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0 1289 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3 1290 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0 1291 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0 1292 movq [edi], mm0 ; move quad to memory 1293 psrlq mm5, 16 ; 0 0 0 0 0 X X v2 1294 pand mm5, const6 ; 0 0 0 0 0 0 0 v2 1295 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2 1296 movd [edi+8], mm6 ; move double to memory 1297 sub esi, 6 1298 sub edi, 12 1299 sub ecx, 2 1300 jnz loop_pass4 1301 EMMS 1302 } 1303 } 1304 1305 sptr -= width_mmx*3; 1306 dp -= width_mmx*6; 1307 for (i = width; i; i--) 1308 { 1309 png_byte v[8]; 1310 int j; 1311 1312 png_memcpy(v, sptr, 3); 1313 for (j = 0; j < png_pass_inc[pass]; j++) 1314 { 1315 png_memcpy(dp, v, 3); 1316 dp -= 3; 1317 } 1318 sptr -= 3; 1319 } 1320 } 1321 } /* end of pixel_bytes == 3 */ 1322 1323 else if (pixel_bytes == 1) 1324 { 1325 if (((pass == 0) || (pass == 1)) && width) 1326 { 1327 int width_mmx = ((width >> 2) << 2); 1328 width -= width_mmx; 1329 if (width_mmx) 1330 { 1331 _asm 1332 { 1333 mov esi, sptr 1334 mov edi, dp 1335 mov ecx, 
width_mmx 1336 sub edi, 31 1337 sub esi, 3 1338loop1_pass0: 1339 movd mm0, [esi] ; X X X X v0 v1 v2 v3 1340 movq mm1, mm0 ; X X X X v0 v1 v2 v3 1341 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 1342 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 1343 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 1344 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3 1345 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3 1346 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2 1347 movq [edi], mm0 ; move to memory v3 1348 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 1349 movq [edi+8], mm3 ; move to memory v2 1350 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1 1351 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1 1352 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0 1353 movq [edi+16], mm2 ; move to memory v1 1354 movq [edi+24], mm4 ; move to memory v0 1355 sub esi, 4 1356 sub edi, 32 1357 sub ecx, 4 1358 jnz loop1_pass0 1359 EMMS 1360 } 1361 } 1362 1363 sptr -= width_mmx; 1364 dp -= width_mmx*8; 1365 for (i = width; i; i--) 1366 { 1367 int j; 1368 1369 /* I simplified this part in version 1.0.4e 1370 * here and in several other instances where 1371 * pixel_bytes == 1 -- GR-P 1372 * 1373 * Original code: 1374 * 1375 * png_byte v[8]; 1376 * png_memcpy(v, sptr, pixel_bytes); 1377 * for (j = 0; j < png_pass_inc[pass]; j++) 1378 * { 1379 * png_memcpy(dp, v, pixel_bytes); 1380 * dp -= pixel_bytes; 1381 * } 1382 * sptr -= pixel_bytes; 1383 * 1384 * Replacement code is in the next three lines: 1385 */ 1386 1387 for (j = 0; j < png_pass_inc[pass]; j++) 1388 *dp-- = *sptr; 1389 sptr--; 1390 } 1391 } 1392 else if (((pass == 2) || (pass == 3)) && width) 1393 { 1394 int width_mmx = ((width >> 2) << 2); 1395 width -= width_mmx; 1396 if (width_mmx) 1397 { 1398 _asm 1399 { 1400 mov esi, sptr 1401 mov edi, dp 1402 mov ecx, width_mmx 1403 sub edi, 15 1404 sub esi, 3 1405loop1_pass2: 1406 movd mm0, [esi] ; X X X X v0 v1 v2 v3 1407 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 1408 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 1409 punpcklwd mm0, mm0 ; 
v2 v2 v2 v2 v3 v3 v3 v3 1410 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1 1411 movq [edi], mm0 ; move to memory v2 and v3 1412 sub esi, 4 1413 movq [edi+8], mm1 ; move to memory v1 and v0 1414 sub edi, 16 1415 sub ecx, 4 1416 jnz loop1_pass2 1417 EMMS 1418 } 1419 } 1420 1421 sptr -= width_mmx; 1422 dp -= width_mmx*4; 1423 for (i = width; i; i--) 1424 { 1425 int j; 1426 1427 for (j = 0; j < png_pass_inc[pass]; j++) 1428 { 1429 *dp-- = *sptr; 1430 } 1431 sptr --; 1432 } 1433 } 1434 else if (width) /* && ((pass == 4) || (pass == 5))) */ 1435 { 1436 int width_mmx = ((width >> 3) << 3); 1437 width -= width_mmx; 1438 if (width_mmx) 1439 { 1440 _asm 1441 { 1442 mov esi, sptr 1443 mov edi, dp 1444 mov ecx, width_mmx 1445 sub edi, 15 1446 sub esi, 7 1447loop1_pass4: 1448 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7 1449 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7 1450 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7 1451 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 1452 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3 1453 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3 1454 sub esi, 8 1455 movq [edi], mm0 ; move to memory v4 v5 v6 and v7 1456 //sub esi, 4 1457 sub edi, 16 1458 sub ecx, 8 1459 jnz loop1_pass4 1460 EMMS 1461 } 1462 } 1463 1464 sptr -= width_mmx; 1465 dp -= width_mmx*2; 1466 for (i = width; i; i--) 1467 { 1468 int j; 1469 1470 for (j = 0; j < png_pass_inc[pass]; j++) 1471 { 1472 *dp-- = *sptr; 1473 } 1474 sptr --; 1475 } 1476 } 1477 } /* end of pixel_bytes == 1 */ 1478 1479 else if (pixel_bytes == 2) 1480 { 1481 if (((pass == 0) || (pass == 1)) && width) 1482 { 1483 int width_mmx = ((width >> 1) << 1); 1484 width -= width_mmx; 1485 if (width_mmx) 1486 { 1487 _asm 1488 { 1489 mov esi, sptr 1490 mov edi, dp 1491 mov ecx, width_mmx 1492 sub esi, 2 1493 sub edi, 30 1494loop2_pass0: 1495 movd mm0, [esi] ; X X X X v1 v0 v3 v2 1496 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 1497 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 1498 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 1499 
punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 1500 movq [edi], mm0 1501 movq [edi + 8], mm0 1502 movq [edi + 16], mm1 1503 movq [edi + 24], mm1 1504 sub esi, 4 1505 sub edi, 32 1506 sub ecx, 2 1507 jnz loop2_pass0 1508 EMMS 1509 } 1510 } 1511 1512 sptr -= (width_mmx*2 - 2); // sign fixed 1513 dp -= (width_mmx*16 - 2); // sign fixed 1514 for (i = width; i; i--) 1515 { 1516 png_byte v[8]; 1517 int j; 1518 sptr -= 2; 1519 png_memcpy(v, sptr, 2); 1520 for (j = 0; j < png_pass_inc[pass]; j++) 1521 { 1522 dp -= 2; 1523 png_memcpy(dp, v, 2); 1524 } 1525 } 1526 } 1527 else if (((pass == 2) || (pass == 3)) && width) 1528 { 1529 int width_mmx = ((width >> 1) << 1) ; 1530 width -= width_mmx; 1531 if (width_mmx) 1532 { 1533 _asm 1534 { 1535 mov esi, sptr 1536 mov edi, dp 1537 mov ecx, width_mmx 1538 sub esi, 2 1539 sub edi, 14 1540loop2_pass2: 1541 movd mm0, [esi] ; X X X X v1 v0 v3 v2 1542 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 1543 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 1544 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2 1545 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0 1546 movq [edi], mm0 1547 sub esi, 4 1548 movq [edi + 8], mm1 1549 //sub esi, 4 1550 sub edi, 16 1551 sub ecx, 2 1552 jnz loop2_pass2 1553 EMMS 1554 } 1555 } 1556 1557 sptr -= (width_mmx*2 - 2); // sign fixed 1558 dp -= (width_mmx*8 - 2); // sign fixed 1559 for (i = width; i; i--) 1560 { 1561 png_byte v[8]; 1562 int j; 1563 sptr -= 2; 1564 png_memcpy(v, sptr, 2); 1565 for (j = 0; j < png_pass_inc[pass]; j++) 1566 { 1567 dp -= 2; 1568 png_memcpy(dp, v, 2); 1569 } 1570 } 1571 } 1572 else if (width) // pass == 4 or 5 1573 { 1574 int width_mmx = ((width >> 1) << 1) ; 1575 width -= width_mmx; 1576 if (width_mmx) 1577 { 1578 _asm 1579 { 1580 mov esi, sptr 1581 mov edi, dp 1582 mov ecx, width_mmx 1583 sub esi, 2 1584 sub edi, 6 1585loop2_pass4: 1586 movd mm0, [esi] ; X X X X v1 v0 v3 v2 1587 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2 1588 sub esi, 4 1589 movq [edi], mm0 1590 sub edi, 8 1591 sub ecx, 2 1592 jnz 
loop2_pass4 1593 EMMS 1594 } 1595 } 1596 1597 sptr -= (width_mmx*2 - 2); // sign fixed 1598 dp -= (width_mmx*4 - 2); // sign fixed 1599 for (i = width; i; i--) 1600 { 1601 png_byte v[8]; 1602 int j; 1603 sptr -= 2; 1604 png_memcpy(v, sptr, 2); 1605 for (j = 0; j < png_pass_inc[pass]; j++) 1606 { 1607 dp -= 2; 1608 png_memcpy(dp, v, 2); 1609 } 1610 } 1611 } 1612 } /* end of pixel_bytes == 2 */ 1613 1614 else if (pixel_bytes == 4) 1615 { 1616 if (((pass == 0) || (pass == 1)) && width) 1617 { 1618 int width_mmx = ((width >> 1) << 1) ; 1619 width -= width_mmx; 1620 if (width_mmx) 1621 { 1622 _asm 1623 { 1624 mov esi, sptr 1625 mov edi, dp 1626 mov ecx, width_mmx 1627 sub esi, 4 1628 sub edi, 60 1629loop4_pass0: 1630 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 1631 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 1632 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 1633 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 1634 movq [edi], mm0 1635 movq [edi + 8], mm0 1636 movq [edi + 16], mm0 1637 movq [edi + 24], mm0 1638 movq [edi+32], mm1 1639 movq [edi + 40], mm1 1640 movq [edi+ 48], mm1 1641 sub esi, 8 1642 movq [edi + 56], mm1 1643 sub edi, 64 1644 sub ecx, 2 1645 jnz loop4_pass0 1646 EMMS 1647 } 1648 } 1649 1650 sptr -= (width_mmx*4 - 4); // sign fixed 1651 dp -= (width_mmx*32 - 4); // sign fixed 1652 for (i = width; i; i--) 1653 { 1654 png_byte v[8]; 1655 int j; 1656 sptr -= 4; 1657 png_memcpy(v, sptr, 4); 1658 for (j = 0; j < png_pass_inc[pass]; j++) 1659 { 1660 dp -= 4; 1661 png_memcpy(dp, v, 4); 1662 } 1663 } 1664 } 1665 else if (((pass == 2) || (pass == 3)) && width) 1666 { 1667 int width_mmx = ((width >> 1) << 1) ; 1668 width -= width_mmx; 1669 if (width_mmx) 1670 { 1671 _asm 1672 { 1673 mov esi, sptr 1674 mov edi, dp 1675 mov ecx, width_mmx 1676 sub esi, 4 1677 sub edi, 28 1678loop4_pass2: 1679 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 1680 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 1681 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 1682 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 
1683 movq [edi], mm0 1684 movq [edi + 8], mm0 1685 movq [edi+16], mm1 1686 movq [edi + 24], mm1 1687 sub esi, 8 1688 sub edi, 32 1689 sub ecx, 2 1690 jnz loop4_pass2 1691 EMMS 1692 } 1693 } 1694 1695 sptr -= (width_mmx*4 - 4); // sign fixed 1696 dp -= (width_mmx*16 - 4); // sign fixed 1697 for (i = width; i; i--) 1698 { 1699 png_byte v[8]; 1700 int j; 1701 sptr -= 4; 1702 png_memcpy(v, sptr, 4); 1703 for (j = 0; j < png_pass_inc[pass]; j++) 1704 { 1705 dp -= 4; 1706 png_memcpy(dp, v, 4); 1707 } 1708 } 1709 } 1710 else if (width) // pass == 4 or 5 1711 { 1712 int width_mmx = ((width >> 1) << 1) ; 1713 width -= width_mmx; 1714 if (width_mmx) 1715 { 1716 _asm 1717 { 1718 mov esi, sptr 1719 mov edi, dp 1720 mov ecx, width_mmx 1721 sub esi, 4 1722 sub edi, 12 1723loop4_pass4: 1724 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4 1725 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4 1726 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4 1727 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0 1728 movq [edi], mm0 1729 sub esi, 8 1730 movq [edi + 8], mm1 1731 sub edi, 16 1732 sub ecx, 2 1733 jnz loop4_pass4 1734 EMMS 1735 } 1736 } 1737 1738 sptr -= (width_mmx*4 - 4); // sign fixed 1739 dp -= (width_mmx*8 - 4); // sign fixed 1740 for (i = width; i; i--) 1741 { 1742 png_byte v[8]; 1743 int j; 1744 sptr -= 4; 1745 png_memcpy(v, sptr, 4); 1746 for (j = 0; j < png_pass_inc[pass]; j++) 1747 { 1748 dp -= 4; 1749 png_memcpy(dp, v, 4); 1750 } 1751 } 1752 } 1753 1754 } /* end of pixel_bytes == 4 */ 1755 1756 else if (pixel_bytes == 6) 1757 { 1758 for (i = width; i; i--) 1759 { 1760 png_byte v[8]; 1761 int j; 1762 png_memcpy(v, sptr, 6); 1763 for (j = 0; j < png_pass_inc[pass]; j++) 1764 { 1765 png_memcpy(dp, v, 6); 1766 dp -= 6; 1767 } 1768 sptr -= 6; 1769 } 1770 } /* end of pixel_bytes == 6 */ 1771 1772 else 1773 { 1774 for (i = width; i; i--) 1775 { 1776 png_byte v[8]; 1777 int j; 1778 png_memcpy(v, sptr, pixel_bytes); 1779 for (j = 0; j < png_pass_inc[pass]; j++) 1780 { 1781 png_memcpy(dp, v, pixel_bytes); 
1782 dp -= pixel_bytes; 1783 } 1784 sptr-= pixel_bytes; 1785 } 1786 } 1787 } /* end of mmx_supported */ 1788 1789 else /* MMX not supported: use modified C code - takes advantage 1790 * of inlining of memcpy for a constant */ 1791 { 1792 if (pixel_bytes == 1) 1793 { 1794 for (i = width; i; i--) 1795 { 1796 int j; 1797 for (j = 0; j < png_pass_inc[pass]; j++) 1798 *dp-- = *sptr; 1799 sptr--; 1800 } 1801 } 1802 else if (pixel_bytes == 3) 1803 { 1804 for (i = width; i; i--) 1805 { 1806 png_byte v[8]; 1807 int j; 1808 png_memcpy(v, sptr, pixel_bytes); 1809 for (j = 0; j < png_pass_inc[pass]; j++) 1810 { 1811 png_memcpy(dp, v, pixel_bytes); 1812 dp -= pixel_bytes; 1813 } 1814 sptr -= pixel_bytes; 1815 } 1816 } 1817 else if (pixel_bytes == 2) 1818 { 1819 for (i = width; i; i--) 1820 { 1821 png_byte v[8]; 1822 int j; 1823 png_memcpy(v, sptr, pixel_bytes); 1824 for (j = 0; j < png_pass_inc[pass]; j++) 1825 { 1826 png_memcpy(dp, v, pixel_bytes); 1827 dp -= pixel_bytes; 1828 } 1829 sptr -= pixel_bytes; 1830 } 1831 } 1832 else if (pixel_bytes == 4) 1833 { 1834 for (i = width; i; i--) 1835 { 1836 png_byte v[8]; 1837 int j; 1838 png_memcpy(v, sptr, pixel_bytes); 1839 for (j = 0; j < png_pass_inc[pass]; j++) 1840 { 1841 png_memcpy(dp, v, pixel_bytes); 1842 dp -= pixel_bytes; 1843 } 1844 sptr -= pixel_bytes; 1845 } 1846 } 1847 else if (pixel_bytes == 6) 1848 { 1849 for (i = width; i; i--) 1850 { 1851 png_byte v[8]; 1852 int j; 1853 png_memcpy(v, sptr, pixel_bytes); 1854 for (j = 0; j < png_pass_inc[pass]; j++) 1855 { 1856 png_memcpy(dp, v, pixel_bytes); 1857 dp -= pixel_bytes; 1858 } 1859 sptr -= pixel_bytes; 1860 } 1861 } 1862 else 1863 { 1864 for (i = width; i; i--) 1865 { 1866 png_byte v[8]; 1867 int j; 1868 png_memcpy(v, sptr, pixel_bytes); 1869 for (j = 0; j < png_pass_inc[pass]; j++) 1870 { 1871 png_memcpy(dp, v, pixel_bytes); 1872 dp -= pixel_bytes; 1873 } 1874 sptr -= pixel_bytes; 1875 } 1876 } 1877 1878 } /* end of MMX not supported */ 1879 break; 1880 } 1881 } /* end 
switch (row_info->pixel_depth) */ 1882 1883 row_info->width = final_width; 1884 row_info->rowbytes = ((final_width * 1885 (png_uint_32)row_info->pixel_depth + 7) >> 3); 1886 } 1887 1888#ifdef DISABLE_PNGVCRD_INTERLACE 1889 mmx_supported = save_mmx_supported; 1890#endif 1891} 1892 1893#endif /* PNG_READ_INTERLACING_SUPPORTED */ 1894 1895 1896// These variables are utilized in the functions below. They are declared 1897// globally here to ensure alignment on 8-byte boundaries. 1898 1899union uAll { 1900 __int64 use; 1901 double align; 1902} LBCarryMask = {0x0101010101010101}, 1903 HBClearMask = {0x7f7f7f7f7f7f7f7f}, 1904 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem; 1905 1906 1907// Optimized code for PNG Average filter decoder 1908void /* PRIVATE */ 1909png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row 1910 , png_bytep prev_row) 1911{ 1912 int bpp; 1913 png_uint_32 FullLength; 1914 png_uint_32 MMXLength; 1915 //png_uint_32 len; 1916 int diff; 1917 1918 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel 1919 FullLength = row_info->rowbytes; // # of bytes to filter 1920 _asm { 1921 // Init address pointers and offset 1922 mov edi, row // edi ==> Avg(x) 1923 xor ebx, ebx // ebx ==> x 1924 mov edx, edi 1925 mov esi, prev_row // esi ==> Prior(x) 1926 sub edx, bpp // edx ==> Raw(x-bpp) 1927 1928 xor eax, eax 1929 // Compute the Raw value for the first bpp bytes 1930 // Raw(x) = Avg(x) + (Prior(x)/2) 1931davgrlp: 1932 mov al, [esi + ebx] // Load al with Prior(x) 1933 inc ebx 1934 shr al, 1 // divide by 2 1935 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx 1936 cmp ebx, bpp 1937 mov [edi+ebx-1], al // Write back Raw(x); 1938 // mov does not affect flags; -1 to offset inc ebx 1939 jb davgrlp 1940 // get # of bytes to alignment 1941 mov diff, edi // take start of row 1942 add diff, ebx // add bpp 1943 add diff, 0xf // add 7 + 8 to incr past alignment boundary 1944 and diff, 0xfffffff8 // mask to alignment boundary 1945 sub 
diff, edi // subtract from start ==> value ebx at alignment 1946 jz davggo 1947 // fix alignment 1948 // Compute the Raw value for the bytes upto the alignment boundary 1949 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 1950 xor ecx, ecx 1951davglp1: 1952 xor eax, eax 1953 mov cl, [esi + ebx] // load cl with Prior(x) 1954 mov al, [edx + ebx] // load al with Raw(x-bpp) 1955 add ax, cx 1956 inc ebx 1957 shr ax, 1 // divide by 2 1958 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx 1959 cmp ebx, diff // Check if at alignment boundary 1960 mov [edi+ebx-1], al // Write back Raw(x); 1961 // mov does not affect flags; -1 to offset inc ebx 1962 jb davglp1 // Repeat until at alignment boundary 1963davggo: 1964 mov eax, FullLength 1965 mov ecx, eax 1966 sub eax, ebx // subtract alignment fix 1967 and eax, 0x00000007 // calc bytes over mult of 8 1968 sub ecx, eax // drop over bytes from original length 1969 mov MMXLength, ecx 1970 } // end _asm block 1971 // Now do the math for the rest of the row 1972 switch ( bpp ) 1973 { 1974 case 3: 1975 { 1976 ActiveMask.use = 0x0000000000ffffff; 1977 ShiftBpp.use = 24; // == 3 * 8 1978 ShiftRem.use = 40; // == 64 - 24 1979 _asm { 1980 // Re-init address pointers and offset 1981 movq mm7, ActiveMask 1982 mov ebx, diff // ebx ==> x = offset to alignment boundary 1983 movq mm5, LBCarryMask 1984 mov edi, row // edi ==> Avg(x) 1985 movq mm4, HBClearMask 1986 mov esi, prev_row // esi ==> Prior(x) 1987 // PRIME the pump (load the first Raw(x-bpp) data set 1988 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes 1989 // (we correct position in loop below) 1990davg3lp: 1991 movq mm0, [edi + ebx] // Load mm0 with Avg(x) 1992 // Add (Prev_row/2) to Average 1993 movq mm3, mm5 1994 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data 1995 movq mm1, [esi + ebx] // Load mm1 with Prior(x) 1996 movq mm6, mm7 1997 pand mm3, mm1 // get lsb for each prev_row byte 1998 psrlq mm1, 1 // divide prev_row bytes by 2 1999 pand mm1, mm4 // 
clear invalid bit 7 of each byte 2000 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte 2001 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry 2002 movq mm1, mm3 // now use mm1 for getting LBCarrys 2003 pand mm1, mm2 // get LBCarrys for each byte where both 2004 // lsb's were == 1 (Only valid for active group) 2005 psrlq mm2, 1 // divide raw bytes by 2 2006 pand mm2, mm4 // clear invalid bit 7 of each byte 2007 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2008 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg 2009 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active 2010 // byte 2011 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry 2012 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5 2013 movq mm2, mm0 // mov updated Raws to mm2 2014 psllq mm2, ShiftBpp // shift data to position correctly 2015 movq mm1, mm3 // now use mm1 for getting LBCarrys 2016 pand mm1, mm2 // get LBCarrys for each byte where both 2017 // lsb's were == 1 (Only valid for active group) 2018 psrlq mm2, 1 // divide raw bytes by 2 2019 pand mm2, mm4 // clear invalid bit 7 of each byte 2020 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2021 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2022 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active 2023 // byte 2024 2025 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry 2026 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two 2027 // bytes 2028 movq mm2, mm0 // mov updated Raws to mm2 2029 psllq mm2, ShiftBpp // shift data to position correctly 2030 // Data only needs to be shifted once here to 2031 // get the correct x-bpp offset. 
2032 movq mm1, mm3 // now use mm1 for getting LBCarrys 2033 pand mm1, mm2 // get LBCarrys for each byte where both 2034 // lsb's were == 1 (Only valid for active group) 2035 psrlq mm2, 1 // divide raw bytes by 2 2036 pand mm2, mm4 // clear invalid bit 7 of each byte 2037 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2038 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2039 add ebx, 8 2040 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active 2041 // byte 2042 2043 // Now ready to write back to memory 2044 movq [edi + ebx - 8], mm0 2045 // Move updated Raw(x) to use as Raw(x-bpp) for next loop 2046 cmp ebx, MMXLength 2047 movq mm2, mm0 // mov updated Raw(x) to mm2 2048 jb davg3lp 2049 } // end _asm block 2050 } 2051 break; 2052 2053 case 6: 2054 case 4: 2055 case 7: 2056 case 5: 2057 { 2058 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear 2059 // appropriate inactive bytes 2060 ShiftBpp.use = bpp << 3; 2061 ShiftRem.use = 64 - ShiftBpp.use; 2062 _asm { 2063 movq mm4, HBClearMask 2064 // Re-init address pointers and offset 2065 mov ebx, diff // ebx ==> x = offset to alignment boundary 2066 // Load ActiveMask and clear all bytes except for 1st active group 2067 movq mm7, ActiveMask 2068 mov edi, row // edi ==> Avg(x) 2069 psrlq mm7, ShiftRem 2070 mov esi, prev_row // esi ==> Prior(x) 2071 movq mm6, mm7 2072 movq mm5, LBCarryMask 2073 psllq mm6, ShiftBpp // Create mask for 2nd active group 2074 // PRIME the pump (load the first Raw(x-bpp) data set 2075 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes 2076 // (we correct position in loop below) 2077davg4lp: 2078 movq mm0, [edi + ebx] 2079 psrlq mm2, ShiftRem // shift data to position correctly 2080 movq mm1, [esi + ebx] 2081 // Add (Prev_row/2) to Average 2082 movq mm3, mm5 2083 pand mm3, mm1 // get lsb for each prev_row byte 2084 psrlq mm1, 1 // divide prev_row bytes by 2 2085 pand mm1, mm4 // clear invalid bit 7 of each byte 2086 paddb mm0, mm1 // add 
(Prev_row/2) to Avg for each byte 2087 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry 2088 movq mm1, mm3 // now use mm1 for getting LBCarrys 2089 pand mm1, mm2 // get LBCarrys for each byte where both 2090 // lsb's were == 1 (Only valid for active group) 2091 psrlq mm2, 1 // divide raw bytes by 2 2092 pand mm2, mm4 // clear invalid bit 7 of each byte 2093 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2094 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg 2095 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active 2096 // byte 2097 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry 2098 movq mm2, mm0 // mov updated Raws to mm2 2099 psllq mm2, ShiftBpp // shift data to position correctly 2100 add ebx, 8 2101 movq mm1, mm3 // now use mm1 for getting LBCarrys 2102 pand mm1, mm2 // get LBCarrys for each byte where both 2103 // lsb's were == 1 (Only valid for active group) 2104 psrlq mm2, 1 // divide raw bytes by 2 2105 pand mm2, mm4 // clear invalid bit 7 of each byte 2106 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2107 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2108 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active 2109 // byte 2110 cmp ebx, MMXLength 2111 // Now ready to write back to memory 2112 movq [edi + ebx - 8], mm0 2113 // Prep Raw(x-bpp) for next loop 2114 movq mm2, mm0 // mov updated Raws to mm2 2115 jb davg4lp 2116 } // end _asm block 2117 } 2118 break; 2119 case 2: 2120 { 2121 ActiveMask.use = 0x000000000000ffff; 2122 ShiftBpp.use = 24; // == 3 * 8 2123 ShiftRem.use = 40; // == 64 - 24 2124 _asm { 2125 // Load ActiveMask 2126 movq mm7, ActiveMask 2127 // Re-init address pointers and offset 2128 mov ebx, diff // ebx ==> x = offset to alignment boundary 2129 movq mm5, LBCarryMask 2130 mov edi, row // edi ==> Avg(x) 2131 movq mm4, HBClearMask 2132 mov esi, prev_row // esi ==> Prior(x) 2133 // PRIME the pump (load the first Raw(x-bpp) data 
set 2134 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes 2135 // (we correct position in loop below) 2136davg2lp: 2137 movq mm0, [edi + ebx] 2138 psllq mm2, ShiftRem // shift data to position correctly 2139 movq mm1, [esi + ebx] 2140 // Add (Prev_row/2) to Average 2141 movq mm3, mm5 2142 pand mm3, mm1 // get lsb for each prev_row byte 2143 psrlq mm1, 1 // divide prev_row bytes by 2 2144 pand mm1, mm4 // clear invalid bit 7 of each byte 2145 movq mm6, mm7 2146 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte 2147 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry 2148 movq mm1, mm3 // now use mm1 for getting LBCarrys 2149 pand mm1, mm2 // get LBCarrys for each byte where both 2150 // lsb's were == 1 (Only valid for active group) 2151 psrlq mm2, 1 // divide raw bytes by 2 2152 pand mm2, mm4 // clear invalid bit 7 of each byte 2153 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2154 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg 2155 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte 2156 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry 2157 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3 2158 movq mm2, mm0 // mov updated Raws to mm2 2159 psllq mm2, ShiftBpp // shift data to position correctly 2160 movq mm1, mm3 // now use mm1 for getting LBCarrys 2161 pand mm1, mm2 // get LBCarrys for each byte where both 2162 // lsb's were == 1 (Only valid for active group) 2163 psrlq mm2, 1 // divide raw bytes by 2 2164 pand mm2, mm4 // clear invalid bit 7 of each byte 2165 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2166 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2167 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte 2168 2169 // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry 2170 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5 2171 movq mm2, mm0 // mov updated Raws to mm2 2172 psllq mm2, 
ShiftBpp // shift data to position correctly 2173 // Data only needs to be shifted once here to 2174 // get the correct x-bpp offset. 2175 movq mm1, mm3 // now use mm1 for getting LBCarrys 2176 pand mm1, mm2 // get LBCarrys for each byte where both 2177 // lsb's were == 1 (Only valid for active group) 2178 psrlq mm2, 1 // divide raw bytes by 2 2179 pand mm2, mm4 // clear invalid bit 7 of each byte 2180 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2181 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2182 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte 2183 2184 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry 2185 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7 2186 movq mm2, mm0 // mov updated Raws to mm2 2187 psllq mm2, ShiftBpp // shift data to position correctly 2188 // Data only needs to be shifted once here to 2189 // get the correct x-bpp offset. 2190 add ebx, 8 2191 movq mm1, mm3 // now use mm1 for getting LBCarrys 2192 pand mm1, mm2 // get LBCarrys for each byte where both 2193 // lsb's were == 1 (Only valid for active group) 2194 psrlq mm2, 1 // divide raw bytes by 2 2195 pand mm2, mm4 // clear invalid bit 7 of each byte 2196 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte 2197 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg 2198 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte 2199 2200 cmp ebx, MMXLength 2201 // Now ready to write back to memory 2202 movq [edi + ebx - 8], mm0 2203 // Prep Raw(x-bpp) for next loop 2204 movq mm2, mm0 // mov updated Raws to mm2 2205 jb davg2lp 2206 } // end _asm block 2207 } 2208 break; 2209 2210 case 1: // bpp == 1 2211 { 2212 _asm { 2213 // Re-init address pointers and offset 2214 mov ebx, diff // ebx ==> x = offset to alignment boundary 2215 mov edi, row // edi ==> Avg(x) 2216 cmp ebx, FullLength // Test if offset at end of array 2217 jnb davg1end 2218 // Do Paeth decode for remaining bytes 
2219 mov esi, prev_row // esi ==> Prior(x) 2220 mov edx, edi 2221 xor ecx, ecx // zero ecx before using cl & cx in loop below 2222 sub edx, bpp // edx ==> Raw(x-bpp) 2223davg1lp: 2224 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 2225 xor eax, eax 2226 mov cl, [esi + ebx] // load cl with Prior(x) 2227 mov al, [edx + ebx] // load al with Raw(x-bpp) 2228 add ax, cx 2229 inc ebx 2230 shr ax, 1 // divide by 2 2231 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx 2232 cmp ebx, FullLength // Check if at end of array 2233 mov [edi+ebx-1], al // Write back Raw(x); 2234 // mov does not affect flags; -1 to offset inc ebx 2235 jb davg1lp 2236davg1end: 2237 } // end _asm block 2238 } 2239 return; 2240 2241 case 8: // bpp == 8 2242 { 2243 _asm { 2244 // Re-init address pointers and offset 2245 mov ebx, diff // ebx ==> x = offset to alignment boundary 2246 movq mm5, LBCarryMask 2247 mov edi, row // edi ==> Avg(x) 2248 movq mm4, HBClearMask 2249 mov esi, prev_row // esi ==> Prior(x) 2250 // PRIME the pump (load the first Raw(x-bpp) data set 2251 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes 2252 // (NO NEED to correct position in loop below) 2253davg8lp: 2254 movq mm0, [edi + ebx] 2255 movq mm3, mm5 2256 movq mm1, [esi + ebx] 2257 add ebx, 8 2258 pand mm3, mm1 // get lsb for each prev_row byte 2259 psrlq mm1, 1 // divide prev_row bytes by 2 2260 pand mm3, mm2 // get LBCarrys for each byte where both 2261 // lsb's were == 1 2262 psrlq mm2, 1 // divide raw bytes by 2 2263 pand mm1, mm4 // clear invalid bit 7 of each byte 2264 paddb mm0, mm3 // add LBCarrys to Avg for each byte 2265 pand mm2, mm4 // clear invalid bit 7 of each byte 2266 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte 2267 paddb mm0, mm2 // add (Raw/2) to Avg for each byte 2268 cmp ebx, MMXLength 2269 movq [edi + ebx - 8], mm0 2270 movq mm2, mm0 // reuse as Raw(x-bpp) 2271 jb davg8lp 2272 } // end _asm block 2273 } 2274 break; 2275 default: // bpp greater than 8 2276 { 2277 _asm { 2278 
movq mm5, LBCarryMask 2279 // Re-init address pointers and offset 2280 mov ebx, diff // ebx ==> x = offset to alignment boundary 2281 mov edi, row // edi ==> Avg(x) 2282 movq mm4, HBClearMask 2283 mov edx, edi 2284 mov esi, prev_row // esi ==> Prior(x) 2285 sub edx, bpp // edx ==> Raw(x-bpp) 2286davgAlp: 2287 movq mm0, [edi + ebx] 2288 movq mm3, mm5 2289 movq mm1, [esi + ebx] 2290 pand mm3, mm1 // get lsb for each prev_row byte 2291 movq mm2, [edx + ebx] 2292 psrlq mm1, 1 // divide prev_row bytes by 2 2293 pand mm3, mm2 // get LBCarrys for each byte where both 2294 // lsb's were == 1 2295 psrlq mm2, 1 // divide raw bytes by 2 2296 pand mm1, mm4 // clear invalid bit 7 of each byte 2297 paddb mm0, mm3 // add LBCarrys to Avg for each byte 2298 pand mm2, mm4 // clear invalid bit 7 of each byte 2299 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte 2300 add ebx, 8 2301 paddb mm0, mm2 // add (Raw/2) to Avg for each byte 2302 cmp ebx, MMXLength 2303 movq [edi + ebx - 8], mm0 2304 jb davgAlp 2305 } // end _asm block 2306 } 2307 break; 2308 } // end switch ( bpp ) 2309 2310 _asm { 2311 // MMX acceleration complete now do clean-up 2312 // Check if any remaining bytes left to decode 2313 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX 2314 mov edi, row // edi ==> Avg(x) 2315 cmp ebx, FullLength // Test if offset at end of array 2316 jnb davgend 2317 // Do Paeth decode for remaining bytes 2318 mov esi, prev_row // esi ==> Prior(x) 2319 mov edx, edi 2320 xor ecx, ecx // zero ecx before using cl & cx in loop below 2321 sub edx, bpp // edx ==> Raw(x-bpp) 2322davglp2: 2323 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) 2324 xor eax, eax 2325 mov cl, [esi + ebx] // load cl with Prior(x) 2326 mov al, [edx + ebx] // load al with Raw(x-bpp) 2327 add ax, cx 2328 inc ebx 2329 shr ax, 1 // divide by 2 2330 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx 2331 cmp ebx, FullLength // Check if at end of array 2332 mov [edi+ebx-1], al // Write back Raw(x); 
2333 // mov does not affect flags; -1 to offset inc ebx 2334 jb davglp2 2335davgend: 2336 emms // End MMX instructions; prep for possible FP instrs. 2337 } // end _asm block 2338} 2339 2340// Optimized code for PNG Paeth filter decoder 2341void /* PRIVATE */ 2342png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row, 2343 png_bytep prev_row) 2344{ 2345 png_uint_32 FullLength; 2346 png_uint_32 MMXLength; 2347 //png_uint_32 len; 2348 int bpp; 2349 int diff; 2350 //int ptemp; 2351 int patemp, pbtemp, pctemp; 2352 2353 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel 2354 FullLength = row_info->rowbytes; // # of bytes to filter 2355 _asm 2356 { 2357 xor ebx, ebx // ebx ==> x offset 2358 mov edi, row 2359 xor edx, edx // edx ==> x-bpp offset 2360 mov esi, prev_row 2361 xor eax, eax 2362 2363 // Compute the Raw value for the first bpp bytes 2364 // Note: the formula works out to be always 2365 // Paeth(x) = Raw(x) + Prior(x) where x < bpp 2366dpthrlp: 2367 mov al, [edi + ebx] 2368 add al, [esi + ebx] 2369 inc ebx 2370 cmp ebx, bpp 2371 mov [edi + ebx - 1], al 2372 jb dpthrlp 2373 // get # of bytes to alignment 2374 mov diff, edi // take start of row 2375 add diff, ebx // add bpp 2376 xor ecx, ecx 2377 add diff, 0xf // add 7 + 8 to incr past alignment boundary 2378 and diff, 0xfffffff8 // mask to alignment boundary 2379 sub diff, edi // subtract from start ==> value ebx at alignment 2380 jz dpthgo 2381 // fix alignment 2382dpthlp1: 2383 xor eax, eax 2384 // pav = p - a = (a + b - c) - a = b - c 2385 mov al, [esi + ebx] // load Prior(x) into al 2386 mov cl, [esi + edx] // load Prior(x-bpp) into cl 2387 sub eax, ecx // subtract Prior(x-bpp) 2388 mov patemp, eax // Save pav for later use 2389 xor eax, eax 2390 // pbv = p - b = (a + b - c) - b = a - c 2391 mov al, [edi + edx] // load Raw(x-bpp) into al 2392 sub eax, ecx // subtract Prior(x-bpp) 2393 mov ecx, eax 2394 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2395 add eax, 
patemp // pcv = pav + pbv 2396 // pc = abs(pcv) 2397 test eax, 0x80000000 2398 jz dpthpca 2399 neg eax // reverse sign of neg values 2400dpthpca: 2401 mov pctemp, eax // save pc for later use 2402 // pb = abs(pbv) 2403 test ecx, 0x80000000 2404 jz dpthpba 2405 neg ecx // reverse sign of neg values 2406dpthpba: 2407 mov pbtemp, ecx // save pb for later use 2408 // pa = abs(pav) 2409 mov eax, patemp 2410 test eax, 0x80000000 2411 jz dpthpaa 2412 neg eax // reverse sign of neg values 2413dpthpaa: 2414 mov patemp, eax // save pa for later use 2415 // test if pa <= pb 2416 cmp eax, ecx 2417 jna dpthabb 2418 // pa > pb; now test if pb <= pc 2419 cmp ecx, pctemp 2420 jna dpthbbc 2421 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 2422 mov cl, [esi + edx] // load Prior(x-bpp) into cl 2423 jmp dpthpaeth 2424dpthbbc: 2425 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 2426 mov cl, [esi + ebx] // load Prior(x) into cl 2427 jmp dpthpaeth 2428dpthabb: 2429 // pa <= pb; now test if pa <= pc 2430 cmp eax, pctemp 2431 jna dpthabc 2432 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 2433 mov cl, [esi + edx] // load Prior(x-bpp) into cl 2434 jmp dpthpaeth 2435dpthabc: 2436 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 2437 mov cl, [edi + edx] // load Raw(x-bpp) into cl 2438dpthpaeth: 2439 inc ebx 2440 inc edx 2441 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 2442 add [edi + ebx - 1], cl 2443 cmp ebx, diff 2444 jb dpthlp1 2445dpthgo: 2446 mov ecx, FullLength 2447 mov eax, ecx 2448 sub eax, ebx // subtract alignment fix 2449 and eax, 0x00000007 // calc bytes over mult of 8 2450 sub ecx, eax // drop over bytes from original length 2451 mov MMXLength, ecx 2452 } // end _asm block 2453 // Now do the math for the rest of the row 2454 switch ( bpp ) 2455 { 2456 case 3: 2457 { 2458 ActiveMask.use = 0x0000000000ffffff; 2459 ActiveMaskEnd.use = 0xffff000000000000; 2460 ShiftBpp.use = 24; // == bpp(3) * 8 2461 ShiftRem.use = 40; // == 64 - 24 2462 _asm 2463 { 2464 mov ebx, diff 2465 mov 
edi, row 2466 mov esi, prev_row 2467 pxor mm0, mm0 2468 // PRIME the pump (load the first Raw(x-bpp) data set 2469 movq mm1, [edi+ebx-8] 2470dpth3lp: 2471 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes 2472 movq mm2, [esi + ebx] // load b=Prior(x) 2473 punpcklbw mm1, mm0 // Unpack High bytes of a 2474 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes 2475 punpcklbw mm2, mm0 // Unpack High bytes of b 2476 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes 2477 // pav = p - a = (a + b - c) - a = b - c 2478 movq mm4, mm2 2479 punpcklbw mm3, mm0 // Unpack High bytes of c 2480 // pbv = p - b = (a + b - c) - b = a - c 2481 movq mm5, mm1 2482 psubw mm4, mm3 2483 pxor mm7, mm7 2484 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2485 movq mm6, mm4 2486 psubw mm5, mm3 2487 2488 // pa = abs(p-a) = abs(pav) 2489 // pb = abs(p-b) = abs(pbv) 2490 // pc = abs(p-c) = abs(pcv) 2491 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2492 paddw mm6, mm5 2493 pand mm0, mm4 // Only pav bytes < 0 in mm7 2494 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2495 psubw mm4, mm0 2496 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2497 psubw mm4, mm0 2498 psubw mm5, mm7 2499 pxor mm0, mm0 2500 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2501 pand mm0, mm6 // Only pav bytes < 0 in mm7 2502 psubw mm5, mm7 2503 psubw mm6, mm0 2504 // test pa <= pb 2505 movq mm7, mm4 2506 psubw mm6, mm0 2507 pcmpgtw mm7, mm5 // pa > pb? 2508 movq mm0, mm7 2509 // use mm7 mask to merge pa & pb 2510 pand mm5, mm7 2511 // use mm0 mask copy to merge a & b 2512 pand mm2, mm0 2513 pandn mm7, mm4 2514 pandn mm0, mm1 2515 paddw mm7, mm5 2516 paddw mm0, mm2 2517 // test ((pa <= pb)? pa:pb) <= pc 2518 pcmpgtw mm7, mm6 // pab > pc? 
2519 pxor mm1, mm1 2520 pand mm3, mm7 2521 pandn mm7, mm0 2522 paddw mm7, mm3 2523 pxor mm0, mm0 2524 packuswb mm7, mm1 2525 movq mm3, [esi + ebx] // load c=Prior(x-bpp) 2526 pand mm7, ActiveMask 2527 movq mm2, mm3 // load b=Prior(x) step 1 2528 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) 2529 punpcklbw mm3, mm0 // Unpack High bytes of c 2530 movq [edi + ebx], mm7 // write back updated value 2531 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp) 2532 // Now do Paeth for 2nd set of bytes (3-5) 2533 psrlq mm2, ShiftBpp // load b=Prior(x) step 2 2534 punpcklbw mm1, mm0 // Unpack High bytes of a 2535 pxor mm7, mm7 2536 punpcklbw mm2, mm0 // Unpack High bytes of b 2537 // pbv = p - b = (a + b - c) - b = a - c 2538 movq mm5, mm1 2539 // pav = p - a = (a + b - c) - a = b - c 2540 movq mm4, mm2 2541 psubw mm5, mm3 2542 psubw mm4, mm3 2543 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = 2544 // pav + pbv = pbv + pav 2545 movq mm6, mm5 2546 paddw mm6, mm4 2547 2548 // pa = abs(p-a) = abs(pav) 2549 // pb = abs(p-b) = abs(pbv) 2550 // pc = abs(p-c) = abs(pcv) 2551 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0 2552 pcmpgtw mm7, mm4 // Create mask pav bytes < 0 2553 pand mm0, mm5 // Only pbv bytes < 0 in mm0 2554 pand mm7, mm4 // Only pav bytes < 0 in mm7 2555 psubw mm5, mm0 2556 psubw mm4, mm7 2557 psubw mm5, mm0 2558 psubw mm4, mm7 2559 pxor mm0, mm0 2560 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2561 pand mm0, mm6 // Only pav bytes < 0 in mm7 2562 psubw mm6, mm0 2563 // test pa <= pb 2564 movq mm7, mm4 2565 psubw mm6, mm0 2566 pcmpgtw mm7, mm5 // pa > pb? 2567 movq mm0, mm7 2568 // use mm7 mask to merge pa & pb 2569 pand mm5, mm7 2570 // use mm0 mask copy to merge a & b 2571 pand mm2, mm0 2572 pandn mm7, mm4 2573 pandn mm0, mm1 2574 paddw mm7, mm5 2575 paddw mm0, mm2 2576 // test ((pa <= pb)? pa:pb) <= pc 2577 pcmpgtw mm7, mm6 // pab > pc? 
2578 movq mm2, [esi + ebx] // load b=Prior(x) 2579 pand mm3, mm7 2580 pandn mm7, mm0 2581 pxor mm1, mm1 2582 paddw mm7, mm3 2583 pxor mm0, mm0 2584 packuswb mm7, mm1 2585 movq mm3, mm2 // load c=Prior(x-bpp) step 1 2586 pand mm7, ActiveMask 2587 punpckhbw mm2, mm0 // Unpack High bytes of b 2588 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes 2589 // pav = p - a = (a + b - c) - a = b - c 2590 movq mm4, mm2 2591 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) 2592 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2 2593 movq [edi + ebx], mm7 // write back updated value 2594 movq mm1, mm7 2595 punpckhbw mm3, mm0 // Unpack High bytes of c 2596 psllq mm1, ShiftBpp // Shift bytes 2597 // Now mm1 will be used as Raw(x-bpp) 2598 // Now do Paeth for 3rd, and final, set of bytes (6-7) 2599 pxor mm7, mm7 2600 punpckhbw mm1, mm0 // Unpack High bytes of a 2601 psubw mm4, mm3 2602 // pbv = p - b = (a + b - c) - b = a - c 2603 movq mm5, mm1 2604 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2605 movq mm6, mm4 2606 psubw mm5, mm3 2607 pxor mm0, mm0 2608 paddw mm6, mm5 2609 2610 // pa = abs(p-a) = abs(pav) 2611 // pb = abs(p-b) = abs(pbv) 2612 // pc = abs(p-c) = abs(pcv) 2613 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2614 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2615 pand mm0, mm4 // Only pav bytes < 0 in mm7 2616 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2617 psubw mm4, mm0 2618 psubw mm5, mm7 2619 psubw mm4, mm0 2620 psubw mm5, mm7 2621 pxor mm0, mm0 2622 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2623 pand mm0, mm6 // Only pav bytes < 0 in mm7 2624 psubw mm6, mm0 2625 // test pa <= pb 2626 movq mm7, mm4 2627 psubw mm6, mm0 2628 pcmpgtw mm7, mm5 // pa > pb? 2629 movq mm0, mm7 2630 // use mm0 mask copy to merge a & b 2631 pand mm2, mm0 2632 // use mm7 mask to merge pa & pb 2633 pand mm5, mm7 2634 pandn mm0, mm1 2635 pandn mm7, mm4 2636 paddw mm0, mm2 2637 paddw mm7, mm5 2638 // test ((pa <= pb)? 
pa:pb) <= pc 2639 pcmpgtw mm7, mm6 // pab > pc? 2640 pand mm3, mm7 2641 pandn mm7, mm0 2642 paddw mm7, mm3 2643 pxor mm1, mm1 2644 packuswb mm1, mm7 2645 // Step ebx to next set of 8 bytes and repeat loop til done 2646 add ebx, 8 2647 pand mm1, ActiveMaskEnd 2648 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) 2649 2650 cmp ebx, MMXLength 2651 pxor mm0, mm0 // pxor does not affect flags 2652 movq [edi + ebx - 8], mm1 // write back updated value 2653 // mm1 will be used as Raw(x-bpp) next loop 2654 // mm3 ready to be used as Prior(x-bpp) next loop 2655 jb dpth3lp 2656 } // end _asm block 2657 } 2658 break; 2659 2660 case 6: 2661 case 7: 2662 case 5: 2663 { 2664 ActiveMask.use = 0x00000000ffffffff; 2665 ActiveMask2.use = 0xffffffff00000000; 2666 ShiftBpp.use = bpp << 3; // == bpp * 8 2667 ShiftRem.use = 64 - ShiftBpp.use; 2668 _asm 2669 { 2670 mov ebx, diff 2671 mov edi, row 2672 mov esi, prev_row 2673 // PRIME the pump (load the first Raw(x-bpp) data set 2674 movq mm1, [edi+ebx-8] 2675 pxor mm0, mm0 2676dpth6lp: 2677 // Must shift to position Raw(x-bpp) data 2678 psrlq mm1, ShiftRem 2679 // Do first set of 4 bytes 2680 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes 2681 punpcklbw mm1, mm0 // Unpack Low bytes of a 2682 movq mm2, [esi + ebx] // load b=Prior(x) 2683 punpcklbw mm2, mm0 // Unpack Low bytes of b 2684 // Must shift to position Prior(x-bpp) data 2685 psrlq mm3, ShiftRem 2686 // pav = p - a = (a + b - c) - a = b - c 2687 movq mm4, mm2 2688 punpcklbw mm3, mm0 // Unpack Low bytes of c 2689 // pbv = p - b = (a + b - c) - b = a - c 2690 movq mm5, mm1 2691 psubw mm4, mm3 2692 pxor mm7, mm7 2693 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2694 movq mm6, mm4 2695 psubw mm5, mm3 2696 // pa = abs(p-a) = abs(pav) 2697 // pb = abs(p-b) = abs(pbv) 2698 // pc = abs(p-c) = abs(pcv) 2699 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2700 paddw mm6, mm5 2701 pand mm0, mm4 // Only pav bytes < 0 in mm7 2702 pcmpgtw mm7, mm5 // Create mask 
pbv bytes < 0 2703 psubw mm4, mm0 2704 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2705 psubw mm4, mm0 2706 psubw mm5, mm7 2707 pxor mm0, mm0 2708 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2709 pand mm0, mm6 // Only pav bytes < 0 in mm7 2710 psubw mm5, mm7 2711 psubw mm6, mm0 2712 // test pa <= pb 2713 movq mm7, mm4 2714 psubw mm6, mm0 2715 pcmpgtw mm7, mm5 // pa > pb? 2716 movq mm0, mm7 2717 // use mm7 mask to merge pa & pb 2718 pand mm5, mm7 2719 // use mm0 mask copy to merge a & b 2720 pand mm2, mm0 2721 pandn mm7, mm4 2722 pandn mm0, mm1 2723 paddw mm7, mm5 2724 paddw mm0, mm2 2725 // test ((pa <= pb)? pa:pb) <= pc 2726 pcmpgtw mm7, mm6 // pab > pc? 2727 pxor mm1, mm1 2728 pand mm3, mm7 2729 pandn mm7, mm0 2730 paddw mm7, mm3 2731 pxor mm0, mm0 2732 packuswb mm7, mm1 2733 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp) 2734 pand mm7, ActiveMask 2735 psrlq mm3, ShiftRem 2736 movq mm2, [esi + ebx] // load b=Prior(x) step 1 2737 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) 2738 movq mm6, mm2 2739 movq [edi + ebx], mm7 // write back updated value 2740 movq mm1, [edi+ebx-8] 2741 psllq mm6, ShiftBpp 2742 movq mm5, mm7 2743 psrlq mm1, ShiftRem 2744 por mm3, mm6 2745 psllq mm5, ShiftBpp 2746 punpckhbw mm3, mm0 // Unpack High bytes of c 2747 por mm1, mm5 2748 // Do second set of 4 bytes 2749 punpckhbw mm2, mm0 // Unpack High bytes of b 2750 punpckhbw mm1, mm0 // Unpack High bytes of a 2751 // pav = p - a = (a + b - c) - a = b - c 2752 movq mm4, mm2 2753 // pbv = p - b = (a + b - c) - b = a - c 2754 movq mm5, mm1 2755 psubw mm4, mm3 2756 pxor mm7, mm7 2757 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2758 movq mm6, mm4 2759 psubw mm5, mm3 2760 // pa = abs(p-a) = abs(pav) 2761 // pb = abs(p-b) = abs(pbv) 2762 // pc = abs(p-c) = abs(pcv) 2763 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2764 paddw mm6, mm5 2765 pand mm0, mm4 // Only pav bytes < 0 in mm7 2766 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2767 psubw mm4, mm0 2768 pand 
mm7, mm5 // Only pbv bytes < 0 in mm0 2769 psubw mm4, mm0 2770 psubw mm5, mm7 2771 pxor mm0, mm0 2772 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2773 pand mm0, mm6 // Only pav bytes < 0 in mm7 2774 psubw mm5, mm7 2775 psubw mm6, mm0 2776 // test pa <= pb 2777 movq mm7, mm4 2778 psubw mm6, mm0 2779 pcmpgtw mm7, mm5 // pa > pb? 2780 movq mm0, mm7 2781 // use mm7 mask to merge pa & pb 2782 pand mm5, mm7 2783 // use mm0 mask copy to merge a & b 2784 pand mm2, mm0 2785 pandn mm7, mm4 2786 pandn mm0, mm1 2787 paddw mm7, mm5 2788 paddw mm0, mm2 2789 // test ((pa <= pb)? pa:pb) <= pc 2790 pcmpgtw mm7, mm6 // pab > pc? 2791 pxor mm1, mm1 2792 pand mm3, mm7 2793 pandn mm7, mm0 2794 pxor mm1, mm1 2795 paddw mm7, mm3 2796 pxor mm0, mm0 2797 // Step ex to next set of 8 bytes and repeat loop til done 2798 add ebx, 8 2799 packuswb mm1, mm7 2800 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) 2801 cmp ebx, MMXLength 2802 movq [edi + ebx - 8], mm1 // write back updated value 2803 // mm1 will be used as Raw(x-bpp) next loop 2804 jb dpth6lp 2805 } // end _asm block 2806 } 2807 break; 2808 2809 case 4: 2810 { 2811 ActiveMask.use = 0x00000000ffffffff; 2812 _asm { 2813 mov ebx, diff 2814 mov edi, row 2815 mov esi, prev_row 2816 pxor mm0, mm0 2817 // PRIME the pump (load the first Raw(x-bpp) data set 2818 movq mm1, [edi+ebx-8] // Only time should need to read 2819 // a=Raw(x-bpp) bytes 2820dpth4lp: 2821 // Do first set of 4 bytes 2822 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes 2823 punpckhbw mm1, mm0 // Unpack Low bytes of a 2824 movq mm2, [esi + ebx] // load b=Prior(x) 2825 punpcklbw mm2, mm0 // Unpack High bytes of b 2826 // pav = p - a = (a + b - c) - a = b - c 2827 movq mm4, mm2 2828 punpckhbw mm3, mm0 // Unpack High bytes of c 2829 // pbv = p - b = (a + b - c) - b = a - c 2830 movq mm5, mm1 2831 psubw mm4, mm3 2832 pxor mm7, mm7 2833 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2834 movq mm6, mm4 2835 psubw mm5, mm3 2836 // pa = abs(p-a) = 
abs(pav) 2837 // pb = abs(p-b) = abs(pbv) 2838 // pc = abs(p-c) = abs(pcv) 2839 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2840 paddw mm6, mm5 2841 pand mm0, mm4 // Only pav bytes < 0 in mm7 2842 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2843 psubw mm4, mm0 2844 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2845 psubw mm4, mm0 2846 psubw mm5, mm7 2847 pxor mm0, mm0 2848 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2849 pand mm0, mm6 // Only pav bytes < 0 in mm7 2850 psubw mm5, mm7 2851 psubw mm6, mm0 2852 // test pa <= pb 2853 movq mm7, mm4 2854 psubw mm6, mm0 2855 pcmpgtw mm7, mm5 // pa > pb? 2856 movq mm0, mm7 2857 // use mm7 mask to merge pa & pb 2858 pand mm5, mm7 2859 // use mm0 mask copy to merge a & b 2860 pand mm2, mm0 2861 pandn mm7, mm4 2862 pandn mm0, mm1 2863 paddw mm7, mm5 2864 paddw mm0, mm2 2865 // test ((pa <= pb)? pa:pb) <= pc 2866 pcmpgtw mm7, mm6 // pab > pc? 2867 pxor mm1, mm1 2868 pand mm3, mm7 2869 pandn mm7, mm0 2870 paddw mm7, mm3 2871 pxor mm0, mm0 2872 packuswb mm7, mm1 2873 movq mm3, [esi + ebx] // load c=Prior(x-bpp) 2874 pand mm7, ActiveMask 2875 movq mm2, mm3 // load b=Prior(x) step 1 2876 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) 2877 punpcklbw mm3, mm0 // Unpack High bytes of c 2878 movq [edi + ebx], mm7 // write back updated value 2879 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp) 2880 // Do second set of 4 bytes 2881 punpckhbw mm2, mm0 // Unpack Low bytes of b 2882 punpcklbw mm1, mm0 // Unpack Low bytes of a 2883 // pav = p - a = (a + b - c) - a = b - c 2884 movq mm4, mm2 2885 // pbv = p - b = (a + b - c) - b = a - c 2886 movq mm5, mm1 2887 psubw mm4, mm3 2888 pxor mm7, mm7 2889 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 2890 movq mm6, mm4 2891 psubw mm5, mm3 2892 // pa = abs(p-a) = abs(pav) 2893 // pb = abs(p-b) = abs(pbv) 2894 // pc = abs(p-c) = abs(pcv) 2895 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2896 paddw mm6, mm5 2897 pand mm0, mm4 // Only pav bytes < 0 in mm7 2898 
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2899 psubw mm4, mm0 2900 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2901 psubw mm4, mm0 2902 psubw mm5, mm7 2903 pxor mm0, mm0 2904 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2905 pand mm0, mm6 // Only pav bytes < 0 in mm7 2906 psubw mm5, mm7 2907 psubw mm6, mm0 2908 // test pa <= pb 2909 movq mm7, mm4 2910 psubw mm6, mm0 2911 pcmpgtw mm7, mm5 // pa > pb? 2912 movq mm0, mm7 2913 // use mm7 mask to merge pa & pb 2914 pand mm5, mm7 2915 // use mm0 mask copy to merge a & b 2916 pand mm2, mm0 2917 pandn mm7, mm4 2918 pandn mm0, mm1 2919 paddw mm7, mm5 2920 paddw mm0, mm2 2921 // test ((pa <= pb)? pa:pb) <= pc 2922 pcmpgtw mm7, mm6 // pab > pc? 2923 pxor mm1, mm1 2924 pand mm3, mm7 2925 pandn mm7, mm0 2926 pxor mm1, mm1 2927 paddw mm7, mm3 2928 pxor mm0, mm0 2929 // Step ex to next set of 8 bytes and repeat loop til done 2930 add ebx, 8 2931 packuswb mm1, mm7 2932 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) 2933 cmp ebx, MMXLength 2934 movq [edi + ebx - 8], mm1 // write back updated value 2935 // mm1 will be used as Raw(x-bpp) next loop 2936 jb dpth4lp 2937 } // end _asm block 2938 } 2939 break; 2940 case 8: // bpp == 8 2941 { 2942 ActiveMask.use = 0x00000000ffffffff; 2943 _asm { 2944 mov ebx, diff 2945 mov edi, row 2946 mov esi, prev_row 2947 pxor mm0, mm0 2948 // PRIME the pump (load the first Raw(x-bpp) data set 2949 movq mm1, [edi+ebx-8] // Only time should need to read 2950 // a=Raw(x-bpp) bytes 2951dpth8lp: 2952 // Do first set of 4 bytes 2953 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes 2954 punpcklbw mm1, mm0 // Unpack Low bytes of a 2955 movq mm2, [esi + ebx] // load b=Prior(x) 2956 punpcklbw mm2, mm0 // Unpack Low bytes of b 2957 // pav = p - a = (a + b - c) - a = b - c 2958 movq mm4, mm2 2959 punpcklbw mm3, mm0 // Unpack Low bytes of c 2960 // pbv = p - b = (a + b - c) - b = a - c 2961 movq mm5, mm1 2962 psubw mm4, mm3 2963 pxor mm7, mm7 2964 // pcv = p - c = (a + b - c) -c = (a - c) + 
(b - c) = pav + pbv 2965 movq mm6, mm4 2966 psubw mm5, mm3 2967 // pa = abs(p-a) = abs(pav) 2968 // pb = abs(p-b) = abs(pbv) 2969 // pc = abs(p-c) = abs(pcv) 2970 pcmpgtw mm0, mm4 // Create mask pav bytes < 0 2971 paddw mm6, mm5 2972 pand mm0, mm4 // Only pav bytes < 0 in mm7 2973 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 2974 psubw mm4, mm0 2975 pand mm7, mm5 // Only pbv bytes < 0 in mm0 2976 psubw mm4, mm0 2977 psubw mm5, mm7 2978 pxor mm0, mm0 2979 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 2980 pand mm0, mm6 // Only pav bytes < 0 in mm7 2981 psubw mm5, mm7 2982 psubw mm6, mm0 2983 // test pa <= pb 2984 movq mm7, mm4 2985 psubw mm6, mm0 2986 pcmpgtw mm7, mm5 // pa > pb? 2987 movq mm0, mm7 2988 // use mm7 mask to merge pa & pb 2989 pand mm5, mm7 2990 // use mm0 mask copy to merge a & b 2991 pand mm2, mm0 2992 pandn mm7, mm4 2993 pandn mm0, mm1 2994 paddw mm7, mm5 2995 paddw mm0, mm2 2996 // test ((pa <= pb)? pa:pb) <= pc 2997 pcmpgtw mm7, mm6 // pab > pc? 2998 pxor mm1, mm1 2999 pand mm3, mm7 3000 pandn mm7, mm0 3001 paddw mm7, mm3 3002 pxor mm0, mm0 3003 packuswb mm7, mm1 3004 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes 3005 pand mm7, ActiveMask 3006 movq mm2, [esi + ebx] // load b=Prior(x) 3007 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x) 3008 punpckhbw mm3, mm0 // Unpack High bytes of c 3009 movq [edi + ebx], mm7 // write back updated value 3010 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes 3011 3012 // Do second set of 4 bytes 3013 punpckhbw mm2, mm0 // Unpack High bytes of b 3014 punpckhbw mm1, mm0 // Unpack High bytes of a 3015 // pav = p - a = (a + b - c) - a = b - c 3016 movq mm4, mm2 3017 // pbv = p - b = (a + b - c) - b = a - c 3018 movq mm5, mm1 3019 psubw mm4, mm3 3020 pxor mm7, mm7 3021 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 3022 movq mm6, mm4 3023 psubw mm5, mm3 3024 // pa = abs(p-a) = abs(pav) 3025 // pb = abs(p-b) = abs(pbv) 3026 // pc = abs(p-c) = abs(pcv) 3027 pcmpgtw mm0, mm4 // Create mask 
pav bytes < 0 3028 paddw mm6, mm5 3029 pand mm0, mm4 // Only pav bytes < 0 in mm7 3030 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0 3031 psubw mm4, mm0 3032 pand mm7, mm5 // Only pbv bytes < 0 in mm0 3033 psubw mm4, mm0 3034 psubw mm5, mm7 3035 pxor mm0, mm0 3036 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0 3037 pand mm0, mm6 // Only pav bytes < 0 in mm7 3038 psubw mm5, mm7 3039 psubw mm6, mm0 3040 // test pa <= pb 3041 movq mm7, mm4 3042 psubw mm6, mm0 3043 pcmpgtw mm7, mm5 // pa > pb? 3044 movq mm0, mm7 3045 // use mm7 mask to merge pa & pb 3046 pand mm5, mm7 3047 // use mm0 mask copy to merge a & b 3048 pand mm2, mm0 3049 pandn mm7, mm4 3050 pandn mm0, mm1 3051 paddw mm7, mm5 3052 paddw mm0, mm2 3053 // test ((pa <= pb)? pa:pb) <= pc 3054 pcmpgtw mm7, mm6 // pab > pc? 3055 pxor mm1, mm1 3056 pand mm3, mm7 3057 pandn mm7, mm0 3058 pxor mm1, mm1 3059 paddw mm7, mm3 3060 pxor mm0, mm0 3061 // Step ex to next set of 8 bytes and repeat loop til done 3062 add ebx, 8 3063 packuswb mm1, mm7 3064 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x) 3065 cmp ebx, MMXLength 3066 movq [edi + ebx - 8], mm1 // write back updated value 3067 // mm1 will be used as Raw(x-bpp) next loop 3068 jb dpth8lp 3069 } // end _asm block 3070 } 3071 break; 3072 3073 case 1: // bpp = 1 3074 case 2: // bpp = 2 3075 default: // bpp > 8 3076 { 3077 _asm { 3078 mov ebx, diff 3079 cmp ebx, FullLength 3080 jnb dpthdend 3081 mov edi, row 3082 mov esi, prev_row 3083 // Do Paeth decode for remaining bytes 3084 mov edx, ebx 3085 xor ecx, ecx // zero ecx before using cl & cx in loop below 3086 sub edx, bpp // Set edx = ebx - bpp 3087dpthdlp: 3088 xor eax, eax 3089 // pav = p - a = (a + b - c) - a = b - c 3090 mov al, [esi + ebx] // load Prior(x) into al 3091 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3092 sub eax, ecx // subtract Prior(x-bpp) 3093 mov patemp, eax // Save pav for later use 3094 xor eax, eax 3095 // pbv = p - b = (a + b - c) - b = a - c 3096 mov al, [edi + edx] // load 
Raw(x-bpp) into al 3097 sub eax, ecx // subtract Prior(x-bpp) 3098 mov ecx, eax 3099 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 3100 add eax, patemp // pcv = pav + pbv 3101 // pc = abs(pcv) 3102 test eax, 0x80000000 3103 jz dpthdpca 3104 neg eax // reverse sign of neg values 3105dpthdpca: 3106 mov pctemp, eax // save pc for later use 3107 // pb = abs(pbv) 3108 test ecx, 0x80000000 3109 jz dpthdpba 3110 neg ecx // reverse sign of neg values 3111dpthdpba: 3112 mov pbtemp, ecx // save pb for later use 3113 // pa = abs(pav) 3114 mov eax, patemp 3115 test eax, 0x80000000 3116 jz dpthdpaa 3117 neg eax // reverse sign of neg values 3118dpthdpaa: 3119 mov patemp, eax // save pa for later use 3120 // test if pa <= pb 3121 cmp eax, ecx 3122 jna dpthdabb 3123 // pa > pb; now test if pb <= pc 3124 cmp ecx, pctemp 3125 jna dpthdbbc 3126 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 3127 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3128 jmp dpthdpaeth 3129dpthdbbc: 3130 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 3131 mov cl, [esi + ebx] // load Prior(x) into cl 3132 jmp dpthdpaeth 3133dpthdabb: 3134 // pa <= pb; now test if pa <= pc 3135 cmp eax, pctemp 3136 jna dpthdabc 3137 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 3138 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3139 jmp dpthdpaeth 3140dpthdabc: 3141 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 3142 mov cl, [edi + edx] // load Raw(x-bpp) into cl 3143dpthdpaeth: 3144 inc ebx 3145 inc edx 3146 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 3147 add [edi + ebx - 1], cl 3148 cmp ebx, FullLength 3149 jb dpthdlp 3150dpthdend: 3151 } // end _asm block 3152 } 3153 return; // No need to go further with this one 3154 } // end switch ( bpp ) 3155 _asm 3156 { 3157 // MMX acceleration complete now do clean-up 3158 // Check if any remaining bytes left to decode 3159 mov ebx, MMXLength 3160 cmp ebx, FullLength 3161 jnb dpthend 3162 mov edi, row 3163 mov esi, prev_row 3164 // Do Paeth decode for 
remaining bytes 3165 mov edx, ebx 3166 xor ecx, ecx // zero ecx before using cl & cx in loop below 3167 sub edx, bpp // Set edx = ebx - bpp 3168dpthlp2: 3169 xor eax, eax 3170 // pav = p - a = (a + b - c) - a = b - c 3171 mov al, [esi + ebx] // load Prior(x) into al 3172 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3173 sub eax, ecx // subtract Prior(x-bpp) 3174 mov patemp, eax // Save pav for later use 3175 xor eax, eax 3176 // pbv = p - b = (a + b - c) - b = a - c 3177 mov al, [edi + edx] // load Raw(x-bpp) into al 3178 sub eax, ecx // subtract Prior(x-bpp) 3179 mov ecx, eax 3180 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv 3181 add eax, patemp // pcv = pav + pbv 3182 // pc = abs(pcv) 3183 test eax, 0x80000000 3184 jz dpthpca2 3185 neg eax // reverse sign of neg values 3186dpthpca2: 3187 mov pctemp, eax // save pc for later use 3188 // pb = abs(pbv) 3189 test ecx, 0x80000000 3190 jz dpthpba2 3191 neg ecx // reverse sign of neg values 3192dpthpba2: 3193 mov pbtemp, ecx // save pb for later use 3194 // pa = abs(pav) 3195 mov eax, patemp 3196 test eax, 0x80000000 3197 jz dpthpaa2 3198 neg eax // reverse sign of neg values 3199dpthpaa2: 3200 mov patemp, eax // save pa for later use 3201 // test if pa <= pb 3202 cmp eax, ecx 3203 jna dpthabb2 3204 // pa > pb; now test if pb <= pc 3205 cmp ecx, pctemp 3206 jna dpthbbc2 3207 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 3208 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3209 jmp dpthpaeth2 3210dpthbbc2: 3211 // pb <= pc; Raw(x) = Paeth(x) + Prior(x) 3212 mov cl, [esi + ebx] // load Prior(x) into cl 3213 jmp dpthpaeth2 3214dpthabb2: 3215 // pa <= pb; now test if pa <= pc 3216 cmp eax, pctemp 3217 jna dpthabc2 3218 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) 3219 mov cl, [esi + edx] // load Prior(x-bpp) into cl 3220 jmp dpthpaeth2 3221dpthabc2: 3222 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) 3223 mov cl, [edi + edx] // load Raw(x-bpp) into cl 3224dpthpaeth2: 3225 inc ebx 3226 inc edx 3227 // 
/* Optimized code for PNG Sub filter decoder.
 *
 * Undoes the PNG "Sub" filter in place: for every byte beyond the first
 * pixel, Raw(x) = Sub(x) + Raw(x-bpp) (mod 256).  The routine uses plain
 * x86 code to advance to an 8-byte alignment boundary, MMX code for the
 * aligned middle of the row (specialized per bytes-per-pixel), and plain
 * x86 code again for the trailing remainder.
 *
 * row_info - row description; pixel_depth and rowbytes are read.
 * row      - the filtered row, updated in place to raw (defiltered) bytes.
 *
 * Uses the file-scope temporaries ActiveMask, ShiftBpp and ShiftRem
 * (declared elsewhere in this file; their .use member feeds 64-bit
 * constants to the MMX instructions).
 */
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   //int test;
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
   FullLength = row_info->rowbytes - bpp;  // # of bytes to filter
   // Set up: defilter bytes one at a time until edi+ebx reaches an 8-byte
   // boundary, then compute MMXLength = largest multiple-of-8 prefix.
   _asm {
      mov edi, row
      mov esi, edi                // lp = row
      add edi, bpp                // rp = row + bpp
      xor eax, eax
      // get # of bytes to alignment
      mov diff, edi               // take start of row
      add diff, 0xf               // add 7 + 8 to incr past
                                  // alignment boundary
      xor ebx, ebx
      and diff, 0xfffffff8        // mask to alignment boundary
      sub diff, edi               // subtract from start ==> value
                                  //  ebx at alignment
      jz dsubgo
      // fix alignment
dsublp1:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, diff
      jb dsublp1
dsubgo:
      mov ecx, FullLength
      mov edx, ecx
      sub edx, ebx                // subtract alignment fix
      and edx, 0x00000007         // calc bytes over mult of 8
      sub ecx, edx                // drop over bytes from length
      mov MMXLength, ecx
   } // end _asm block

   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000ffffff000000;
         ShiftBpp.use = 24;       // == 3 * 8
         ShiftRem.use = 40;       // == 64 - 24
         // Three pixels (9 bytes) span each 8-byte quadword unevenly, so
         // each loop iteration does three shift/mask/add passes, one per
         // 3-byte group inside the quadword.
         _asm {
            mov edi, row
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub3lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive bytes
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
      }
      break;

      case 1:
      {
         // Placed here just in case this is a duplicate of the
         // non-MMX code for the SUB filter in png_read_filter_row above
         //
         //         png_bytep rp;
         //         png_bytep lp;
         //         png_uint_32 i;
         //         bpp = (row_info->pixel_depth + 7) >> 3;
         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //            i < row_info->rowbytes; i++, rp++, lp++)
         //      {
         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         //      }
         // Pure byte-at-a-time x86 loop; no MMX for 1 byte per pixel.
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi          // lp = row
            xor eax, eax
            add edi, bpp          // rp = row + bpp
dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
dsub1end:
         } // end _asm block
      }
      return;

      case 6:
      case 7:
      case 4:
      case 5:
      {
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         // bpp >= 4: at most two pixel groups per quadword, so only two
         // shift/add passes are needed and no masks at all (the shifts
         // clear the inactive bytes).
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub4lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
                                  // there is no need for any mask
                                  // since shift clears inactive bits/bytes
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;

      case 2:
      {
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;       // == 2 * 8
         ShiftRem.use = 48;       // == 64 - 16
         // Four 2-byte pixels per quadword => four shift/mask/add passes.
         _asm {
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  // byte group
            mov esi, edi          // lp = row
            movq mm5, mm6
            add edi, bpp          // rp = row + bpp
            psllq mm5, ShiftBpp   // Move mask in mm5 to cover 4th active
                                  // byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
                                  // no need for mask; shift clears inactive
                                  // bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm5         // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;
      case 8:
      {
         // bpp == 8: each quadword's predecessor is exactly one quadword
         // back, so the filter is a running quadword-wise byte add.  The
         // main loop is unrolled 8x across all MMX registers.
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8] // PRIME the pump (load the first
                                  // Raw(x-bpp) data set
            and ecx, 0x0000003f   // calc bytes over mult of 64
            // NOTE(review): this masks ecx down to MMXLength % 64, so the
            // unrolled loop's "cmp ebx, ecx" bound is the small residue and
            // the 64-byte loop will rarely iterate more than once; masking
            // with 0xffffffc0 (a 64-byte multiple) looks like the intent.
            // Behavior is still correct because dsub8lpA finishes the row
            // 8 bytes at a time, and the caller (png_read_filter_row) only
            // uses this path when rowbytes >= 128 -- verify before changing.
dsub8lp:
            movq mm0, [edi+ebx]   // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0   // Write Raw(x) for 1st 8 bytes
            // Now mm0 will be used as Raw(x-bpp) for
            // the 2nd group of 8 bytes.  This will be
            // repeated for each group of 8 bytes with
            // the 8th group being used as the Raw(x-bpp)
            // for the 1st group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
            cmp ebx, MMXLength
            jnb dsub8lt8
dsub8lpA:
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
            movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
                                    // be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
dsub8lt8:
         } // end _asm block
      }
      break;

      default:                      // bpp greater than 8 bytes
      {
         // Predecessor pixel is more than a quadword away, so plain
         // quadword loads from lp/rp suffice; no shifting needed.
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
                                  // add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;

   } // end switch ( bpp )

   // Clean-up: defilter any bytes past MMXLength one at a time with
   // x86 code, then leave MMX state with emms.
   _asm {
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi                // lp = row
      xor eax, eax
      add edi, bpp                // rp = row + bpp
dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
dsubend:
      emms                        // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
// Optimized code for PNG Up filter decoder.
//
// Undoes the PNG "Up" filter in place: Raw(x) = Up(x) + Prior(x) (mod 256)
// for every byte of the row.  Plain x86 code brings the row pointer to an
// 8-byte boundary, an 8x-unrolled MMX loop (all eight mm registers,
// interleaved to hide latency) handles full 64-byte chunks, a simple MMX
// loop handles remaining 8-byte chunks, and x86 code finishes the last
// few bytes before emms.
//
// row_info - row description; only rowbytes is read.
// row      - the filtered row, updated in place.
// prev_row - the already-defiltered prior row (read only).
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;
   len = row_info->rowbytes;        // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment
duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx                  // subtract alignment fix
      and edx, 0x0000003f           // calc bytes over mult of 64
      sub ecx, edx                  // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
                            // -8 to offset add ebx
      jb duploop

      cmp edx, 0 // Test for bytes over mult of 64
      jz dupend


      // 2 lines added by lcreeve@netins.net
      // (mail 11 Jul 98 in png-implement list)
      cmp edx, 8 //test for less than 8 bytes
      jb duplt8


      add ecx, edx
      and edx, 0x00000007           // calc bytes over mult of 8
      sub ecx, edx                  // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0 // Test for bytes over mult of 8
      jz dupend
duplt8:
      xor eax, eax
      add ecx, edx                  // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
dupend:
      // Conversion of filtered row completed
      emms // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
3651 } // end _asm block 3652} 3653 3654 3655// Optimized png_read_filter_row routines 3656void /* PRIVATE */ 3657png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep 3658 row, png_bytep prev_row, int filter) 3659{ 3660#ifdef PNG_DEBUG 3661 char filnm[6]; 3662#endif 3663#define UseMMX 1 3664 3665 if (mmx_supported == 2) 3666 mmx_supported = mmxsupport(); 3667 3668 if (!mmx_supported) 3669 { 3670 png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter); 3671 return ; 3672 } 3673 3674#ifdef PNG_DEBUG 3675 png_debug(1, "in png_read_filter_row\n"); 3676# if (UseMMX == 1) 3677 png_debug1(0,"%s, ", "MMX"); 3678# else 3679 png_debug1(0,"%s, ", "x86"); 3680# endif 3681 switch (filter) 3682 { 3683 case 0: sprintf(filnm, "None "); 3684 break; 3685 case 1: sprintf(filnm, "Sub "); 3686 break; 3687 case 2: sprintf(filnm, "Up "); 3688 break; 3689 case 3: sprintf(filnm, "Avg "); 3690 break; 3691 case 4: sprintf(filnm, "Paeth"); 3692 break; 3693 default: sprintf(filnm, "Unknw"); 3694 break; 3695 } 3696 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm); 3697 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth, 3698 (int)((row_info->pixel_depth + 7) >> 3)); 3699 png_debug1(0,"len=%8d, ", row_info->rowbytes); 3700#endif 3701 3702 switch (filter) 3703 { 3704 case PNG_FILTER_VALUE_NONE: 3705 break; 3706 case PNG_FILTER_VALUE_SUB: 3707 { 3708#if (UseMMX == 1) 3709 if ((row_info->pixel_depth > 8) && 3710 (row_info->rowbytes >= 128) ) 3711 { 3712 png_read_filter_row_mmx_sub(row_info, row); 3713 } 3714 else 3715#endif 3716 { 3717 png_uint_32 i; 3718 png_uint_32 istop = row_info->rowbytes; 3719 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3720 png_bytep rp = row + bpp; 3721 png_bytep lp = row; 3722 3723 for (i = bpp; i < istop; i++) 3724 { 3725 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); 3726 rp++; 3727 } 3728 } //end !UseMMX 3729 break; 3730 } 3731 case PNG_FILTER_VALUE_UP: 3732 { 3733#if (UseMMX == 1) 3734 if 
((row_info->pixel_depth > 8) && 3735 (row_info->rowbytes >= 128) ) 3736 { 3737 png_read_filter_row_mmx_up(row_info, row, prev_row); 3738 } //end if UseMMX 3739 else 3740#endif 3741 { 3742 png_bytep rp; 3743 png_bytep pp; 3744 png_uint_32 i; 3745 for (i = 0, rp = row, pp = prev_row; 3746 i < row_info->rowbytes; i++, rp++, pp++) 3747 { 3748 *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff); 3749 } 3750 } //end !UseMMX 3751 break; 3752 } 3753 case PNG_FILTER_VALUE_AVG: 3754 { 3755#if (UseMMX == 1) 3756 if ((row_info->pixel_depth > 8) && 3757 (row_info->rowbytes >= 128) ) 3758 { 3759 png_read_filter_row_mmx_avg(row_info, row, prev_row); 3760 } //end if UseMMX 3761 else 3762#endif 3763 { 3764 png_uint_32 i; 3765 png_bytep rp = row; 3766 png_bytep pp = prev_row; 3767 png_bytep lp = row; 3768 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3769 png_uint_32 istop = row_info->rowbytes - bpp; 3770 3771 for (i = 0; i < bpp; i++) 3772 { 3773 *rp = (png_byte)(((int)(*rp) + 3774 ((int)(*pp++) >> 1)) & 0xff); 3775 rp++; 3776 } 3777 3778 for (i = 0; i < istop; i++) 3779 { 3780 *rp = (png_byte)(((int)(*rp) + 3781 ((int)(*pp++ + *lp++) >> 1)) & 0xff); 3782 rp++; 3783 } 3784 } //end !UseMMX 3785 break; 3786 } 3787 case PNG_FILTER_VALUE_PAETH: 3788 { 3789#if (UseMMX == 1) 3790 if ((row_info->pixel_depth > 8) && 3791 (row_info->rowbytes >= 128) ) 3792 { 3793 png_read_filter_row_mmx_paeth(row_info, row, prev_row); 3794 } //end if UseMMX 3795 else 3796#endif 3797 { 3798 png_uint_32 i; 3799 png_bytep rp = row; 3800 png_bytep pp = prev_row; 3801 png_bytep lp = row; 3802 png_bytep cp = prev_row; 3803 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3804 png_uint_32 istop=row_info->rowbytes - bpp; 3805 3806 for (i = 0; i < bpp; i++) 3807 { 3808 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 3809 rp++; 3810 } 3811 3812 for (i = 0; i < istop; i++) // use leftover rp,pp 3813 { 3814 int a, b, c, pa, pb, pc, p; 3815 3816 a = *lp++; 3817 b = *pp++; 3818 c = *cp++; 3819 3820 p = b - 
c; 3821 pc = a - c; 3822 3823#ifdef PNG_USE_ABS 3824 pa = abs(p); 3825 pb = abs(pc); 3826 pc = abs(p + pc); 3827#else 3828 pa = p < 0 ? -p : p; 3829 pb = pc < 0 ? -pc : pc; 3830 pc = (p + pc) < 0 ? -(p + pc) : p + pc; 3831#endif 3832 3833 /* 3834 if (pa <= pb && pa <= pc) 3835 p = a; 3836 else if (pb <= pc) 3837 p = b; 3838 else 3839 p = c; 3840 */ 3841 3842 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 3843 3844 *rp = (png_byte)(((int)(*rp) + p) & 0xff); 3845 rp++; 3846 } 3847 } //end !UseMMX 3848 break; 3849 } 3850 default: 3851 png_warning(png_ptr, "Ignoring bad adaptive filter type"); 3852 *row=0; 3853 break; 3854 } 3855} 3856#endif 3857