/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng version 1.2.7 - September 12, 2004
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 *
 */

#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)

static int mmx_supported=2;


int PNGAPI
png_mmx_support(void)
{
   int mmx_supported_local = 0;
   _asm {
      push ebx            /* CPUID will trash these */
      push ecx
      push edx

      pushfd              /* Save Eflag to stack */
      pop eax             /* Get Eflag from stack into eax */
      mov ecx, eax        /* Make another copy of Eflag in ecx */
      xor eax, 0x200000   /* Toggle ID bit in Eflag [i.e. bit(21)] */
      push eax            /* Save modified Eflag back to stack */

      popfd               /* Restore modified value back to Eflag reg */
      pushfd              /* Save Eflag to stack */
      pop eax             /* Get Eflag from stack */
      push ecx            /* Save original Eflag to stack */
      popfd               /* Restore original Eflag */
      xor eax, ecx        /* Compare the new Eflag with the original Eflag */
      jz NOT_SUPPORTED    /* If the same, the CPUID instruction is not */
                          /* supported; skip the following instructions */
                          /* and jump to the NOT_SUPPORTED label */

      xor eax, eax        /* Set eax to zero */

      _asm _emit 0x0f     /* CPUID instruction (two-byte opcode) */
      _asm _emit 0xa2

      cmp eax, 1          /* Make sure eax returned a non-zero value */
      jl NOT_SUPPORTED    /* If eax is zero, CPUID function 1 is not */
                          /* available, so MMX cannot be detected */

      xor eax, eax        /* Set eax to zero */
      inc eax             /* Now increment eax to 1.  This instruction is */
                          /* faster than the instruction "mov eax, 1" */

      _asm _emit 0x0f     /* CPUID instruction */
      _asm _emit 0xa2

      and edx, 0x00800000 /* Mask out all bits but the MMX bit (23) */
      cmp edx, 0          /* Zero means MMX is not supported */
      jz NOT_SUPPORTED    /* Jump if MMX is not supported */

      mov mmx_supported_local, 1  /* set return value to 1 */

NOT_SUPPORTED:
      mov eax, mmx_supported_local  /* move return value to eax */
      pop edx             /* CPUID trashed these */
      pop ecx
      pop ebx
   }

   /* mmx_supported_local=0; // test code to force "no MMX support" */
   /* printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); */

   mmx_supported = mmx_supported_local;
   return mmx_supported_local;
}

/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */
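/* For example (illustrative only): with mask == 0x55 (binary 01010101),
 * pixels 1, 3, 5, 7, ... of each 8-pixel group are combined and pixels
 * 0, 2, 4, 6, ... are skipped, since bit 7 of the mask corresponds to
 * the first pixel and the 8-bit pattern repeats across the row. */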
/* Use this routine for x86 platform - uses faster MMX routine if machine
   supports MMX */

void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
       png_ptr->width));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
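         /* A worked example of the read-modify-write above (illustrative
          * only): for pixel_depth 2 with shift == 4,
          * (0x3f3f >> (6 - 4)) & 0xff == 0xcf (binary 11001111), so the
          * AND clears exactly the two destination bits at positions 5-4
          * and the OR deposits the new 2-bit value there.  The doubled
          * constant (0x3f3f rather than 0x3f) keeps ones shifting in from
          * the high byte, so the mask is correct at every shift position. */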
         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
               diff = png_ptr->width & 7;  /* amount lost */

               _asm
               {
                  movd mm7, unmask   /* load bit pattern */
                  psubb mm6,mm6      /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7  /* fill register with 8 masks */

                  movq mm0,mask0

                  pand mm0,mm7       /* nonzero if keep byte */
                  pcmpeqb mm0,mm6    /* zeros->1s, v versa */

                  mov ecx,len        /* load length of line (pixels) */
                  mov esi,srcptr     /* load source */
                  mov ebx,dstptr     /* load dest */
                  cmp ecx,0          /* any pixels to process? */
                  je mainloop8end

mainloop8:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm4,mm6
                  movq [ebx],mm4

                  add esi,8          /* inc by 8 bytes processed */
                  add ebx,8
                  sub ecx,8          /* dec by 8 pixels processed */

                  ja mainloop8
mainloop8end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end8

                  mov edx,mask
                  sal edx,24         /* make low byte the high byte */

secondloop8:
                  sal edx,1          /* move high bit to CF */
                  jnc skip8          /* if CF = 0 */
                  mov al,[esi]
                  mov [ebx],al
skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8
end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  /* end 8 bpp */
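         /* The movd/punpck/pcmpeqb sequence above builds a per-byte
          * select mask from the 8-bit pixel mask.  A C sketch of the
          * equivalent computation (illustrative only, not part of the
          * build):
          *
          *    png_byte select[8];
          *    int b;
          *    for (b = 0; b < 8; b++)
          *       select[b] = (png_byte)((unmask & (0x80 >> b)) ? 0x00 : 0xff);
          *
          * i.e. a byte whose mask bit is set selects 0xff (keep the
          * source byte) after pcmpeqb against zero; the pand/pandn/por
          * triple then merges source and destination bytes accordingly. */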
         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask   /* load bit pattern */
                  psubb mm6,mm6      /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7  /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1

                  pand mm0,mm7
                  pand mm1,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6

                  mov ecx,len        /* load length of line */
                  mov esi,srcptr     /* load source */
                  mov ebx,dstptr     /* load dest */
                  cmp ecx,0          /* any pixels to process? */
                  jz mainloop16end

mainloop16:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  add esi,16         /* inc by 16 bytes processed */
                  add ebx,16
                  sub ecx,8          /* dec by 8 pixels processed */

                  ja mainloop16

mainloop16end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end16

                  mov edx,mask
                  sal edx,24         /* make low byte the high byte */
secondloop16:
                  sal edx,1          /* move high bit to CF */
                  jnc skip16         /* if CF = 0 */
                  mov ax,[esi]
                  mov [ebx],ax
skip16:
                  add esi,2
                  add ebx,2

                  dec ecx
                  jnz secondloop16
end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  /* end 16 bpp */
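         /* Worked example of the C fallback above (illustrative only):
          * for interlace pass 1, offset_table[1] == 4 and
          * png_pass_inc[1] == 8, so with 2-byte pixels the loop starts at
          * byte offset 4*2 == 8 and copies one 2-byte pixel every
          * 8*2 == 16 bytes, i.e. pixels 4, 12, 20, ... of the output row,
          * the same pixels selected by mask 0x08 repeated across the row. */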
         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  /* 24bpp */
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask   /* load bit pattern */
                  psubb mm6,mm6      /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7  /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6

                  mov ecx,len        /* load length of line */
                  mov esi,srcptr     /* load source */
                  mov ebx,dstptr     /* load dest */
                  cmp ecx,0
                  jz mainloop24end

mainloop24:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  add esi,24         /* inc by 24 bytes processed */
                  add ebx,24
                  sub ecx,8          /* dec by 8 pixels processed */

                  ja mainloop24

mainloop24end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end24

                  mov edx,mask
                  sal edx,24         /* make low byte the high byte */
secondloop24:
                  sal edx,1          /* move high bit to CF */
                  jnc skip24         /* if CF = 0 */
                  mov ax,[esi]
                  mov [ebx],ax
                  xor eax,eax
                  mov al,[esi+2]
                  mov [ebx+2],al
skip24:
                  add esi,3
                  add ebx,3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  /* end 24 bpp */

         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  /* 32bpp */
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;

            srcptr = png_ptr->row_buf + 1;
            dstptr = row;

            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask   /* load bit pattern */
                  psubb mm6,mm6      /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7  /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6

                  mov ecx,len        /* load length of line */
                  mov esi,srcptr     /* load source */
                  mov ebx,dstptr     /* load dest */

                  cmp ecx,0          /* any pixels to process? */
                  jz mainloop32end

mainloop32:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm5,mm3
                  movq mm4,[ebx+24]
                  pandn mm5,mm4
                  por mm7,mm5
                  movq [ebx+24],mm7

                  add esi,32         /* inc by 32 bytes processed */
                  add ebx,32
                  sub ecx,8          /* dec by 8 pixels processed */

                  ja mainloop32

mainloop32end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end32

                  mov edx,mask
                  sal edx,24         /* make low byte the high byte */
secondloop32:
                  sal edx,1          /* move high bit to CF */
                  jnc skip32         /* if CF = 0 */
                  mov eax,[esi]
                  mov [ebx],eax
skip32:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  /* end 32 bpp */
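         /* Note: each pand/pandn/por triple above is a branchless byte
          * select, equivalent to the C expression (illustrative only):
          *
          *    result = (src & select) | (dst & ~select);
          *
          * where "select" is the 0xff/0x00 per-byte mask built by
          * pcmpeqb from the pixel mask. */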
         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;

               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;
               _asm
               {
                  movd mm7, unmask   /* load bit pattern */
                  psubb mm6,mm6      /* zero mm6 */
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7  /* fill register with 8 masks */

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3
                  movq mm4,mask4
                  movq mm5,mask5

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7
                  pand mm4,mm7
                  pand mm5,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6
                  pcmpeqb mm4,mm6
                  pcmpeqb mm5,mm6

                  mov ecx,len        /* load length of line */
                  mov esi,srcptr     /* load source */
                  mov ebx,dstptr     /* load dest */

                  cmp ecx,0
                  jz mainloop48end

mainloop48:
                  movq mm7,[esi]
                  pand mm7,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm7,mm6
                  movq [ebx],mm7

                  movq mm6,[esi+8]
                  pand mm6,mm1
                  movq mm7,mm1
                  pandn mm7,[ebx+8]
                  por mm6,mm7
                  movq [ebx+8],mm6

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm7,mm2
                  pandn mm7,[ebx+16]
                  por mm6,mm7
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm6,mm3
                  pandn mm6,[ebx+24]
                  por mm7,mm6
                  movq [ebx+24],mm7

                  movq mm6,[esi+32]
                  pand mm6,mm4
                  movq mm7,mm4
                  pandn mm7,[ebx+32]
                  por mm6,mm7
                  movq [ebx+32],mm6

                  movq mm7,[esi+40]
                  pand mm7,mm5
                  movq mm6,mm5
                  pandn mm6,[ebx+40]
                  por mm7,mm6
                  movq [ebx+40],mm7

                  add esi,48         /* inc by 48 bytes processed */
                  add ebx,48
                  sub ecx,8          /* dec by 8 pixels processed */

                  ja mainloop48
mainloop48end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end48

                  mov edx,mask
                  sal edx,24         /* make low byte the high byte */

secondloop48:
                  sal edx,1          /* move high bit to CF */
                  jnc skip48         /* if CF = 0 */
                  mov eax,[esi]      /* copy all 6 bytes of the pixel */
                  mov [ebx],eax      /* (the original code copied and */
                  mov ax,[esi+4]     /* advanced only 4 bytes, which is */
                  mov [ebx+4],ax     /* incorrect for 48-bit pixels) */
skip48:
                  add esi,6
                  add ebx,6

                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  /* end 48 bpp */
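         /* Worked example of the len/diff split used by the MMX cases
          * above (illustrative only): for width == 21,
          * len = 21 & ~7 == 16 pixels are handled 8 at a time by the
          * main loop, and diff = 21 & 7 == 5 trailing pixels are
          * finished one at a time by the secondloop cleanup. */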
         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  /* get the offset */
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */


#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_do_read_interlace\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];

      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
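         /* Worked example of the expansion above (illustrative only):
          * in pass 0, png_pass_inc[0] == 8, so each source pixel is
          * replicated into 8 consecutive destination pixels; the row is
          * processed from right to left so the expansion can be done in
          * place in row_buf without overwriting unread source pixels. */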
         default: /* This is the place where the routine is modified */
         {
            __int64 const4 = 0x0000000000FFFFFF;
            /* __int64 const5 = 0x000000FFFFFF0000;  // unused... */
            __int64 const6 = 0x00000000000000FF;
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;
            /* New code by Nirav Chhatrapati - Intel Corporation */
            /* sign fix by GRR */
            /* NOTE:  there is NO MMX code for 48-bit and 64-bit images */

            /* use MMX routine if machine supports it */
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21        /* (png_pass_inc[pass] - 1)*pixel_bytes */
loop_pass0:
                        movd mm0, [esi]    ; X X X X X v2 v1 v0
                        pand mm0, const4   ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0      ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16      ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0      ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24      ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8       ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2       ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1       ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0      ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16      ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3      ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32      ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        /*sub esi, 3 */
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9         /* (png_pass_inc[pass] - 1)*pixel_bytes */
loop_pass2:
                        movd mm0, [esi]    ; X X X X X v2 v1 v0
                        pand mm0, const4   ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0      ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16      ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0      ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24      ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8       ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2       ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1       ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0  ; move to memory
                        psrlq mm0, 16      ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0    ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     if (width_mmx < 0)
                         width_mmx = 0;
                     width -= width_mmx;  /* 8 or 9 pix, 24 or 27 bytes */
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi]    ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0      ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0      ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24      ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4   ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24      ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7       ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6      ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8       ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0    ; move quad to memory
                           psrlq mm5, 16      ; 0 0 0 0 0 X X v2
                           pand mm5, const6   ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5       ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6  ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi]    ; X X X X v0 v1 v2 v3
                           movq mm1, mm0      ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0      ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0      ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0    ; move to memory v3
                           punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3  ; move to memory v2
                           movq mm4, mm2      ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2 ; move to memory v1
                           movq [edi+24], mm4 ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                       /* I simplified this part in version 1.0.4e
                        * here and in several other instances where
                        * pixel_bytes == 1 -- GR-P
                        *
                        * Original code:
                        *
                        * png_byte v[8];
                        * png_memcpy(v, sptr, pixel_bytes);
                        * for (j = 0; j < png_pass_inc[pass]; j++)
                        * {
                        *    png_memcpy(dp, v, pixel_bytes);
                        *    dp -= pixel_bytes;
                        * }
                        * sptr -= pixel_bytes;
                        *
                        * Replacement code is in the next three lines:
                        */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi]    ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0      ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0    ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1  ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi]    ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0      ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
                           /*movq mm1, mm0    ; v0 v0 v1 v1 v2 v2 v3 v3 */
                           punpckhbw mm1, mm1 ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1  ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0    ; move to memory v4 v5 v6 and v7
                           /*sub esi, 4 */
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */

               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi]    ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0      ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); /* sign fixed */
                     dp -= (width_mmx*16 - 2);  /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi]    ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0      ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi + 8], mm1
                           /*sub esi, 4 */
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); /* sign fixed */
                     dp -= (width_mmx*8 - 2);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width)  /* pass == 4 or 5 */
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi]    ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2); /* sign fixed */
                     dp -= (width_mmx*4 - 2);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */
               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi]    ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0      ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm0
                           movq [edi + 24], mm0
                           movq [edi + 32], mm1
                           movq [edi + 40], mm1
                           movq [edi + 48], mm1
                           sub esi, 8
                           movq [edi + 56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); /* sign fixed */
                     dp -= (width_mmx*32 - 4);  /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi]    ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0      ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); /* sign fixed */
                     dp -= (width_mmx*16 - 4);  /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width)  /* pass == 4 or 5 */
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi]    ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0      ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi + 8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4); /* sign fixed */
                     dp -= (width_mmx*8 - 4);   /* sign fixed */
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }

               } /* end of pixel_bytes == 4 */

               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } /* end of pixel_bytes == 6 */
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } /* end of mmx_supported */

            else /* MMX not supported:  use modified C code - takes advantage
                  * of inlining of memcpy for a constant */
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                        *dp-- = *sptr;
                     sptr--;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end of MMX not supported */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;

      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   }

}

#endif /* PNG_READ_INTERLACING_SUPPORTED */
/* These variables are utilized in the functions below.  They are declared */
/* globally here to ensure alignment on 8-byte boundaries. */

union uAll {
   __int64 use;
   double  align;
} LBCarryMask = {0x0101010101010101},
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;


/* Optimized code for PNG Average filter decoder */
void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
                            , png_bytep prev_row)
{
   int bpp;
   png_uint_32 FullLength;
   png_uint_32 MMXLength;
   /*png_uint_32 len; */
   int diff;

   bpp = (row_info->pixel_depth + 7) >> 3;  /* Get # bytes per pixel */
   FullLength = row_info->rowbytes;         /* # of bytes to filter */
   _asm {
      /* Init address pointers and offset */
      mov edi, row          /* edi ==> Avg(x) */
      xor ebx, ebx          /* ebx ==> x */
      mov edx, edi
      mov esi, prev_row     /* esi ==> Prior(x) */
      sub edx, bpp          /* edx ==> Raw(x-bpp) */

      xor eax, eax
      /* Compute the Raw value for the first bpp bytes */
      /* Raw(x) = Avg(x) + (Prior(x)/2) */
davgrlp:
      mov al, [esi + ebx]   /* Load al with Prior(x) */
      inc ebx
      shr al, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, bpp
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */
      jb davgrlp
      /* get # of bytes to alignment */
      mov diff, edi         /* take start of row */
      add diff, ebx         /* add bpp */
      add diff, 0xf         /* add 7 + 8 to incr past alignment boundary */
      and diff, 0xfffffff8  /* mask to alignment boundary */
      sub diff, edi         /* subtract from start ==> value ebx at alignment */
      jz davggo
      /* fix alignment */
      /* Compute the Raw value for the bytes up to the alignment boundary */
      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
      xor ecx, ecx
davglp1:
      xor eax, eax
      mov cl, [esi + ebx]   /* load cl with Prior(x) */
      mov al, [edx + ebx]   /* load al with Raw(x-bpp) */
      add ax, cx
      inc ebx
      shr ax, 1             /* divide by 2 */
      add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
      cmp ebx, diff         /* Check if at alignment boundary */
      mov [edi+ebx-1], al   /* Write back Raw(x); */
                            /* mov does not affect flags; -1 to offset inc ebx */
      jb davglp1            /* Repeat until at alignment boundary */
davggo:
      mov eax, FullLength
      mov ecx, eax
      sub eax, ebx          /* subtract alignment fix */
      and eax, 0x00000007   /* calc bytes over mult of 8 */
      sub ecx, eax          /* drop over bytes from original length */
      mov MMXLength, ecx
   } /* end _asm block */
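   /* For reference, the Avg defilter computed below is, per byte
    * (a C sketch, illustrative only):
    *
    *    Raw(x) = Avg(x) + (Raw(x-bpp) + Prior(x)) / 2      (mod 256)
    *
    * with Raw(x-bpp) taken as 0 for the first bpp bytes.  The MMX code
    * has no 9-bit intermediate, so it averages via the identity
    *
    *    (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1)
    *
    * computing the carry of the two low bits (the "LBCarry") separately
    * and clearing bit 7 after each shift (the "HBClear" mask). */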
   /* Now do the math for the rest of the row */
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000000000ffffff;
         ShiftBpp.use = 24;   /* == 3 * 8 */
         ShiftRem.use = 40;   /* == 64 - 24 */
         _asm {
            /* Re-init address pointers and offset */
            movq mm7, ActiveMask
            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row        /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row   /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
davg3lp:
            movq mm0, [edi + ebx]  /* Load mm0 with Avg(x) */
            /* Add (Prev_row/2) to Average */
            movq mm3, mm5
            psrlq mm2, ShiftRem    /* Correct position Raw(x-bpp) data */
            movq mm1, [esi + ebx]  /* Load mm1 with Prior(x) */
            movq mm6, mm7
            pand mm3, mm1       /* get lsb for each prev_row byte */
            psrlq mm1, 1        /* divide prev_row bytes by 2 */
            pand mm1, mm4       /* clear invalid bit 7 of each byte */
            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active */
                                /* byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 3-5 */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active */
                                /* byte */

            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp /* shift the mm6 mask to cover the last two */
                                /* bytes */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
                                /* Data only needs to be shifted once here to */
                                /* get the correct x-bpp offset. */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 3 bytes to add to Avg */
            add ebx, 8
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active */
                                /* byte */

            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
            cmp ebx, MMXLength
            movq mm2, mm0       /* mov updated Raw(x) to mm2 */
            jb davg3lp
         } /* end _asm block */
      }
      break;
      case 6:
      case 4:
      case 7:
      case 5:
      {
         ActiveMask.use = 0xffffffffffffffff;  /* use shift below to clear */
                                               /* appropriate inactive bytes */
         ShiftBpp.use = bpp << 3;
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            movq mm4, HBClearMask
            /* Re-init address pointers and offset */
            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
            /* Load ActiveMask and clear all bytes except for 1st active group */
            movq mm7, ActiveMask
            mov edi, row        /* edi ==> Avg(x) */
            psrlq mm7, ShiftRem
            mov esi, prev_row   /* esi ==> Prior(x) */
            movq mm6, mm7
            movq mm5, LBCarryMask
            psllq mm6, ShiftBpp /* Create mask for 2nd active group */
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
davg4lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem /* shift data to position correctly */
            movq mm1, [esi + ebx]
            /* Add (Prev_row/2) to Average */
            movq mm3, mm5
            pand mm3, mm1       /* get lsb for each prev_row byte */
            psrlq mm1, 1        /* divide prev_row bytes by 2 */
            pand mm1, mm4       /* clear invalid bit 7 of each byte */
            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm7       /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active */
                                /* byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
            add ebx, 8
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active */
                                /* byte */
            cmp ebx, MMXLength
            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Prep Raw(x-bpp) for next loop */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            jb davg4lp
         } /* end _asm block */
      }
      break;
      case 2:
      {
         ActiveMask.use = 0x000000000000ffff;
         ShiftBpp.use = 16;   /* == 2 * 8  [BUGFIX] */
         ShiftRem.use = 48;   /* == 64 - 16  [BUGFIX] */
         _asm {
            /* Load ActiveMask */
            movq mm7, ActiveMask
            /* Re-init address pointers and offset */
            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row        /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row   /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (we correct position in loop below) */
davg2lp:
            movq mm0, [edi + ebx]
            psrlq mm2, ShiftRem /* shift data to position correctly [BUGFIX] */
            movq mm1, [esi + ebx]
            /* Add (Prev_row/2) to Average */
            movq mm3, mm5
            pand mm3, mm1       /* get lsb for each prev_row byte */
            psrlq mm1, 1        /* divide prev_row bytes by 2 */
            pand mm1, mm4       /* clear invalid bit 7 of each byte */
            movq mm6, mm7
            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 1 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active byte */
            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 2 & 3 */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 2 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 4 & 5 */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
                                /* Data only needs to be shifted once here to */
                                /* get the correct x-bpp offset. */
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 3 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 6 & 7 */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            psllq mm2, ShiftBpp /* shift data to position correctly */
                                /* Data only needs to be shifted once here to */
                                /* get the correct x-bpp offset. */
            add ebx, 8
            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
            pand mm1, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 (Only valid for active group) */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
            pand mm2, mm6       /* Leave only Active Group 4 bytes to add to Avg */
            paddb mm0, mm2      /* add (Raw/2) + LBCarrys to Avg for each Active byte */

            cmp ebx, MMXLength
            /* Now ready to write back to memory */
            movq [edi + ebx - 8], mm0
            /* Prep Raw(x-bpp) for next loop */
            movq mm2, mm0       /* mov updated Raws to mm2 */
            jb davg2lp
         } /* end _asm block */
      }
      break;
      case 1:   /* bpp == 1 */
      {
         _asm {
            /* Re-init address pointers and offset */
            mov ebx, diff        /* ebx ==> x = offset to alignment boundary */
            mov edi, row         /* edi ==> Avg(x) */
            cmp ebx, FullLength  /* Test if offset at end of array */
            jnb davg1end
            /* Do average decode for remaining bytes */
            mov esi, prev_row    /* esi ==> Prior(x) */
            mov edx, edi
            xor ecx, ecx         /* zero ecx before using cl & cx in loop below */
            sub edx, bpp         /* edx ==> Raw(x-bpp) */
davg1lp:
            /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
            xor eax, eax
            mov cl, [esi + ebx]  /* load cl with Prior(x) */
            mov al, [edx + ebx]  /* load al with Raw(x-bpp) */
            add ax, cx
            inc ebx
            shr ax, 1            /* divide by 2 */
            add al, [edi+ebx-1]  /* Add Avg(x); -1 to offset inc ebx */
            cmp ebx, FullLength  /* Check if at end of array */
            mov [edi+ebx-1], al  /* Write back Raw(x); */
                                 /* mov does not affect flags; -1 to offset inc ebx */
            jb davg1lp
davg1end:
         } /* end _asm block */
      }
      return;

      case 8:   /* bpp == 8 */
      {
         _asm {
            /* Re-init address pointers and offset */
            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
            movq mm5, LBCarryMask
            mov edi, row        /* edi ==> Avg(x) */
            movq mm4, HBClearMask
            mov esi, prev_row   /* esi ==> Prior(x) */
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
                                       /* (NO NEED to correct position in loop below) */
davg8lp:
            movq mm0, [edi + ebx]
            movq mm3, mm5
            movq mm1, [esi + ebx]
            add ebx, 8
            pand mm3, mm1       /* get lsb for each prev_row byte */
            psrlq mm1, 1        /* divide prev_row bytes by 2 */
            pand mm3, mm2       /* get LBCarrys for each byte where both */
                                /* lsb's were == 1 */
            psrlq mm2, 1        /* divide raw bytes by 2 */
            pand mm1, mm4       /* clear invalid bit 7 of each byte */
            paddb mm0, mm3      /* add LBCarrys to Avg for each byte */
            pand mm2, mm4       /* clear invalid bit 7 of each byte */
            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
            paddb mm0, mm2      /* add (Raw/2) to Avg for each byte */
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm0
            movq mm2, mm0       /* reuse as Raw(x-bpp) */
            jb davg8lp
         } /* end _asm block */
      }
      break;
mm4 /* clear invalid bit 7 of each byte */ 2283 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */ 2284 pand mm2, mm4 /* clear invalid bit 7 of each byte */ 2285 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */ 2286 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */ 2287 cmp ebx, MMXLength 2288 movq [edi + ebx - 8], mm0 2289 movq mm2, mm0 /* reuse as Raw(x-bpp) */ 2290 jb davg8lp 2291 } /* end _asm block */ 2292 } 2293 break; 2294 default: /* bpp greater than 8 */ 2295 { 2296 _asm { 2297 movq mm5, LBCarryMask 2298 /* Re-init address pointers and offset */ 2299 mov ebx, diff /* ebx ==> x = offset to alignment boundary */ 2300 mov edi, row /* edi ==> Avg(x) */ 2301 movq mm4, HBClearMask 2302 mov edx, edi 2303 mov esi, prev_row /* esi ==> Prior(x) */ 2304 sub edx, bpp /* edx ==> Raw(x-bpp) */ 2305davgAlp: 2306 movq mm0, [edi + ebx] 2307 movq mm3, mm5 2308 movq mm1, [esi + ebx] 2309 pand mm3, mm1 /* get lsb for each prev_row byte */ 2310 movq mm2, [edx + ebx] 2311 psrlq mm1, 1 /* divide prev_row bytes by 2 */ 2312 pand mm3, mm2 /* get LBCarrys for each byte where both */ 2313 /* lsb's were == 1 */ 2314 psrlq mm2, 1 /* divide raw bytes by 2 */ 2315 pand mm1, mm4 /* clear invalid bit 7 of each byte */ 2316 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */ 2317 pand mm2, mm4 /* clear invalid bit 7 of each byte */ 2318 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */ 2319 add ebx, 8 2320 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */ 2321 cmp ebx, MMXLength 2322 movq [edi + ebx - 8], mm0 2323 jb davgAlp 2324 } /* end _asm block */ 2325 } 2326 break; 2327 } /* end switch ( bpp ) */ 2328 2329 _asm { 2330 /* MMX acceleration complete now do clean-up */ 2331 /* Check if any remaining bytes left to decode */ 2332 mov ebx, MMXLength /* ebx ==> x = offset bytes remaining after MMX */ 2333 mov edi, row /* edi ==> Avg(x) */ 2334 cmp ebx, FullLength /* Test if offset at end of array */ 2335 jnb davgend 2336 /* Do Paeth decode for remaining bytes */ 2337 mov esi, prev_row /* esi ==> Prior(x) */ 2338 mov edx, edi 2339 xor ecx, ecx /* zero ecx before using cl & cx in loop below */ 2340 sub edx, bpp /* edx ==> Raw(x-bpp) */ 2341davglp2: 2342 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */ 2343 xor eax, eax 2344 mov cl, [esi + ebx] /* load cl with Prior(x) */ 2345 mov al, [edx + ebx] /* load al with Raw(x-bpp) */ 2346 add ax, cx 2347 inc ebx 2348 shr ax, 1 /* divide by 2 */ 2349 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */ 2350 cmp ebx, FullLength /* Check if at end of array */ 2351 mov [edi+ebx-1], al /* Write back Raw(x); */ 2352 /* mov does not affect flags; -1 to offset inc ebx */ 2353 jb davglp2 2354davgend: 2355 emms /* End MMX instructions; prep for possible FP instrs. 
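
   The Avg loops above compute ((Raw(x-bpp) + Prior(x)) / 2) one byte at
   a time inside a 64-bit register by using the identity

      (a + b) / 2  ==  (a >> 1) + (b >> 1) + (a & b & 1)

   LBCarryMask keeps the low bit of each byte (the carry term a & b & 1)
   and HBClearMask clears the bit that psrlq shifts into bit 7 of each
   byte from its neighbor.  A scalar sketch of one step, for reference
   only (not part of the build; names are illustrative):

      png_byte avg_one(png_byte raw_xbpp, png_byte prior, png_byte avg)
      {
         int carry = raw_xbpp & prior & 1;
         return (png_byte)(avg + (raw_xbpp >> 1) + (prior >> 1) + carry);
      }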
*/ 2356 } /* end _asm block */ 2357} 2358 2359/* Optimized code for PNG Paeth filter decoder */ 2360void /* PRIVATE */ 2361png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row, 2362 png_bytep prev_row) 2363{ 2364 png_uint_32 FullLength; 2365 png_uint_32 MMXLength; 2366 /*png_uint_32 len; */ 2367 int bpp; 2368 int diff; 2369 /*int ptemp; */ 2370 int patemp, pbtemp, pctemp; 2371 2372 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */ 2373 FullLength = row_info->rowbytes; /* # of bytes to filter */ 2374 _asm 2375 { 2376 xor ebx, ebx /* ebx ==> x offset */ 2377 mov edi, row 2378 xor edx, edx /* edx ==> x-bpp offset */ 2379 mov esi, prev_row 2380 xor eax, eax 2381 2382 /* Compute the Raw value for the first bpp bytes */ 2383 /* Note: the formula works out to be always */ 2384 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */ 2385dpthrlp: 2386 mov al, [edi + ebx] 2387 add al, [esi + ebx] 2388 inc ebx 2389 cmp ebx, bpp 2390 mov [edi + ebx - 1], al 2391 jb dpthrlp 2392 /* get # of bytes to alignment */ 2393 mov diff, edi /* take start of row */ 2394 add diff, ebx /* add bpp */ 2395 xor ecx, ecx 2396 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */ 2397 and diff, 0xfffffff8 /* mask to alignment boundary */ 2398 sub diff, edi /* subtract from start ==> value ebx at alignment */ 2399 jz dpthgo 2400 /* fix alignment */ 2401dpthlp1: 2402 xor eax, eax 2403 /* pav = p - a = (a + b - c) - a = b - c */ 2404 mov al, [esi + ebx] /* load Prior(x) into al */ 2405 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 2406 sub eax, ecx /* subtract Prior(x-bpp) */ 2407 mov patemp, eax /* Save pav for later use */ 2408 xor eax, eax 2409 /* pbv = p - b = (a + b - c) - b = a - c */ 2410 mov al, [edi + edx] /* load Raw(x-bpp) into al */ 2411 sub eax, ecx /* subtract Prior(x-bpp) */ 2412 mov ecx, eax 2413 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2414 add eax, patemp /* pcv = pav + pbv */ 2415 /* pc = abs(pcv) */ 2416 test eax, 0x80000000 2417 jz dpthpca 2418 neg eax /* reverse sign of neg values */ 2419dpthpca: 2420 mov pctemp, eax /* save pc for later use */ 2421 /* pb = abs(pbv) */ 2422 test ecx, 0x80000000 2423 jz dpthpba 2424 neg ecx /* reverse sign of neg values */ 2425dpthpba: 2426 mov pbtemp, ecx /* save pb for later use */ 2427 /* pa = abs(pav) */ 2428 mov eax, patemp 2429 test eax, 0x80000000 2430 jz dpthpaa 2431 neg eax /* reverse sign of neg values */ 2432dpthpaa: 2433 mov patemp, eax /* save pa for later use */ 2434 /* test if pa <= pb */ 2435 cmp eax, ecx 2436 jna dpthabb 2437 /* pa > pb; now test if pb <= pc */ 2438 cmp ecx, pctemp 2439 jna dpthbbc 2440 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 2441 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 2442 jmp dpthpaeth 2443dpthbbc: 2444 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */ 2445 mov cl, [esi + ebx] /* load Prior(x) into cl */ 2446 jmp dpthpaeth 2447dpthabb: 2448 /* pa <= pb; now test if pa <= pc */ 2449 cmp eax, pctemp 2450 jna dpthabc 2451 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 2452 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 2453 jmp dpthpaeth 2454dpthabc: 2455 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */ 2456 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */ 2457dpthpaeth: 2458 inc ebx 2459 inc edx 2460 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */ 2461 add [edi + ebx - 1], cl 2462 cmp ebx, diff 2463 jb dpthlp1 2464dpthgo: 2465 mov ecx, FullLength 2466 mov eax, ecx 2467 sub eax, ebx /* subtract alignment fix */ 2468 and eax, 0x00000007 
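      /* MMXLength is FullLength rounded down so that (MMXLength - diff)
       * is a multiple of 8: eax now holds the row bytes in excess of the
       * last full quadword, and subtracting it from FullLength below
       * gives the bound for the MMX loops; the scalar loop at the end of
       * the function finishes the (at most 7) leftover bytes.  Example
       * with assumed values: FullLength == 100 and diff == 5 give
       * eax == 95 & 7 == 7 and MMXLength == 93, i.e. 88 bytes (11
       * quadwords) via MMX and 7 bytes via the clean-up loop. */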
/* calc bytes over mult of 8 */ 2469 sub ecx, eax /* drop over bytes from original length */ 2470 mov MMXLength, ecx 2471 } /* end _asm block */ 2472 /* Now do the math for the rest of the row */ 2473 switch ( bpp ) 2474 { 2475 case 3: 2476 { 2477 ActiveMask.use = 0x0000000000ffffff; 2478 ActiveMaskEnd.use = 0xffff000000000000; 2479 ShiftBpp.use = 24; /* == bpp(3) * 8 */ 2480 ShiftRem.use = 40; /* == 64 - 24 */ 2481 _asm 2482 { 2483 mov ebx, diff 2484 mov edi, row 2485 mov esi, prev_row 2486 pxor mm0, mm0 2487 /* PRIME the pump (load the first Raw(x-bpp) data set */ 2488 movq mm1, [edi+ebx-8] 2489dpth3lp: 2490 psrlq mm1, ShiftRem /* shift last 3 bytes to 1st 3 bytes */ 2491 movq mm2, [esi + ebx] /* load b=Prior(x) */ 2492 punpcklbw mm1, mm0 /* Unpack High bytes of a */ 2493 movq mm3, [esi+ebx-8] /* Prep c=Prior(x-bpp) bytes */ 2494 punpcklbw mm2, mm0 /* Unpack High bytes of b */ 2495 psrlq mm3, ShiftRem /* shift last 3 bytes to 1st 3 bytes */ 2496 /* pav = p - a = (a + b - c) - a = b - c */ 2497 movq mm4, mm2 2498 punpcklbw mm3, mm0 /* Unpack High bytes of c */ 2499 /* pbv = p - b = (a + b - c) - b = a - c */ 2500 movq mm5, mm1 2501 psubw mm4, mm3 2502 pxor mm7, mm7 2503 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2504 movq mm6, mm4 2505 psubw mm5, mm3 2506 2507 /* pa = abs(p-a) = abs(pav) */ 2508 /* pb = abs(p-b) = abs(pbv) */ 2509 /* pc = abs(p-c) = abs(pcv) */ 2510 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 2511 paddw mm6, mm5 2512 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 2513 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 2514 psubw mm4, mm0 2515 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 2516 psubw mm4, mm0 2517 psubw mm5, mm7 2518 pxor mm0, mm0 2519 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2520 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2521 psubw mm5, mm7 2522 psubw mm6, mm0 2523 /* test pa <= pb */ 2524 movq mm7, mm4 2525 psubw mm6, mm0 2526 pcmpgtw mm7, mm5 /* pa > pb? */ 2527 movq mm0, mm7 2528 /* use mm7 mask to merge pa & pb */ 2529 pand mm5, mm7 2530 /* use mm0 mask copy to merge a & b */ 2531 pand mm2, mm0 2532 pandn mm7, mm4 2533 pandn mm0, mm1 2534 paddw mm7, mm5 2535 paddw mm0, mm2 2536 /* test ((pa <= pb)? pa:pb) <= pc */ 2537 pcmpgtw mm7, mm6 /* pab > pc? 
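
   (The sequence above is the branch-free form of the Paeth chooser.
   abs() of each signed 16-bit difference is taken without a jump:
   pcmpgtw builds a mask m that is 0xffff exactly where the value v is
   negative, pand keeps only those negative words, and the two psubw's
   compute v - 2*(v & m), i.e. -v for negative words and v otherwise.
   The pa-vs-pb decision is likewise a masked merge; per 16-bit lane,
   roughly (sketch only, not compilable as written):

      m   = (pa > pb) ? 0xffff : 0x0000;
      pab = (pb & m) | (pa & ~m);      smaller of pa, pb
      ab  = ( b & m) | ( a & ~m);      the matching predictor

   and the compare against pc immediately below selects between that
   merged candidate and c in the same way.)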
*/ 2538 pxor mm1, mm1 2539 pand mm3, mm7 2540 pandn mm7, mm0 2541 paddw mm7, mm3 2542 pxor mm0, mm0 2543 packuswb mm7, mm1 2544 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */ 2545 pand mm7, ActiveMask 2546 movq mm2, mm3 /* load b=Prior(x) step 1 */ 2547 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */ 2548 punpcklbw mm3, mm0 /* Unpack High bytes of c */ 2549 movq [edi + ebx], mm7 /* write back updated value */ 2550 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */ 2551 /* Now do Paeth for 2nd set of bytes (3-5) */ 2552 psrlq mm2, ShiftBpp /* load b=Prior(x) step 2 */ 2553 punpcklbw mm1, mm0 /* Unpack High bytes of a */ 2554 pxor mm7, mm7 2555 punpcklbw mm2, mm0 /* Unpack High bytes of b */ 2556 /* pbv = p - b = (a + b - c) - b = a - c */ 2557 movq mm5, mm1 2558 /* pav = p - a = (a + b - c) - a = b - c */ 2559 movq mm4, mm2 2560 psubw mm5, mm3 2561 psubw mm4, mm3 2562 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */ 2563 /* pav + pbv = pbv + pav */ 2564 movq mm6, mm5 2565 paddw mm6, mm4 2566 2567 /* pa = abs(p-a) = abs(pav) */ 2568 /* pb = abs(p-b) = abs(pbv) */ 2569 /* pc = abs(p-c) = abs(pcv) */ 2570 pcmpgtw mm0, mm5 /* Create mask pbv bytes < 0 */ 2571 pcmpgtw mm7, mm4 /* Create mask pav bytes < 0 */ 2572 pand mm0, mm5 /* Only pbv bytes < 0 in mm0 */ 2573 pand mm7, mm4 /* Only pav bytes < 0 in mm7 */ 2574 psubw mm5, mm0 2575 psubw mm4, mm7 2576 psubw mm5, mm0 2577 psubw mm4, mm7 2578 pxor mm0, mm0 2579 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2580 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2581 psubw mm6, mm0 2582 /* test pa <= pb */ 2583 movq mm7, mm4 2584 psubw mm6, mm0 2585 pcmpgtw mm7, mm5 /* pa > pb? */ 2586 movq mm0, mm7 2587 /* use mm7 mask to merge pa & pb */ 2588 pand mm5, mm7 2589 /* use mm0 mask copy to merge a & b */ 2590 pand mm2, mm0 2591 pandn mm7, mm4 2592 pandn mm0, mm1 2593 paddw mm7, mm5 2594 paddw mm0, mm2 2595 /* test ((pa <= pb)? pa:pb) <= pc */ 2596 pcmpgtw mm7, mm6 /* pab > pc? 
*/ 2597 movq mm2, [esi + ebx] /* load b=Prior(x) */ 2598 pand mm3, mm7 2599 pandn mm7, mm0 2600 pxor mm1, mm1 2601 paddw mm7, mm3 2602 pxor mm0, mm0 2603 packuswb mm7, mm1 2604 movq mm3, mm2 /* load c=Prior(x-bpp) step 1 */ 2605 pand mm7, ActiveMask 2606 punpckhbw mm2, mm0 /* Unpack High bytes of b */ 2607 psllq mm7, ShiftBpp /* Shift bytes to 2nd group of 3 bytes */ 2608 /* pav = p - a = (a + b - c) - a = b - c */ 2609 movq mm4, mm2 2610 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */ 2611 psllq mm3, ShiftBpp /* load c=Prior(x-bpp) step 2 */ 2612 movq [edi + ebx], mm7 /* write back updated value */ 2613 movq mm1, mm7 2614 punpckhbw mm3, mm0 /* Unpack High bytes of c */ 2615 psllq mm1, ShiftBpp /* Shift bytes */ 2616 /* Now mm1 will be used as Raw(x-bpp) */ 2617 /* Now do Paeth for 3rd, and final, set of bytes (6-7) */ 2618 pxor mm7, mm7 2619 punpckhbw mm1, mm0 /* Unpack High bytes of a */ 2620 psubw mm4, mm3 2621 /* pbv = p - b = (a + b - c) - b = a - c */ 2622 movq mm5, mm1 2623 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2624 movq mm6, mm4 2625 psubw mm5, mm3 2626 pxor mm0, mm0 2627 paddw mm6, mm5 2628 2629 /* pa = abs(p-a) = abs(pav) */ 2630 /* pb = abs(p-b) = abs(pbv) */ 2631 /* pc = abs(p-c) = abs(pcv) */ 2632 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 2633 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 2634 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 2635 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 2636 psubw mm4, mm0 2637 psubw mm5, mm7 2638 psubw mm4, mm0 2639 psubw mm5, mm7 2640 pxor mm0, mm0 2641 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2642 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2643 psubw mm6, mm0 2644 /* test pa <= pb */ 2645 movq mm7, mm4 2646 psubw mm6, mm0 2647 pcmpgtw mm7, mm5 /* pa > pb? */ 2648 movq mm0, mm7 2649 /* use mm0 mask copy to merge a & b */ 2650 pand mm2, mm0 2651 /* use mm7 mask to merge pa & pb */ 2652 pand mm5, mm7 2653 pandn mm0, mm1 2654 pandn mm7, mm4 2655 paddw mm0, mm2 2656 paddw mm7, mm5 2657 /* test ((pa <= pb)? pa:pb) <= pc */ 2658 pcmpgtw mm7, mm6 /* pab > pc? 
*/ 2659 pand mm3, mm7 2660 pandn mm7, mm0 2661 paddw mm7, mm3 2662 pxor mm1, mm1 2663 packuswb mm1, mm7 2664 /* Step ebx to next set of 8 bytes and repeat loop til done */ 2665 add ebx, 8 2666 pand mm1, ActiveMaskEnd 2667 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */ 2668 2669 cmp ebx, MMXLength 2670 pxor mm0, mm0 /* pxor does not affect flags */ 2671 movq [edi + ebx - 8], mm1 /* write back updated value */ 2672 /* mm1 will be used as Raw(x-bpp) next loop */ 2673 /* mm3 ready to be used as Prior(x-bpp) next loop */ 2674 jb dpth3lp 2675 } /* end _asm block */ 2676 } 2677 break; 2678 2679 case 6: 2680 case 7: 2681 case 5: 2682 { 2683 ActiveMask.use = 0x00000000ffffffff; 2684 ActiveMask2.use = 0xffffffff00000000; 2685 ShiftBpp.use = bpp << 3; /* == bpp * 8 */ 2686 ShiftRem.use = 64 - ShiftBpp.use; 2687 _asm 2688 { 2689 mov ebx, diff 2690 mov edi, row 2691 mov esi, prev_row 2692 /* PRIME the pump (load the first Raw(x-bpp) data set */ 2693 movq mm1, [edi+ebx-8] 2694 pxor mm0, mm0 2695dpth6lp: 2696 /* Must shift to position Raw(x-bpp) data */ 2697 psrlq mm1, ShiftRem 2698 /* Do first set of 4 bytes */ 2699 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */ 2700 punpcklbw mm1, mm0 /* Unpack Low bytes of a */ 2701 movq mm2, [esi + ebx] /* load b=Prior(x) */ 2702 punpcklbw mm2, mm0 /* Unpack Low bytes of b */ 2703 /* Must shift to position Prior(x-bpp) data */ 2704 psrlq mm3, ShiftRem 2705 /* pav = p - a = (a + b - c) - a = b - c */ 2706 movq mm4, mm2 2707 punpcklbw mm3, mm0 /* Unpack Low bytes of c */ 2708 /* pbv = p - b = (a + b - c) - b = a - c */ 2709 movq mm5, mm1 2710 psubw mm4, mm3 2711 pxor mm7, mm7 2712 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2713 movq mm6, mm4 2714 psubw mm5, mm3 2715 /* pa = abs(p-a) = abs(pav) */ 2716 /* pb = abs(p-b) = abs(pbv) */ 2717 /* pc = abs(p-c) = abs(pcv) */ 2718 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 2719 paddw mm6, mm5 2720 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 2721 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 2722 psubw mm4, mm0 2723 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 2724 psubw mm4, mm0 2725 psubw mm5, mm7 2726 pxor mm0, mm0 2727 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2728 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2729 psubw mm5, mm7 2730 psubw mm6, mm0 2731 /* test pa <= pb */ 2732 movq mm7, mm4 2733 psubw mm6, mm0 2734 pcmpgtw mm7, mm5 /* pa > pb? */ 2735 movq mm0, mm7 2736 /* use mm7 mask to merge pa & pb */ 2737 pand mm5, mm7 2738 /* use mm0 mask copy to merge a & b */ 2739 pand mm2, mm0 2740 pandn mm7, mm4 2741 pandn mm0, mm1 2742 paddw mm7, mm5 2743 paddw mm0, mm2 2744 /* test ((pa <= pb)? pa:pb) <= pc */ 2745 pcmpgtw mm7, mm6 /* pab > pc? 
*/ 2746 pxor mm1, mm1 2747 pand mm3, mm7 2748 pandn mm7, mm0 2749 paddw mm7, mm3 2750 pxor mm0, mm0 2751 packuswb mm7, mm1 2752 movq mm3, [esi + ebx - 8] /* load c=Prior(x-bpp) */ 2753 pand mm7, ActiveMask 2754 psrlq mm3, ShiftRem 2755 movq mm2, [esi + ebx] /* load b=Prior(x) step 1 */ 2756 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */ 2757 movq mm6, mm2 2758 movq [edi + ebx], mm7 /* write back updated value */ 2759 movq mm1, [edi+ebx-8] 2760 psllq mm6, ShiftBpp 2761 movq mm5, mm7 2762 psrlq mm1, ShiftRem 2763 por mm3, mm6 2764 psllq mm5, ShiftBpp 2765 punpckhbw mm3, mm0 /* Unpack High bytes of c */ 2766 por mm1, mm5 2767 /* Do second set of 4 bytes */ 2768 punpckhbw mm2, mm0 /* Unpack High bytes of b */ 2769 punpckhbw mm1, mm0 /* Unpack High bytes of a */ 2770 /* pav = p - a = (a + b - c) - a = b - c */ 2771 movq mm4, mm2 2772 /* pbv = p - b = (a + b - c) - b = a - c */ 2773 movq mm5, mm1 2774 psubw mm4, mm3 2775 pxor mm7, mm7 2776 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2777 movq mm6, mm4 2778 psubw mm5, mm3 2779 /* pa = abs(p-a) = abs(pav) */ 2780 /* pb = abs(p-b) = abs(pbv) */ 2781 /* pc = abs(p-c) = abs(pcv) */ 2782 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 2783 paddw mm6, mm5 2784 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 2785 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 2786 psubw mm4, mm0 2787 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 2788 psubw mm4, mm0 2789 psubw mm5, mm7 2790 pxor mm0, mm0 2791 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2792 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2793 psubw mm5, mm7 2794 psubw mm6, mm0 2795 /* test pa <= pb */ 2796 movq mm7, mm4 2797 psubw mm6, mm0 2798 pcmpgtw mm7, mm5 /* pa > pb? */ 2799 movq mm0, mm7 2800 /* use mm7 mask to merge pa & pb */ 2801 pand mm5, mm7 2802 /* use mm0 mask copy to merge a & b */ 2803 pand mm2, mm0 2804 pandn mm7, mm4 2805 pandn mm0, mm1 2806 paddw mm7, mm5 2807 paddw mm0, mm2 2808 /* test ((pa <= pb)? pa:pb) <= pc */ 2809 pcmpgtw mm7, mm6 /* pab > pc? 
*/
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            /* Step ebx to next set of 8 bytes and repeat loop til done */
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1  /* write back updated value */
                                       /* mm1 will be used as Raw(x-bpp) next loop */
            jb dpth6lp
         } /* end _asm block */
      }
      break;

      case 4:
      {
         ActiveMask.use = 0x00000000ffffffff;
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            pxor mm0, mm0
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm1, [edi+ebx-8]      /* Only time should need to read */
                                       /* a=Raw(x-bpp) bytes */
dpth4lp:
            /* Do first set of 4 bytes */
            movq mm3, [esi+ebx-8]      /* read c=Prior(x-bpp) bytes */
            punpckhbw mm1, mm0         /* Unpack High bytes of a */
            movq mm2, [esi + ebx]      /* load b=Prior(x) */
            punpcklbw mm2, mm0         /* Unpack Low bytes of b */
            /* pav = p - a = (a + b - c) - a = b - c */
            movq mm4, mm2
            punpckhbw mm3, mm0         /* Unpack High bytes of c */
            /* pbv = p - b = (a + b - c) - b = a - c */
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
            movq mm6, mm4
            psubw mm5, mm3
            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4           /* Create mask pav bytes < 0 */
            paddw mm6, mm5
            pand mm0, mm4              /* Only pav bytes < 0 in mm0 */
            pcmpgtw mm7, mm5           /* Create mask pbv bytes < 0 */
            psubw mm4, mm0
            pand mm7, mm5              /* Only pbv bytes < 0 in mm7 */
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6           /* Create mask pcv bytes < 0 */
            pand mm0, mm6              /* Only pcv bytes < 0 in mm0 */
            psubw mm5, mm7
            psubw mm6, mm0
            /* test pa <= pb */
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5           /* pa > pb? */
            movq mm0, mm7
            /* use mm7 mask to merge pa & pb */
            pand mm5, mm7
            /* use mm0 mask copy to merge a & b */
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6           /* pab > pc?
*/ 2886 pxor mm1, mm1 2887 pand mm3, mm7 2888 pandn mm7, mm0 2889 paddw mm7, mm3 2890 pxor mm0, mm0 2891 packuswb mm7, mm1 2892 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */ 2893 pand mm7, ActiveMask 2894 movq mm2, mm3 /* load b=Prior(x) step 1 */ 2895 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */ 2896 punpcklbw mm3, mm0 /* Unpack High bytes of c */ 2897 movq [edi + ebx], mm7 /* write back updated value */ 2898 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */ 2899 /* Do second set of 4 bytes */ 2900 punpckhbw mm2, mm0 /* Unpack Low bytes of b */ 2901 punpcklbw mm1, mm0 /* Unpack Low bytes of a */ 2902 /* pav = p - a = (a + b - c) - a = b - c */ 2903 movq mm4, mm2 2904 /* pbv = p - b = (a + b - c) - b = a - c */ 2905 movq mm5, mm1 2906 psubw mm4, mm3 2907 pxor mm7, mm7 2908 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 2909 movq mm6, mm4 2910 psubw mm5, mm3 2911 /* pa = abs(p-a) = abs(pav) */ 2912 /* pb = abs(p-b) = abs(pbv) */ 2913 /* pc = abs(p-c) = abs(pcv) */ 2914 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 2915 paddw mm6, mm5 2916 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 2917 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 2918 psubw mm4, mm0 2919 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 2920 psubw mm4, mm0 2921 psubw mm5, mm7 2922 pxor mm0, mm0 2923 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 2924 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 2925 psubw mm5, mm7 2926 psubw mm6, mm0 2927 /* test pa <= pb */ 2928 movq mm7, mm4 2929 psubw mm6, mm0 2930 pcmpgtw mm7, mm5 /* pa > pb? */ 2931 movq mm0, mm7 2932 /* use mm7 mask to merge pa & pb */ 2933 pand mm5, mm7 2934 /* use mm0 mask copy to merge a & b */ 2935 pand mm2, mm0 2936 pandn mm7, mm4 2937 pandn mm0, mm1 2938 paddw mm7, mm5 2939 paddw mm0, mm2 2940 /* test ((pa <= pb)? pa:pb) <= pc */ 2941 pcmpgtw mm7, mm6 /* pab > pc? 
*/
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            /* Step ebx to next set of 8 bytes and repeat loop til done */
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1  /* write back updated value */
                                       /* mm1 will be used as Raw(x-bpp) next loop */
            jb dpth4lp
         } /* end _asm block */
      }
      break;
      case 8:                          /* bpp == 8 */
      {
         ActiveMask.use = 0x00000000ffffffff;
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, prev_row
            pxor mm0, mm0
            /* PRIME the pump (load the first Raw(x-bpp) data set) */
            movq mm1, [edi+ebx-8]      /* Only time should need to read */
                                       /* a=Raw(x-bpp) bytes */
dpth8lp:
            /* Do first set of 4 bytes */
            movq mm3, [esi+ebx-8]      /* read c=Prior(x-bpp) bytes */
            punpcklbw mm1, mm0         /* Unpack Low bytes of a */
            movq mm2, [esi + ebx]      /* load b=Prior(x) */
            punpcklbw mm2, mm0         /* Unpack Low bytes of b */
            /* pav = p - a = (a + b - c) - a = b - c */
            movq mm4, mm2
            punpcklbw mm3, mm0         /* Unpack Low bytes of c */
            /* pbv = p - b = (a + b - c) - b = a - c */
            movq mm5, mm1
            psubw mm4, mm3
            pxor mm7, mm7
            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
            movq mm6, mm4
            psubw mm5, mm3
            /* pa = abs(p-a) = abs(pav) */
            /* pb = abs(p-b) = abs(pbv) */
            /* pc = abs(p-c) = abs(pcv) */
            pcmpgtw mm0, mm4           /* Create mask pav bytes < 0 */
            paddw mm6, mm5
            pand mm0, mm4              /* Only pav bytes < 0 in mm0 */
            pcmpgtw mm7, mm5           /* Create mask pbv bytes < 0 */
            psubw mm4, mm0
            pand mm7, mm5              /* Only pbv bytes < 0 in mm7 */
            psubw mm4, mm0
            psubw mm5, mm7
            pxor mm0, mm0
            pcmpgtw mm0, mm6           /* Create mask pcv bytes < 0 */
            pand mm0, mm6              /* Only pcv bytes < 0 in mm0 */
            psubw mm5, mm7
            psubw mm6, mm0
            /* test pa <= pb */
            movq mm7, mm4
            psubw mm6, mm0
            pcmpgtw mm7, mm5           /* pa > pb? */
            movq mm0, mm7
            /* use mm7 mask to merge pa & pb */
            pand mm5, mm7
            /* use mm0 mask copy to merge a & b */
            pand mm2, mm0
            pandn mm7, mm4
            pandn mm0, mm1
            paddw mm7, mm5
            paddw mm0, mm2
            /* test ((pa <= pb)? pa:pb) <= pc */
            pcmpgtw mm7, mm6           /* pab > pc?
*/ 3017 pxor mm1, mm1 3018 pand mm3, mm7 3019 pandn mm7, mm0 3020 paddw mm7, mm3 3021 pxor mm0, mm0 3022 packuswb mm7, mm1 3023 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */ 3024 pand mm7, ActiveMask 3025 movq mm2, [esi + ebx] /* load b=Prior(x) */ 3026 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */ 3027 punpckhbw mm3, mm0 /* Unpack High bytes of c */ 3028 movq [edi + ebx], mm7 /* write back updated value */ 3029 movq mm1, [edi+ebx-8] /* read a=Raw(x-bpp) bytes */ 3030 3031 /* Do second set of 4 bytes */ 3032 punpckhbw mm2, mm0 /* Unpack High bytes of b */ 3033 punpckhbw mm1, mm0 /* Unpack High bytes of a */ 3034 /* pav = p - a = (a + b - c) - a = b - c */ 3035 movq mm4, mm2 3036 /* pbv = p - b = (a + b - c) - b = a - c */ 3037 movq mm5, mm1 3038 psubw mm4, mm3 3039 pxor mm7, mm7 3040 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3041 movq mm6, mm4 3042 psubw mm5, mm3 3043 /* pa = abs(p-a) = abs(pav) */ 3044 /* pb = abs(p-b) = abs(pbv) */ 3045 /* pc = abs(p-c) = abs(pcv) */ 3046 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */ 3047 paddw mm6, mm5 3048 pand mm0, mm4 /* Only pav bytes < 0 in mm7 */ 3049 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */ 3050 psubw mm4, mm0 3051 pand mm7, mm5 /* Only pbv bytes < 0 in mm0 */ 3052 psubw mm4, mm0 3053 psubw mm5, mm7 3054 pxor mm0, mm0 3055 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */ 3056 pand mm0, mm6 /* Only pav bytes < 0 in mm7 */ 3057 psubw mm5, mm7 3058 psubw mm6, mm0 3059 /* test pa <= pb */ 3060 movq mm7, mm4 3061 psubw mm6, mm0 3062 pcmpgtw mm7, mm5 /* pa > pb? */ 3063 movq mm0, mm7 3064 /* use mm7 mask to merge pa & pb */ 3065 pand mm5, mm7 3066 /* use mm0 mask copy to merge a & b */ 3067 pand mm2, mm0 3068 pandn mm7, mm4 3069 pandn mm0, mm1 3070 paddw mm7, mm5 3071 paddw mm0, mm2 3072 /* test ((pa <= pb)? pa:pb) <= pc */ 3073 pcmpgtw mm7, mm6 /* pab > pc? 
*/
            pxor mm1, mm1
            pand mm3, mm7
            pandn mm7, mm0
            pxor mm1, mm1
            paddw mm7, mm3
            pxor mm0, mm0
            /* Step ebx to next set of 8 bytes and repeat loop til done */
            add ebx, 8
            packuswb mm1, mm7
            paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
            cmp ebx, MMXLength
            movq [edi + ebx - 8], mm1  /* write back updated value */
                                       /* mm1 will be used as Raw(x-bpp) next loop */
            jb dpth8lp
         } /* end _asm block */
      }
      break;

      case 1:                          /* bpp = 1 */
      case 2:                          /* bpp = 2 */
      default:                         /* bpp > 8 */
      {
         _asm {
            mov ebx, diff
            cmp ebx, FullLength
            jnb dpthdend
            mov edi, row
            mov esi, prev_row
            /* Do Paeth decode for remaining bytes */
            mov edx, ebx
            xor ecx, ecx               /* zero ecx before using cl & cx in loop below */
            sub edx, bpp               /* Set edx = ebx - bpp */
dpthdlp:
            xor eax, eax
            /* pav = p - a = (a + b - c) - a = b - c */
            mov al, [esi + ebx]        /* load Prior(x) into al */
            mov cl, [esi + edx]        /* load Prior(x-bpp) into cl */
            sub eax, ecx               /* subtract Prior(x-bpp) */
            mov patemp, eax            /* Save pav for later use */
            xor eax, eax
            /* pbv = p - b = (a + b - c) - b = a - c */
            mov al, [edi + edx]        /* load Raw(x-bpp) into al */
            sub eax, ecx               /* subtract Prior(x-bpp) */
            mov ecx, eax
            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
            add eax, patemp            /* pcv = pav + pbv */
            /* pc = abs(pcv) */
            test eax, 0x80000000
            jz dpthdpca
            neg eax                    /* reverse sign of neg values */
dpthdpca:
            mov pctemp, eax            /* save pc for later use */
            /* pb = abs(pbv) */
            test ecx, 0x80000000
            jz dpthdpba
            neg ecx                    /* reverse sign of neg values */
dpthdpba:
            mov pbtemp, ecx            /* save pb for later use */
            /* pa = abs(pav) */
            mov eax, patemp
            test eax, 0x80000000
            jz dpthdpaa
            neg eax                    /* reverse sign of neg values */
dpthdpaa:
            mov patemp, eax            /* save pa for later use */
            /* test if pa <= pb */
            cmp eax, ecx
            jna dpthdabb
            /* pa > pb; now test if pb <= pc */
            cmp ecx, pctemp
            jna dpthdbbc
            /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
            mov cl, [esi + edx]        /* load Prior(x-bpp) into cl */
            jmp dpthdpaeth
dpthdbbc:
            /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
            mov cl, [esi + ebx]        /* load Prior(x) into cl */
            jmp dpthdpaeth
dpthdabb:
            /* pa <= pb; now test if pa <= pc */
            cmp eax, pctemp
            jna dpthdabc
            /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
            mov cl, [esi + edx]        /* load Prior(x-bpp) into cl */
            jmp dpthdpaeth
dpthdabc:
            /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
            mov cl, [edi + edx]        /* load Raw(x-bpp) into cl */
dpthdpaeth:
            inc ebx
            inc edx
            /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
            add [edi + ebx - 1], cl
            cmp ebx, FullLength
            jb dpthdlp
dpthdend:
         } /* end _asm block */
      }
      return;                          /* No need to go further with this one */
   }                                   /* end switch ( bpp ) */
   _asm
   {
      /* MMX acceleration complete now do clean-up */
      /* Check if any remaining bytes left to decode */
      mov ebx, MMXLength
      cmp ebx, FullLength
      jnb dpthend
      mov edi, row
      mov esi, prev_row
      /* Do Paeth decode for remaining bytes */
      mov edx, ebx
      xor ecx, ecx                     /* zero ecx before using cl & cx in loop below */
      sub edx, bpp                     /* Set edx = ebx - bpp */
dpthlp2:
      xor eax, eax
      /* pav = p - a
= (a + b - c) - a = b - c */ 3190 mov al, [esi + ebx] /* load Prior(x) into al */ 3191 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 3192 sub eax, ecx /* subtract Prior(x-bpp) */ 3193 mov patemp, eax /* Save pav for later use */ 3194 xor eax, eax 3195 /* pbv = p - b = (a + b - c) - b = a - c */ 3196 mov al, [edi + edx] /* load Raw(x-bpp) into al */ 3197 sub eax, ecx /* subtract Prior(x-bpp) */ 3198 mov ecx, eax 3199 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3200 add eax, patemp /* pcv = pav + pbv */ 3201 /* pc = abs(pcv) */ 3202 test eax, 0x80000000 3203 jz dpthpca2 3204 neg eax /* reverse sign of neg values */ 3205dpthpca2: 3206 mov pctemp, eax /* save pc for later use */ 3207 /* pb = abs(pbv) */ 3208 test ecx, 0x80000000 3209 jz dpthpba2 3210 neg ecx /* reverse sign of neg values */ 3211dpthpba2: 3212 mov pbtemp, ecx /* save pb for later use */ 3213 /* pa = abs(pav) */ 3214 mov eax, patemp 3215 test eax, 0x80000000 3216 jz dpthpaa2 3217 neg eax /* reverse sign of neg values */ 3218dpthpaa2: 3219 mov patemp, eax /* save pa for later use */ 3220 /* test if pa <= pb */ 3221 cmp eax, ecx 3222 jna dpthabb2 3223 /* pa > pb; now test if pb <= pc */ 3224 cmp ecx, pctemp 3225 jna dpthbbc2 3226 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 3227 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 3228 jmp dpthpaeth2 3229dpthbbc2: 3230 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */ 3231 mov cl, [esi + ebx] /* load Prior(x) into cl */ 3232 jmp dpthpaeth2 3233dpthabb2: 3234 /* pa <= pb; now test if pa <= pc */ 3235 cmp eax, pctemp 3236 jna dpthabc2 3237 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 3238 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */ 3239 jmp dpthpaeth2 3240dpthabc2: 3241 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */ 3242 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */ 3243dpthpaeth2: 3244 inc ebx 3245 inc edx 3246 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */ 3247 add [edi + ebx - 1], cl 3248 cmp ebx, FullLength 3249 jb dpthlp2 3250dpthend: 3251 emms /* End MMX instructions; prep for possible FP instrs. 
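
   Worked example of the predictor implemented above, with assumed byte
   values: a = Raw(x-bpp) = 3, b = Prior(x) = 5, c = Prior(x-bpp) = 2.
   The estimate is p = a + b - c = 6, so pa = abs(p-a) = 3,
   pb = abs(p-b) = 1 and pc = abs(p-c) = 4; pb is smallest, so b = 5 is
   the value added to the filtered byte.  On ties the order is a, then
   b, then c (pa <= pb and pa <= pc pick a; otherwise pb <= pc picks b),
   which both the scalar jump ladders and the pand/pandn merges
   preserve, as the Paeth filter definition requires.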
*/ 3252 } /* end _asm block */ 3253} 3254 3255/* Optimized code for PNG Sub filter decoder */ 3256void /* PRIVATE */ 3257png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row) 3258{ 3259 /*int test; */ 3260 int bpp; 3261 png_uint_32 FullLength; 3262 png_uint_32 MMXLength; 3263 int diff; 3264 3265 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */ 3266 FullLength = row_info->rowbytes - bpp; /* # of bytes to filter */ 3267 _asm { 3268 mov edi, row 3269 mov esi, edi /* lp = row */ 3270 add edi, bpp /* rp = row + bpp */ 3271 xor eax, eax 3272 /* get # of bytes to alignment */ 3273 mov diff, edi /* take start of row */ 3274 add diff, 0xf /* add 7 + 8 to incr past */ 3275 /* alignment boundary */ 3276 xor ebx, ebx 3277 and diff, 0xfffffff8 /* mask to alignment boundary */ 3278 sub diff, edi /* subtract from start ==> value */ 3279 /* ebx at alignment */ 3280 jz dsubgo 3281 /* fix alignment */ 3282dsublp1: 3283 mov al, [esi+ebx] 3284 add [edi+ebx], al 3285 inc ebx 3286 cmp ebx, diff 3287 jb dsublp1 3288dsubgo: 3289 mov ecx, FullLength 3290 mov edx, ecx 3291 sub edx, ebx /* subtract alignment fix */ 3292 and edx, 0x00000007 /* calc bytes over mult of 8 */ 3293 sub ecx, edx /* drop over bytes from length */ 3294 mov MMXLength, ecx 3295 } /* end _asm block */ 3296 3297 /* Now do the math for the rest of the row */ 3298 switch ( bpp ) 3299 { 3300 case 3: 3301 { 3302 ActiveMask.use = 0x0000ffffff000000; 3303 ShiftBpp.use = 24; /* == 3 * 8 */ 3304 ShiftRem.use = 40; /* == 64 - 24 */ 3305 _asm { 3306 mov edi, row 3307 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */ 3308 mov esi, edi /* lp = row */ 3309 add edi, bpp /* rp = row + bpp */ 3310 movq mm6, mm7 3311 mov ebx, diff 3312 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */ 3313 /* byte group */ 3314 /* PRIME the pump (load the first Raw(x-bpp) data set */ 3315 movq mm1, [edi+ebx-8] 3316dsub3lp: 3317 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */ 3318 /* no need for mask; shift clears inactive bytes */ 3319 /* Add 1st active group */ 3320 movq mm0, [edi+ebx] 3321 paddb mm0, mm1 3322 /* Add 2nd active group */ 3323 movq mm1, mm0 /* mov updated Raws to mm1 */ 3324 psllq mm1, ShiftBpp /* shift data to position correctly */ 3325 pand mm1, mm7 /* mask to use only 2nd active group */ 3326 paddb mm0, mm1 3327 /* Add 3rd active group */ 3328 movq mm1, mm0 /* mov updated Raws to mm1 */ 3329 psllq mm1, ShiftBpp /* shift data to position correctly */ 3330 pand mm1, mm6 /* mask to use only 3rd active group */ 3331 add ebx, 8 3332 paddb mm0, mm1 3333 cmp ebx, MMXLength 3334 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */ 3335 /* Prep for doing 1st add at top of loop */ 3336 movq mm1, mm0 3337 jb dsub3lp 3338 } /* end _asm block */ 3339 } 3340 break; 3341 3342 case 1: 3343 { 3344 /* Placed here just in case this is a duplicate of the */ 3345 /* non-MMX code for the SUB filter in png_read_filter_row below */ 3346 // 3347 /* png_bytep rp; */ 3348 /* png_bytep lp; */ 3349 /* png_uint_32 i; */ 3350 /* bpp = (row_info->pixel_depth + 7) >> 3; */ 3351 /* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */ 3352 /* i < row_info->rowbytes; i++, rp++, lp++) */ 3353 /* { */ 3354 /* *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */ 3355 /* } */ 3356 _asm { 3357 mov ebx, diff 3358 mov edi, row 3359 cmp ebx, FullLength 3360 jnb dsub1end 3361 mov esi, edi /* lp = row */ 3362 xor eax, eax 3363 add edi, bpp /* rp = row + bpp */ 3364dsub1lp: 3365 mov al, [esi+ebx] 3366 add [edi+ebx], 
al 3367 inc ebx 3368 cmp ebx, FullLength 3369 jb dsub1lp 3370dsub1end: 3371 } /* end _asm block */ 3372 } 3373 return; 3374 3375 case 6: 3376 case 7: 3377 case 4: 3378 case 5: 3379 { 3380 ShiftBpp.use = bpp << 3; 3381 ShiftRem.use = 64 - ShiftBpp.use; 3382 _asm { 3383 mov edi, row 3384 mov ebx, diff 3385 mov esi, edi /* lp = row */ 3386 add edi, bpp /* rp = row + bpp */ 3387 /* PRIME the pump (load the first Raw(x-bpp) data set */ 3388 movq mm1, [edi+ebx-8] 3389dsub4lp: 3390 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */ 3391 /* no need for mask; shift clears inactive bytes */ 3392 movq mm0, [edi+ebx] 3393 paddb mm0, mm1 3394 /* Add 2nd active group */ 3395 movq mm1, mm0 /* mov updated Raws to mm1 */ 3396 psllq mm1, ShiftBpp /* shift data to position correctly */ 3397 /* there is no need for any mask */ 3398 /* since shift clears inactive bits/bytes */ 3399 add ebx, 8 3400 paddb mm0, mm1 3401 cmp ebx, MMXLength 3402 movq [edi+ebx-8], mm0 3403 movq mm1, mm0 /* Prep for doing 1st add at top of loop */ 3404 jb dsub4lp 3405 } /* end _asm block */ 3406 } 3407 break; 3408 3409 case 2: 3410 { 3411 ActiveMask.use = 0x00000000ffff0000; 3412 ShiftBpp.use = 16; /* == 2 * 8 */ 3413 ShiftRem.use = 48; /* == 64 - 16 */ 3414 _asm { 3415 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */ 3416 mov ebx, diff 3417 movq mm6, mm7 3418 mov edi, row 3419 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */ 3420 /* byte group */ 3421 mov esi, edi /* lp = row */ 3422 movq mm5, mm6 3423 add edi, bpp /* rp = row + bpp */ 3424 psllq mm5, ShiftBpp /* Move mask in mm5 to cover 4th active */ 3425 /* byte group */ 3426 /* PRIME the pump (load the first Raw(x-bpp) data set */ 3427 movq mm1, [edi+ebx-8] 3428dsub2lp: 3429 /* Add 1st active group */ 3430 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */ 3431 /* no need for mask; shift clears inactive */ 3432 /* bytes */ 3433 movq mm0, [edi+ebx] 3434 paddb mm0, mm1 3435 /* Add 2nd active group */ 3436 movq mm1, mm0 /* mov updated Raws to mm1 */ 3437 psllq mm1, ShiftBpp /* shift data to position correctly */ 3438 pand mm1, mm7 /* mask to use only 2nd active group */ 3439 paddb mm0, mm1 3440 /* Add 3rd active group */ 3441 movq mm1, mm0 /* mov updated Raws to mm1 */ 3442 psllq mm1, ShiftBpp /* shift data to position correctly */ 3443 pand mm1, mm6 /* mask to use only 3rd active group */ 3444 paddb mm0, mm1 3445 /* Add 4th active group */ 3446 movq mm1, mm0 /* mov updated Raws to mm1 */ 3447 psllq mm1, ShiftBpp /* shift data to position correctly */ 3448 pand mm1, mm5 /* mask to use only 4th active group */ 3449 add ebx, 8 3450 paddb mm0, mm1 3451 cmp ebx, MMXLength 3452 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */ 3453 movq mm1, mm0 /* Prep for doing 1st add at top of loop */ 3454 jb dsub2lp 3455 } /* end _asm block */ 3456 } 3457 break; 3458 case 8: 3459 { 3460 _asm { 3461 mov edi, row 3462 mov ebx, diff 3463 mov esi, edi /* lp = row */ 3464 add edi, bpp /* rp = row + bpp */ 3465 mov ecx, MMXLength 3466 movq mm7, [edi+ebx-8] /* PRIME the pump (load the first */ 3467 /* Raw(x-bpp) data set */ 3468 and ecx, 0x0000003f /* calc bytes over mult of 64 */ 3469dsub8lp: 3470 movq mm0, [edi+ebx] /* Load Sub(x) for 1st 8 bytes */ 3471 paddb mm0, mm7 3472 movq mm1, [edi+ebx+8] /* Load Sub(x) for 2nd 8 bytes */ 3473 movq [edi+ebx], mm0 /* Write Raw(x) for 1st 8 bytes */ 3474 /* Now mm0 will be used as Raw(x-bpp) for */ 3475 /* the 2nd group of 8 bytes. 
This will be */ 3476 /* repeated for each group of 8 bytes with */ 3477 /* the 8th group being used as the Raw(x-bpp) */ 3478 /* for the 1st group of the next loop. */ 3479 paddb mm1, mm0 3480 movq mm2, [edi+ebx+16] /* Load Sub(x) for 3rd 8 bytes */ 3481 movq [edi+ebx+8], mm1 /* Write Raw(x) for 2nd 8 bytes */ 3482 paddb mm2, mm1 3483 movq mm3, [edi+ebx+24] /* Load Sub(x) for 4th 8 bytes */ 3484 movq [edi+ebx+16], mm2 /* Write Raw(x) for 3rd 8 bytes */ 3485 paddb mm3, mm2 3486 movq mm4, [edi+ebx+32] /* Load Sub(x) for 5th 8 bytes */ 3487 movq [edi+ebx+24], mm3 /* Write Raw(x) for 4th 8 bytes */ 3488 paddb mm4, mm3 3489 movq mm5, [edi+ebx+40] /* Load Sub(x) for 6th 8 bytes */ 3490 movq [edi+ebx+32], mm4 /* Write Raw(x) for 5th 8 bytes */ 3491 paddb mm5, mm4 3492 movq mm6, [edi+ebx+48] /* Load Sub(x) for 7th 8 bytes */ 3493 movq [edi+ebx+40], mm5 /* Write Raw(x) for 6th 8 bytes */ 3494 paddb mm6, mm5 3495 movq mm7, [edi+ebx+56] /* Load Sub(x) for 8th 8 bytes */ 3496 movq [edi+ebx+48], mm6 /* Write Raw(x) for 7th 8 bytes */ 3497 add ebx, 64 3498 paddb mm7, mm6 3499 cmp ebx, ecx 3500 movq [edi+ebx-8], mm7 /* Write Raw(x) for 8th 8 bytes */ 3501 jb dsub8lp 3502 cmp ebx, MMXLength 3503 jnb dsub8lt8 3504dsub8lpA: 3505 movq mm0, [edi+ebx] 3506 add ebx, 8 3507 paddb mm0, mm7 3508 cmp ebx, MMXLength 3509 movq [edi+ebx-8], mm0 /* use -8 to offset early add to ebx */ 3510 movq mm7, mm0 /* Move calculated Raw(x) data to mm1 to */ 3511 /* be the new Raw(x-bpp) for the next loop */ 3512 jb dsub8lpA 3513dsub8lt8: 3514 } /* end _asm block */ 3515 } 3516 break; 3517 3518 default: /* bpp greater than 8 bytes */ 3519 { 3520 _asm { 3521 mov ebx, diff 3522 mov edi, row 3523 mov esi, edi /* lp = row */ 3524 add edi, bpp /* rp = row + bpp */ 3525dsubAlp: 3526 movq mm0, [edi+ebx] 3527 movq mm1, [esi+ebx] 3528 add ebx, 8 3529 paddb mm0, mm1 3530 cmp ebx, MMXLength 3531 movq [edi+ebx-8], mm0 /* mov does not affect flags; -8 to offset */ 3532 /* add ebx */ 3533 jb dsubAlp 3534 } /* end _asm block */ 3535 } 3536 break; 3537 3538 } /* end switch ( bpp ) */ 3539 3540 _asm { 3541 mov ebx, MMXLength 3542 mov edi, row 3543 cmp ebx, FullLength 3544 jnb dsubend 3545 mov esi, edi /* lp = row */ 3546 xor eax, eax 3547 add edi, bpp /* rp = row + bpp */ 3548dsublp2: 3549 mov al, [esi+ebx] 3550 add [edi+ebx], al 3551 inc ebx 3552 cmp ebx, FullLength 3553 jb dsublp2 3554dsubend: 3555 emms /* End MMX instructions; prep for possible FP instrs. 
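
   All of the Sub cases above implement the same recurrence,
   Raw(x) = Sub(x) + Raw(x-bpp), feeding each paddb result back in as
   the next Raw(x-bpp); a scalar sketch of the whole row (reference
   only, equivalent to the commented-out C in the bpp == 1 case):

      for (x = bpp; x < row_info->rowbytes; x++)
         row[x] = (png_byte)(row[x] + row[x - bpp]);

   The bpp == 8 case can be unrolled so deeply because the previous
   pixel then lies exactly one quadword back, so each 64-bit result is
   reusable as the next Raw(x-bpp) with no shifting or masking.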
*/ 3556 } /* end _asm block */ 3557} 3558 3559/* Optimized code for PNG Up filter decoder */ 3560void /* PRIVATE */ 3561png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row, 3562 png_bytep prev_row) 3563{ 3564 png_uint_32 len; 3565 len = row_info->rowbytes; /* # of bytes to filter */ 3566 _asm { 3567 mov edi, row 3568 /* get # of bytes to alignment */ 3569 mov ecx, edi 3570 xor ebx, ebx 3571 add ecx, 0x7 3572 xor eax, eax 3573 and ecx, 0xfffffff8 3574 mov esi, prev_row 3575 sub ecx, edi 3576 jz dupgo 3577 /* fix alignment */ 3578duplp1: 3579 mov al, [edi+ebx] 3580 add al, [esi+ebx] 3581 inc ebx 3582 cmp ebx, ecx 3583 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */ 3584 jb duplp1 3585dupgo: 3586 mov ecx, len 3587 mov edx, ecx 3588 sub edx, ebx /* subtract alignment fix */ 3589 and edx, 0x0000003f /* calc bytes over mult of 64 */ 3590 sub ecx, edx /* drop over bytes from length */ 3591 /* Unrolled loop - use all MMX registers and interleave to reduce */ 3592 /* number of branch instructions (loops) and reduce partial stalls */ 3593duploop: 3594 movq mm1, [esi+ebx] 3595 movq mm0, [edi+ebx] 3596 movq mm3, [esi+ebx+8] 3597 paddb mm0, mm1 3598 movq mm2, [edi+ebx+8] 3599 movq [edi+ebx], mm0 3600 paddb mm2, mm3 3601 movq mm5, [esi+ebx+16] 3602 movq [edi+ebx+8], mm2 3603 movq mm4, [edi+ebx+16] 3604 movq mm7, [esi+ebx+24] 3605 paddb mm4, mm5 3606 movq mm6, [edi+ebx+24] 3607 movq [edi+ebx+16], mm4 3608 paddb mm6, mm7 3609 movq mm1, [esi+ebx+32] 3610 movq [edi+ebx+24], mm6 3611 movq mm0, [edi+ebx+32] 3612 movq mm3, [esi+ebx+40] 3613 paddb mm0, mm1 3614 movq mm2, [edi+ebx+40] 3615 movq [edi+ebx+32], mm0 3616 paddb mm2, mm3 3617 movq mm5, [esi+ebx+48] 3618 movq [edi+ebx+40], mm2 3619 movq mm4, [edi+ebx+48] 3620 movq mm7, [esi+ebx+56] 3621 paddb mm4, mm5 3622 movq mm6, [edi+ebx+56] 3623 movq [edi+ebx+48], mm4 3624 add ebx, 64 3625 paddb mm6, mm7 3626 cmp ebx, ecx 3627 movq [edi+ebx-8], mm6 /* (+56)movq does not affect flags; */ 3628 /* -8 to offset add ebx */ 3629 jb duploop 3630 3631 cmp edx, 0 /* Test for bytes over mult of 64 */ 3632 jz dupend 3633 3634 3635 /* 2 lines added by lcreeve@netins.net */ 3636 /* (mail 11 Jul 98 in png-implement list) */ 3637 cmp edx, 8 /*test for less than 8 bytes */ 3638 jb duplt8 3639 3640 3641 add ecx, edx 3642 and edx, 0x00000007 /* calc bytes over mult of 8 */ 3643 sub ecx, edx /* drop over bytes from length */ 3644 jz duplt8 3645 /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */ 3646duplpA: 3647 movq mm1, [esi+ebx] 3648 movq mm0, [edi+ebx] 3649 add ebx, 8 3650 paddb mm0, mm1 3651 cmp ebx, ecx 3652 movq [edi+ebx-8], mm0 /* movq does not affect flags; -8 to offset add ebx */ 3653 jb duplpA 3654 cmp edx, 0 /* Test for bytes over mult of 8 */ 3655 jz dupend 3656duplt8: 3657 xor eax, eax 3658 add ecx, edx /* move over byte count into counter */ 3659 /* Loop using x86 registers to update remaining bytes */ 3660duplp2: 3661 mov al, [edi + ebx] 3662 add al, [esi + ebx] 3663 inc ebx 3664 cmp ebx, ecx 3665 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */ 3666 jb duplp2 3667dupend: 3668 /* Conversion of filtered row completed */ 3669 emms /* End MMX instructions; prep for possible FP instrs. 
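
   Unlike Sub, Avg and Paeth, the Up filter has no dependency within
   the row at all (Raw(x) = Up(x) + Prior(x)), so every byte can be
   computed independently; the big interleaved loop above exists only
   to keep eight MMX additions in flight per branch.  Scalar sketch
   (reference only):

      for (x = 0; x < row_info->rowbytes; x++)
         row[x] = (png_byte)(row[x] + prev_row[x]);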
*/ 3670 } /* end _asm block */ 3671} 3672 3673 3674/* Optimized png_read_filter_row routines */ 3675void /* PRIVATE */ 3676png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep 3677 row, png_bytep prev_row, int filter) 3678{ 3679#ifdef PNG_DEBUG 3680 char filnm[10]; 3681#endif 3682 3683 if (mmx_supported == 2) { 3684#if !defined(PNG_1_0_X) 3685 /* this should have happened in png_init_mmx_flags() already */ 3686 png_warning(png_ptr, "asm_flags may not have been initialized"); 3687#endif 3688 png_mmx_support(); 3689 } 3690 3691#ifdef PNG_DEBUG 3692 png_debug(1, "in png_read_filter_row\n"); 3693 switch (filter) 3694 { 3695 case 0: sprintf(filnm, "none"); 3696 break; 3697#if !defined(PNG_1_0_X) 3698 case 1: sprintf(filnm, "sub-%s", 3699 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86"); 3700 break; 3701 case 2: sprintf(filnm, "up-%s", 3702 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86"); 3703 break; 3704 case 3: sprintf(filnm, "avg-%s", 3705 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86"); 3706 break; 3707 case 4: sprintf(filnm, "Paeth-%s", 3708 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86"); 3709 break; 3710#else 3711 case 1: sprintf(filnm, "sub"); 3712 break; 3713 case 2: sprintf(filnm, "up"); 3714 break; 3715 case 3: sprintf(filnm, "avg"); 3716 break; 3717 case 4: sprintf(filnm, "Paeth"); 3718 break; 3719#endif 3720 default: sprintf(filnm, "unknw"); 3721 break; 3722 } 3723 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm); 3724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth, 3725 (int)((row_info->pixel_depth + 7) >> 3)); 3726 png_debug1(0,"len=%8d, ", row_info->rowbytes); 3727#endif /* PNG_DEBUG */ 3728 3729 switch (filter) 3730 { 3731 case PNG_FILTER_VALUE_NONE: 3732 break; 3733 3734 case PNG_FILTER_VALUE_SUB: 3735 { 3736#if !defined(PNG_1_0_X) 3737 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) && 3738 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 3739 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 3740#else 3741 if (mmx_supported) 3742#endif 3743 { 3744 png_read_filter_row_mmx_sub(row_info, row); 3745 } 3746 else 3747 { 3748 png_uint_32 i; 3749 png_uint_32 istop = row_info->rowbytes; 3750 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3751 png_bytep rp = row + bpp; 3752 png_bytep lp = row; 3753 3754 for (i = bpp; i < istop; i++) 3755 { 3756 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); 3757 rp++; 3758 } 3759 } 3760 break; 3761 } 3762 3763 case PNG_FILTER_VALUE_UP: 3764 { 3765#if !defined(PNG_1_0_X) 3766 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) && 3767 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 3768 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 3769#else 3770 if (mmx_supported) 3771#endif 3772 { 3773 png_read_filter_row_mmx_up(row_info, row, prev_row); 3774 } 3775 else 3776 { 3777 png_uint_32 i; 3778 png_uint_32 istop = row_info->rowbytes; 3779 png_bytep rp = row; 3780 png_bytep pp = prev_row; 3781 3782 for (i = 0; i < istop; ++i) 3783 { 3784 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 3785 rp++; 3786 } 3787 } 3788 break; 3789 } 3790 3791 case PNG_FILTER_VALUE_AVG: 3792 { 3793#if !defined(PNG_1_0_X) 3794 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) && 3795 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 3796 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 3797#else 3798 if (mmx_supported) 3799#endif 3800 { 
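         /* Note: in the libpng-1.2 branch the asm_flags bit alone is not
          * enough to reach this call; the pixel-depth and rowbytes
          * thresholds tested above keep very shallow or very short rows
          * on the scalar path, presumably because the MMX setup and emms
          * overhead would outweigh the gain there. */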
3801 png_read_filter_row_mmx_avg(row_info, row, prev_row); 3802 } 3803 else 3804 { 3805 png_uint_32 i; 3806 png_bytep rp = row; 3807 png_bytep pp = prev_row; 3808 png_bytep lp = row; 3809 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3810 png_uint_32 istop = row_info->rowbytes - bpp; 3811 3812 for (i = 0; i < bpp; i++) 3813 { 3814 *rp = (png_byte)(((int)(*rp) + 3815 ((int)(*pp++) >> 1)) & 0xff); 3816 rp++; 3817 } 3818 3819 for (i = 0; i < istop; i++) 3820 { 3821 *rp = (png_byte)(((int)(*rp) + 3822 ((int)(*pp++ + *lp++) >> 1)) & 0xff); 3823 rp++; 3824 } 3825 } 3826 break; 3827 } 3828 3829 case PNG_FILTER_VALUE_PAETH: 3830 { 3831#if !defined(PNG_1_0_X) 3832 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) && 3833 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 3834 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 3835#else 3836 if (mmx_supported) 3837#endif 3838 { 3839 png_read_filter_row_mmx_paeth(row_info, row, prev_row); 3840 } 3841 else 3842 { 3843 png_uint_32 i; 3844 png_bytep rp = row; 3845 png_bytep pp = prev_row; 3846 png_bytep lp = row; 3847 png_bytep cp = prev_row; 3848 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 3849 png_uint_32 istop=row_info->rowbytes - bpp; 3850 3851 for (i = 0; i < bpp; i++) 3852 { 3853 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 3854 rp++; 3855 } 3856 3857 for (i = 0; i < istop; i++) /* use leftover rp,pp */ 3858 { 3859 int a, b, c, pa, pb, pc, p; 3860 3861 a = *lp++; 3862 b = *pp++; 3863 c = *cp++; 3864 3865 p = b - c; 3866 pc = a - c; 3867 3868#ifdef PNG_USE_ABS 3869 pa = abs(p); 3870 pb = abs(pc); 3871 pc = abs(p + pc); 3872#else 3873 pa = p < 0 ? -p : p; 3874 pb = pc < 0 ? -pc : pc; 3875 pc = (p + pc) < 0 ? -(p + pc) : p + pc; 3876#endif 3877 3878 /* 3879 if (pa <= pb && pa <= pc) 3880 p = a; 3881 else if (pb <= pc) 3882 p = b; 3883 else 3884 p = c; 3885 */ 3886 3887 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c; 3888 3889 *rp = (png_byte)(((int)(*rp) + p) & 0xff); 3890 rp++; 3891 } 3892 } 3893 break; 3894 } 3895 3896 default: 3897 png_warning(png_ptr, "Ignoring bad row filter type"); 3898 *row=0; 3899 break; 3900 } 3901} 3902 3903#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */ 3904