1/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file 2 * 3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler. 4 * 5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm 6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm 7 * for Intel's performance analysis of the MMX vs. non-MMX code. 8 * 9 * libpng version 1.2.7 - September 12, 2004 10 * For conditions of distribution and use, see copyright notice in png.h 11 * Copyright (c) 1998-2004 Glenn Randers-Pehrson 12 * Copyright (c) 1998, Intel Corporation 13 * 14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998. 15 * Interface to libpng contributed by Gilles Vollant, 1999. 16 * GNU C port by Greg Roelofs, 1999-2001. 17 * 18 * Lines 2350-4300 converted in place with intel2gas 1.3.1: 19 * 20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c 21 * 22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ . 23 * 24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows) 25 * is required to assemble the newer MMX instructions such as movq. 26 * For djgpp, see 27 * 28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip 29 * 30 * (or a later version in the same directory). For Linux, check your 31 * distribution's web site(s) or try these links: 32 * 33 * http://rufus.w3.org/linux/RPM/binutils.html 34 * http://www.debian.org/Packages/stable/devel/binutils.html 35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/ 36 * binutils.tgz 37 * 38 * For other platforms, see the main GNU site: 39 * 40 * ftp://ftp.gnu.org/pub/gnu/binutils/ 41 * 42 * Version 2.5.2l.15 is definitely too old... 43 */ 44 45/* 46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs) 47 * ===================================== 48 * 49 * 19991006: 50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases) 51 * 52 * 19991007: 53 * - additional optimizations (possible or definite): 54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested] 55 * - write MMX code for 48-bit case (pixel_bytes == 6) 56 * - figure out what's up with 24-bit case (pixel_bytes == 3): 57 * why subtract 8 from width_mmx in the pass 4/5 case? 58 * (only width_mmx case) (near line 1606) 59 * x [DONE] replace pixel_bytes within each block with the true 60 * constant value (or are compilers smart enough to do that?) 61 * - rewrite all MMX interlacing code so it's aligned with 62 * the *beginning* of the row buffer, not the end. This 63 * would not only allow one to eliminate half of the memory 64 * writes for odd passes (that is, pass == odd), it may also 65 * eliminate some unaligned-data-access exceptions (assuming 66 * there's a penalty for not aligning 64-bit accesses on 67 * 64-bit boundaries). The only catch is that the "leftover" 68 * pixel(s) at the end of the row would have to be saved, 69 * but there are enough unused MMX registers in every case, 70 * so this is not a problem. A further benefit is that the 71 * post-MMX cleanup code (C code) in at least some of the 72 * cases could be done within the assembler block. 73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing, 74 * inconsistent, and don't match the MMX Programmer's Reference 75 * Manual conventions anyway. They should be changed to 76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that 77 * was lowest in memory (e.g., corresponding to a left pixel) 78 * and b7 is the byte that was highest (e.g., a right pixel). 79 * 80 * 19991016: 81 * - Brennan's Guide notwithstanding, gcc under Linux does *not* 82 * want globals prefixed by underscores when referencing them-- 83 * i.e., if the variable is const4, then refer to it as const4, 84 * not _const4. This seems to be a djgpp-specific requirement. 85 * Also, such variables apparently *must* be declared outside 86 * of functions; neither static nor automatic variables work if 87 * defined within the scope of a single function, but both 88 * static and truly global (multi-module) variables work fine. 89 * 90 * 19991023: 91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?) 92 * - switched from string-concatenation-with-macros to cleaner method of 93 * renaming global variables for djgpp--i.e., always use prefixes in 94 * inlined assembler code (== strings) and conditionally rename the 95 * variables, not the other way around. Hence _const4, _mask8_0, etc. 96 * 97 * 19991024: 98 * - fixed mmxsupport()/png_do_read_interlace() first-row bug 99 * This one was severely weird: even though mmxsupport() doesn't touch 100 * ebx (where "row" pointer was stored), it nevertheless managed to zero 101 * the register (even in static/non-fPIC code--see below), which in turn 102 * caused png_do_read_interlace() to return prematurely on the first row of 103 * interlaced images (i.e., without expanding the interlaced pixels). 104 * Inspection of the generated assembly code didn't turn up any clues, 105 * although it did point at a minor optimization (i.e., get rid of 106 * mmx_supported_local variable and just use eax). Possibly the CPUID 107 * instruction is more destructive than it looks? (Not yet checked.) 108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly 109 * listings... Apparently register spillage has to do with ebx, since 110 * it's used to index the global offset table. Commenting it out of the 111 * input-reg lists in png_combine_row() eliminated compiler barfage, so 112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask 113 * 114 * 19991107: 115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel", 116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish. 117 * 118 * 19991120: 119 * - made "diff" variable (now "_dif") global to simplify conversion of 120 * filtering routines (running out of regs, sigh). "diff" is still used 121 * in interlacing routines, however. 122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX 123 * macro determines which is used); original not yet tested. 124 * 125 * 20000213: 126 * - when compiling with gcc, be sure to use -fomit-frame-pointer 127 * 128 * 20000319: 129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case, 130 * pass == 4 or 5, that caused visible corruption of interlaced images 131 * 132 * 20000623: 133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment, 134 * many of the form "forbidden register 0 (ax) was spilled for class AREG." 135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and 136 * Chuck Wilson supplied a patch involving dummy output registers. See 137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624 138 * for the original (anonymous) SourceForge bug report. 139 * 140 * 20000706: 141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors: 142 * pnggccrd.c: In function `png_combine_row': 143 * pnggccrd.c:525: more than 10 operands in `asm' 144 * pnggccrd.c:669: more than 10 operands in `asm' 145 * pnggccrd.c:828: more than 10 operands in `asm' 146 * pnggccrd.c:994: more than 10 operands in `asm' 147 * pnggccrd.c:1177: more than 10 operands in `asm' 148 * They are all the same problem and can be worked around by using the 149 * global _unmask variable unconditionally, not just in the -fPIC case. 150 * Reportedly earlier versions of gcc also have the problem with more than 151 * 10 operands; they just don't report it. Much strangeness ensues, etc. 152 * 153 * 20000729: 154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted 155 * MMX routine); began converting png_read_filter_row_mmx_sub() 156 * - to finish remaining sections: 157 * - clean up indentation and comments 158 * - preload local variables 159 * - add output and input regs (order of former determines numerical 160 * mapping of latter) 161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823] 162 * - remove "$" from addressing of Shift and Mask variables [20000823] 163 * 164 * 20000731: 165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()? 166 * 167 * 20000822: 168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with 169 * shared-library (-fPIC) version! Code works just fine as part of static 170 * library. Damn damn damn damn damn, should have tested that sooner. 171 * ebx is getting clobbered again (explicitly this time); need to save it 172 * on stack or rewrite asm code to avoid using it altogether. Blargh! 173 * 174 * 20000823: 175 * - first section was trickiest; all remaining sections have ebx -> edx now. 176 * (-fPIC works again.) Also added missing underscores to various Shift* 177 * and *Mask* globals and got rid of leading "$" signs. 178 * 179 * 20000826: 180 * - added visual separators to help navigate microscopic printed copies 181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working 182 * on png_read_filter_row_mmx_avg() 183 * 184 * 20000828: 185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...) 186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not 187 * cleaned up/shortened in either routine, but functionality is complete 188 * and seems to be working fine. 189 * 190 * 20000829: 191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed 192 * as an input reg (with dummy output variables, etc.), then it *cannot* 193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution 194 * is simple enough... 195 * 196 * 20000914: 197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled 198 * correctly (but 48-bit RGB just fine) 199 * 200 * 20000916: 201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors: 202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;" 203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;" 204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2" 205 * 206 * 20010101: 207 * - added new png_init_mmx_flags() function (here only because it needs to 208 * call mmxsupport(), which should probably become global png_mmxsupport()); 209 * modified other MMX routines to run conditionally (png_ptr->asm_flags) 210 * 211 * 20010103: 212 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported, 213 * and made it public; moved png_init_mmx_flags() to png.c as internal func 214 * 215 * 20010104: 216 * - removed dependency on png_read_filter_row_c() (C code already duplicated 217 * within MMX version of png_read_filter_row()) so no longer necessary to 218 * compile it into pngrutil.o 219 * 220 * 20010310: 221 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX) 222 * 223 * 20020304: 224 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case 225 * 226 * 20040724: 227 * - more tinkering with clobber list at lines 4529 and 5033, to get 228 * it to compile on gcc-3.4. 229 * 230 * STILL TO DO: 231 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8) 232 * - write MMX code for 48-bit case (pixel_bytes == 6) 233 * - figure out what's up with 24-bit case (pixel_bytes == 3): 234 * why subtract 8 from width_mmx in the pass 4/5 case? 235 * (only width_mmx case) (near line 1606) 236 * - rewrite all MMX interlacing code so it's aligned with beginning 237 * of the row buffer, not the end (see 19991007 for details) 238 * x pick one version of mmxsupport() and get rid of the other 239 * - add error messages to any remaining bogus default cases 240 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed) 241 * x add support for runtime enable/disable/query of various MMX routines 242 */ 243 244#define PNG_INTERNAL 245#include "png.h" 246 247#if defined(PNG_USE_PNGGCCRD) 248 249int PNGAPI png_mmx_support(void); 250 251#ifdef PNG_USE_LOCAL_ARRAYS 252static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0}; 253static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1}; 254static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1}; 255#endif 256 257#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 258/* djgpp, Win32, and Cygwin add their own underscores to global variables, 259 * so define them without: */ 260#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) 261# define _mmx_supported mmx_supported 262# define _const4 const4 263# define _const6 const6 264# define _mask8_0 mask8_0 265# define _mask16_1 mask16_1 266# define _mask16_0 mask16_0 267# define _mask24_2 mask24_2 268# define _mask24_1 mask24_1 269# define _mask24_0 mask24_0 270# define _mask32_3 mask32_3 271# define _mask32_2 mask32_2 272# define _mask32_1 mask32_1 273# define _mask32_0 mask32_0 274# define _mask48_5 mask48_5 275# define _mask48_4 mask48_4 276# define _mask48_3 mask48_3 277# define _mask48_2 mask48_2 278# define _mask48_1 mask48_1 279# define _mask48_0 mask48_0 280# define _LBCarryMask LBCarryMask 281# define _HBClearMask HBClearMask 282# define _ActiveMask ActiveMask 283# define _ActiveMask2 ActiveMask2 284# define _ActiveMaskEnd ActiveMaskEnd 285# define _ShiftBpp ShiftBpp 286# define _ShiftRem ShiftRem 287#ifdef PNG_THREAD_UNSAFE_OK 288# define _unmask unmask 289# define _FullLength FullLength 290# define _MMXLength MMXLength 291# define _dif dif 292# define _patemp patemp 293# define _pbtemp pbtemp 294# define _pctemp pctemp 295#endif 296#endif 297 298 299/* These constants are used in the inlined MMX assembly code. 300 Ignore gcc's "At top level: defined but not used" warnings. */ 301 302/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC, 303 * since that case uses the %ebx register for indexing the Global Offset Table 304 * and there were no other registers available. But gcc 2.95 and later emit 305 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask 306 * in the non-PIC case, so we'll just use the global unconditionally now. 307 */ 308#ifdef PNG_THREAD_UNSAFE_OK 309static int _unmask; 310#endif 311 312static unsigned long long _mask8_0 = 0x0102040810204080LL; 313 314static unsigned long long _mask16_1 = 0x0101020204040808LL; 315static unsigned long long _mask16_0 = 0x1010202040408080LL; 316 317static unsigned long long _mask24_2 = 0x0101010202020404LL; 318static unsigned long long _mask24_1 = 0x0408080810101020LL; 319static unsigned long long _mask24_0 = 0x2020404040808080LL; 320 321static unsigned long long _mask32_3 = 0x0101010102020202LL; 322static unsigned long long _mask32_2 = 0x0404040408080808LL; 323static unsigned long long _mask32_1 = 0x1010101020202020LL; 324static unsigned long long _mask32_0 = 0x4040404080808080LL; 325 326static unsigned long long _mask48_5 = 0x0101010101010202LL; 327static unsigned long long _mask48_4 = 0x0202020204040404LL; 328static unsigned long long _mask48_3 = 0x0404080808080808LL; 329static unsigned long long _mask48_2 = 0x1010101010102020LL; 330static unsigned long long _mask48_1 = 0x2020202040404040LL; 331static unsigned long long _mask48_0 = 0x4040808080808080LL; 332 333static unsigned long long _const4 = 0x0000000000FFFFFFLL; 334/* static unsigned long long _const5 = 0x000000FFFFFF0000LL; */ /* NOT USED */ 335static unsigned long long _const6 = 0x00000000000000FFLL; 336 337/* These are used in the row-filter routines and should/would be local */ 338/* variables if not for gcc addressing limitations. */ 339/* WARNING: Their presence probably defeats the thread safety of libpng. */ 340 341#ifdef PNG_THREAD_UNSAFE_OK 342static png_uint_32 _FullLength; 343static png_uint_32 _MMXLength; 344static int _dif; 345static int _patemp; /* temp variables for Paeth routine */ 346static int _pbtemp; 347static int _pctemp; 348#endif 349 350void /* PRIVATE */ 351png_squelch_warnings(void) 352{ 353#ifdef PNG_THREAD_UNSAFE_OK 354 _dif = _dif; 355 _patemp = _patemp; 356 _pbtemp = _pbtemp; 357 _pctemp = _pctemp; 358 _MMXLength = _MMXLength; 359#endif 360 _const4 = _const4; 361 _const6 = _const6; 362 _mask8_0 = _mask8_0; 363 _mask16_1 = _mask16_1; 364 _mask16_0 = _mask16_0; 365 _mask24_2 = _mask24_2; 366 _mask24_1 = _mask24_1; 367 _mask24_0 = _mask24_0; 368 _mask32_3 = _mask32_3; 369 _mask32_2 = _mask32_2; 370 _mask32_1 = _mask32_1; 371 _mask32_0 = _mask32_0; 372 _mask48_5 = _mask48_5; 373 _mask48_4 = _mask48_4; 374 _mask48_3 = _mask48_3; 375 _mask48_2 = _mask48_2; 376 _mask48_1 = _mask48_1; 377 _mask48_0 = _mask48_0; 378} 379#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 380 381 382static int _mmx_supported = 2; 383 384/*===========================================================================*/ 385/* */ 386/* P N G _ C O M B I N E _ R O W */ 387/* */ 388/*===========================================================================*/ 389 390#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW) 391 392#define BPP2 2 393#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */ 394#define BPP4 4 395#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */ 396#define BPP8 8 397 398/* Combines the row recently read in with the previous row. 399 This routine takes care of alpha and transparency if requested. 400 This routine also handles the two methods of progressive display 401 of interlaced images, depending on the mask value. 402 The mask value describes which pixels are to be combined with 403 the row. The pattern always repeats every 8 pixels, so just 8 404 bits are needed. A one indicates the pixel is to be combined; a 405 zero indicates the pixel is to be skipped. This is in addition 406 to any alpha or transparency value associated with the pixel. 407 If you want all pixels to be combined, pass 0xff (255) in mask. */ 408 409/* Use this routine for the x86 platform - it uses a faster MMX routine 410 if the machine supports MMX. */ 411 412void /* PRIVATE */ 413png_combine_row(png_structp png_ptr, png_bytep row, int mask) 414{ 415 png_debug(1, "in png_combine_row (pnggccrd.c)\n"); 416 417#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 418 if (_mmx_supported == 2) { 419#if !defined(PNG_1_0_X) 420 /* this should have happened in png_init_mmx_flags() already */ 421 png_warning(png_ptr, "asm_flags may not have been initialized"); 422#endif 423 png_mmx_support(); 424 } 425#endif 426 427 if (mask == 0xff) 428 { 429 png_debug(2,"mask == 0xff: doing single png_memcpy()\n"); 430 png_memcpy(row, png_ptr->row_buf + 1, 431 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width)); 432 } 433 else /* (png_combine_row() is never called with mask == 0) */ 434 { 435 switch (png_ptr->row_info.pixel_depth) 436 { 437 case 1: /* png_ptr->row_info.pixel_depth */ 438 { 439 png_bytep sp; 440 png_bytep dp; 441 int s_inc, s_start, s_end; 442 int m; 443 int shift; 444 png_uint_32 i; 445 446 sp = png_ptr->row_buf + 1; 447 dp = row; 448 m = 0x80; 449#if defined(PNG_READ_PACKSWAP_SUPPORTED) 450 if (png_ptr->transformations & PNG_PACKSWAP) 451 { 452 s_start = 0; 453 s_end = 7; 454 s_inc = 1; 455 } 456 else 457#endif 458 { 459 s_start = 7; 460 s_end = 0; 461 s_inc = -1; 462 } 463 464 shift = s_start; 465 466 for (i = 0; i < png_ptr->width; i++) 467 { 468 if (m & mask) 469 { 470 int value; 471 472 value = (*sp >> shift) & 0x1; 473 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff); 474 *dp |= (png_byte)(value << shift); 475 } 476 477 if (shift == s_end) 478 { 479 shift = s_start; 480 sp++; 481 dp++; 482 } 483 else 484 shift += s_inc; 485 486 if (m == 1) 487 m = 0x80; 488 else 489 m >>= 1; 490 } 491 break; 492 } 493 494 case 2: /* png_ptr->row_info.pixel_depth */ 495 { 496 png_bytep sp; 497 png_bytep dp; 498 int s_start, s_end, s_inc; 499 int m; 500 int shift; 501 png_uint_32 i; 502 int value; 503 504 sp = png_ptr->row_buf + 1; 505 dp = row; 506 m = 0x80; 507#if defined(PNG_READ_PACKSWAP_SUPPORTED) 508 if (png_ptr->transformations & PNG_PACKSWAP) 509 { 510 s_start = 0; 511 s_end = 6; 512 s_inc = 2; 513 } 514 else 515#endif 516 { 517 s_start = 6; 518 s_end = 0; 519 s_inc = -2; 520 } 521 522 shift = s_start; 523 524 for (i = 0; i < png_ptr->width; i++) 525 { 526 if (m & mask) 527 { 528 value = (*sp >> shift) & 0x3; 529 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff); 530 *dp |= (png_byte)(value << shift); 531 } 532 533 if (shift == s_end) 534 { 535 shift = s_start; 536 sp++; 537 dp++; 538 } 539 else 540 shift += s_inc; 541 if (m == 1) 542 m = 0x80; 543 else 544 m >>= 1; 545 } 546 break; 547 } 548 549 case 4: /* png_ptr->row_info.pixel_depth */ 550 { 551 png_bytep sp; 552 png_bytep dp; 553 int s_start, s_end, s_inc; 554 int m; 555 int shift; 556 png_uint_32 i; 557 int value; 558 559 sp = png_ptr->row_buf + 1; 560 dp = row; 561 m = 0x80; 562#if defined(PNG_READ_PACKSWAP_SUPPORTED) 563 if (png_ptr->transformations & PNG_PACKSWAP) 564 { 565 s_start = 0; 566 s_end = 4; 567 s_inc = 4; 568 } 569 else 570#endif 571 { 572 s_start = 4; 573 s_end = 0; 574 s_inc = -4; 575 } 576 shift = s_start; 577 578 for (i = 0; i < png_ptr->width; i++) 579 { 580 if (m & mask) 581 { 582 value = (*sp >> shift) & 0xf; 583 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff); 584 *dp |= (png_byte)(value << shift); 585 } 586 587 if (shift == s_end) 588 { 589 shift = s_start; 590 sp++; 591 dp++; 592 } 593 else 594 shift += s_inc; 595 if (m == 1) 596 m = 0x80; 597 else 598 m >>= 1; 599 } 600 break; 601 } 602 603 case 8: /* png_ptr->row_info.pixel_depth */ 604 { 605 png_bytep srcptr; 606 png_bytep dstptr; 607 608#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 609#if !defined(PNG_1_0_X) 610 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 611 /* && _mmx_supported */ ) 612#else 613 if (_mmx_supported) 614#endif 615 { 616 png_uint_32 len; 617 int diff; 618 int dummy_value_a; /* fix 'forbidden register spilled' error */ 619 int dummy_value_d; 620 int dummy_value_c; 621 int dummy_value_S; 622 int dummy_value_D; 623 _unmask = ~mask; /* global variable for -fPIC version */ 624 srcptr = png_ptr->row_buf + 1; 625 dstptr = row; 626 len = png_ptr->width &~7; /* reduce to multiple of 8 */ 627 diff = (int) (png_ptr->width & 7); /* amount lost */ 628 629 __asm__ __volatile__ ( 630 "movd _unmask, %%mm7 \n\t" /* load bit pattern */ 631 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */ 632 "punpcklbw %%mm7, %%mm7 \n\t" 633 "punpcklwd %%mm7, %%mm7 \n\t" 634 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */ 635 636 "movq _mask8_0, %%mm0 \n\t" 637 "pand %%mm7, %%mm0 \n\t" /* nonzero if keep byte */ 638 "pcmpeqb %%mm6, %%mm0 \n\t" /* zeros->1s, v versa */ 639 640/* preload "movl len, %%ecx \n\t" // load length of line */ 641/* preload "movl srcptr, %%esi \n\t" // load source */ 642/* preload "movl dstptr, %%edi \n\t" // load dest */ 643 644 "cmpl $0, %%ecx \n\t" /* len == 0 ? */ 645 "je mainloop8end \n\t" 646 647 "mainloop8: \n\t" 648 "movq (%%esi), %%mm4 \n\t" /* *srcptr */ 649 "pand %%mm0, %%mm4 \n\t" 650 "movq %%mm0, %%mm6 \n\t" 651 "pandn (%%edi), %%mm6 \n\t" /* *dstptr */ 652 "por %%mm6, %%mm4 \n\t" 653 "movq %%mm4, (%%edi) \n\t" 654 "addl $8, %%esi \n\t" /* inc by 8 bytes processed */ 655 "addl $8, %%edi \n\t" 656 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */ 657 "ja mainloop8 \n\t" 658 659 "mainloop8end: \n\t" 660/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */ 661 "movl %%eax, %%ecx \n\t" 662 "cmpl $0, %%ecx \n\t" 663 "jz end8 \n\t" 664/* preload "movl mask, %%edx \n\t" */ 665 "sall $24, %%edx \n\t" /* make low byte, high byte */ 666 667 "secondloop8: \n\t" 668 "sall %%edx \n\t" /* move high bit to CF */ 669 "jnc skip8 \n\t" /* if CF = 0 */ 670 "movb (%%esi), %%al \n\t" 671 "movb %%al, (%%edi) \n\t" 672 673 "skip8: \n\t" 674 "incl %%esi \n\t" 675 "incl %%edi \n\t" 676 "decl %%ecx \n\t" 677 "jnz secondloop8 \n\t" 678 679 "end8: \n\t" 680 "EMMS \n\t" /* DONE */ 681 682 : "=a" (dummy_value_a), /* output regs (dummy) */ 683 "=d" (dummy_value_d), 684 "=c" (dummy_value_c), 685 "=S" (dummy_value_S), 686 "=D" (dummy_value_D) 687 688 : "3" (srcptr), /* esi // input regs */ 689 "4" (dstptr), /* edi */ 690 "0" (diff), /* eax */ 691/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */ 692 "2" (len), /* ecx */ 693 "1" (mask) /* edx */ 694 695#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 696 : "%mm0", "%mm4", "%mm6", "%mm7" /* clobber list */ 697#endif 698 ); 699 } 700 else /* mmx _not supported - Use modified C routine */ 701#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 702 { 703 register png_uint_32 i; 704 png_uint_32 initial_val = png_pass_start[png_ptr->pass]; 705 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 706 register int stride = png_pass_inc[png_ptr->pass]; 707 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 708 register int rep_bytes = png_pass_width[png_ptr->pass]; 709 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 710 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 711 int diff = (int) (png_ptr->width & 7); /* amount lost */ 712 register png_uint_32 final_val = len; /* GRR bugfix */ 713 714 srcptr = png_ptr->row_buf + 1 + initial_val; 715 dstptr = row + initial_val; 716 717 for (i = initial_val; i < final_val; i += stride) 718 { 719 png_memcpy(dstptr, srcptr, rep_bytes); 720 srcptr += stride; 721 dstptr += stride; 722 } 723 if (diff) /* number of leftover pixels: 3 for pngtest */ 724 { 725 final_val+=diff /* *BPP1 */ ; 726 for (; i < final_val; i += stride) 727 { 728 if (rep_bytes > (int)(final_val-i)) 729 rep_bytes = (int)(final_val-i); 730 png_memcpy(dstptr, srcptr, rep_bytes); 731 srcptr += stride; 732 dstptr += stride; 733 } 734 } 735 736 } /* end of else (_mmx_supported) */ 737 738 break; 739 } /* end 8 bpp */ 740 741 case 16: /* png_ptr->row_info.pixel_depth */ 742 { 743 png_bytep srcptr; 744 png_bytep dstptr; 745 746#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 747#if !defined(PNG_1_0_X) 748 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 749 /* && _mmx_supported */ ) 750#else 751 if (_mmx_supported) 752#endif 753 { 754 png_uint_32 len; 755 int diff; 756 int dummy_value_a; /* fix 'forbidden register spilled' error */ 757 int dummy_value_d; 758 int dummy_value_c; 759 int dummy_value_S; 760 int dummy_value_D; 761 _unmask = ~mask; /* global variable for -fPIC version */ 762 srcptr = png_ptr->row_buf + 1; 763 dstptr = row; 764 len = png_ptr->width &~7; /* reduce to multiple of 8 */ 765 diff = (int) (png_ptr->width & 7); /* amount lost // */ 766 767 __asm__ __volatile__ ( 768 "movd _unmask, %%mm7 \n\t" /* load bit pattern */ 769 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */ 770 "punpcklbw %%mm7, %%mm7 \n\t" 771 "punpcklwd %%mm7, %%mm7 \n\t" 772 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */ 773 774 "movq _mask16_0, %%mm0 \n\t" 775 "movq _mask16_1, %%mm1 \n\t" 776 777 "pand %%mm7, %%mm0 \n\t" 778 "pand %%mm7, %%mm1 \n\t" 779 780 "pcmpeqb %%mm6, %%mm0 \n\t" 781 "pcmpeqb %%mm6, %%mm1 \n\t" 782 783/* preload "movl len, %%ecx \n\t" // load length of line */ 784/* preload "movl srcptr, %%esi \n\t" // load source */ 785/* preload "movl dstptr, %%edi \n\t" // load dest */ 786 787 "cmpl $0, %%ecx \n\t" 788 "jz mainloop16end \n\t" 789 790 "mainloop16: \n\t" 791 "movq (%%esi), %%mm4 \n\t" 792 "pand %%mm0, %%mm4 \n\t" 793 "movq %%mm0, %%mm6 \n\t" 794 "movq (%%edi), %%mm7 \n\t" 795 "pandn %%mm7, %%mm6 \n\t" 796 "por %%mm6, %%mm4 \n\t" 797 "movq %%mm4, (%%edi) \n\t" 798 799 "movq 8(%%esi), %%mm5 \n\t" 800 "pand %%mm1, %%mm5 \n\t" 801 "movq %%mm1, %%mm7 \n\t" 802 "movq 8(%%edi), %%mm6 \n\t" 803 "pandn %%mm6, %%mm7 \n\t" 804 "por %%mm7, %%mm5 \n\t" 805 "movq %%mm5, 8(%%edi) \n\t" 806 807 "addl $16, %%esi \n\t" /* inc by 16 bytes processed */ 808 "addl $16, %%edi \n\t" 809 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */ 810 "ja mainloop16 \n\t" 811 812 "mainloop16end: \n\t" 813/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */ 814 "movl %%eax, %%ecx \n\t" 815 "cmpl $0, %%ecx \n\t" 816 "jz end16 \n\t" 817/* preload "movl mask, %%edx \n\t" */ 818 "sall $24, %%edx \n\t" /* make low byte, high byte */ 819 820 "secondloop16: \n\t" 821 "sall %%edx \n\t" /* move high bit to CF */ 822 "jnc skip16 \n\t" /* if CF = 0 */ 823 "movw (%%esi), %%ax \n\t" 824 "movw %%ax, (%%edi) \n\t" 825 826 "skip16: \n\t" 827 "addl $2, %%esi \n\t" 828 "addl $2, %%edi \n\t" 829 "decl %%ecx \n\t" 830 "jnz secondloop16 \n\t" 831 832 "end16: \n\t" 833 "EMMS \n\t" /* DONE */ 834 835 : "=a" (dummy_value_a), /* output regs (dummy) */ 836 "=c" (dummy_value_c), 837 "=d" (dummy_value_d), 838 "=S" (dummy_value_S), 839 "=D" (dummy_value_D) 840 841 : "0" (diff), /* eax // input regs */ 842/* was (unmask) " " RESERVED // ebx // Global Offset Table idx */ 843 "1" (len), /* ecx */ 844 "2" (mask), /* edx */ 845 "3" (srcptr), /* esi */ 846 "4" (dstptr) /* edi */ 847 848#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 849 : "%mm0", "%mm1", "%mm4" /* clobber list */ 850 , "%mm5", "%mm6", "%mm7" 851#endif 852 ); 853 } 854 else /* mmx _not supported - Use modified C routine */ 855#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 856 { 857 register png_uint_32 i; 858 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass]; 859 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 860 register int stride = BPP2 * png_pass_inc[png_ptr->pass]; 861 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 862 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass]; 863 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 864 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 865 int diff = (int) (png_ptr->width & 7); /* amount lost */ 866 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */ 867 868 srcptr = png_ptr->row_buf + 1 + initial_val; 869 dstptr = row + initial_val; 870 871 for (i = initial_val; i < final_val; i += stride) 872 { 873 png_memcpy(dstptr, srcptr, rep_bytes); 874 srcptr += stride; 875 dstptr += stride; 876 } 877 if (diff) /* number of leftover pixels: 3 for pngtest */ 878 { 879 final_val+=diff*BPP2; 880 for (; i < final_val; i += stride) 881 { 882 if (rep_bytes > (int)(final_val-i)) 883 rep_bytes = (int)(final_val-i); 884 png_memcpy(dstptr, srcptr, rep_bytes); 885 srcptr += stride; 886 dstptr += stride; 887 } 888 } 889 } /* end of else (_mmx_supported) */ 890 891 break; 892 } /* end 16 bpp */ 893 894 case 24: /* png_ptr->row_info.pixel_depth */ 895 { 896 png_bytep srcptr; 897 png_bytep dstptr; 898 899#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 900#if !defined(PNG_1_0_X) 901 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 902 /* && _mmx_supported */ ) 903#else 904 if (_mmx_supported) 905#endif 906 { 907 png_uint_32 len; 908 int diff; 909 int dummy_value_a; /* fix 'forbidden register spilled' error */ 910 int dummy_value_d; 911 int dummy_value_c; 912 int dummy_value_S; 913 int dummy_value_D; 914 _unmask = ~mask; /* global variable for -fPIC version */ 915 srcptr = png_ptr->row_buf + 1; 916 dstptr = row; 917 len = png_ptr->width &~7; /* reduce to multiple of 8 */ 918 diff = (int) (png_ptr->width & 7); /* amount lost // */ 919 920 __asm__ __volatile__ ( 921 "movd _unmask, %%mm7 \n\t" /* load bit pattern */ 922 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */ 923 "punpcklbw %%mm7, %%mm7 \n\t" 924 "punpcklwd %%mm7, %%mm7 \n\t" 925 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */ 926 927 "movq _mask24_0, %%mm0 \n\t" 928 "movq _mask24_1, %%mm1 \n\t" 929 "movq _mask24_2, %%mm2 \n\t" 930 931 "pand %%mm7, %%mm0 \n\t" 932 "pand %%mm7, %%mm1 \n\t" 933 "pand %%mm7, %%mm2 \n\t" 934 935 "pcmpeqb %%mm6, %%mm0 \n\t" 936 "pcmpeqb %%mm6, %%mm1 \n\t" 937 "pcmpeqb %%mm6, %%mm2 \n\t" 938 939/* preload "movl len, %%ecx \n\t" // load length of line */ 940/* preload "movl srcptr, %%esi \n\t" // load source */ 941/* preload "movl dstptr, %%edi \n\t" // load dest */ 942 943 "cmpl $0, %%ecx \n\t" 944 "jz mainloop24end \n\t" 945 946 "mainloop24: \n\t" 947 "movq (%%esi), %%mm4 \n\t" 948 "pand %%mm0, %%mm4 \n\t" 949 "movq %%mm0, %%mm6 \n\t" 950 "movq (%%edi), %%mm7 \n\t" 951 "pandn %%mm7, %%mm6 \n\t" 952 "por %%mm6, %%mm4 \n\t" 953 "movq %%mm4, (%%edi) \n\t" 954 955 "movq 8(%%esi), %%mm5 \n\t" 956 "pand %%mm1, %%mm5 \n\t" 957 "movq %%mm1, %%mm7 \n\t" 958 "movq 8(%%edi), %%mm6 \n\t" 959 "pandn %%mm6, %%mm7 \n\t" 960 "por %%mm7, %%mm5 \n\t" 961 "movq %%mm5, 8(%%edi) \n\t" 962 963 "movq 16(%%esi), %%mm6 \n\t" 964 "pand %%mm2, %%mm6 \n\t" 965 "movq %%mm2, %%mm4 \n\t" 966 "movq 16(%%edi), %%mm7 \n\t" 967 "pandn %%mm7, %%mm4 \n\t" 968 "por %%mm4, %%mm6 \n\t" 969 "movq %%mm6, 16(%%edi) \n\t" 970 971 "addl $24, %%esi \n\t" /* inc by 24 bytes processed */ 972 "addl $24, %%edi \n\t" 973 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */ 974 975 "ja mainloop24 \n\t" 976 977 "mainloop24end: \n\t" 978/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */ 979 "movl %%eax, %%ecx \n\t" 980 "cmpl $0, %%ecx \n\t" 981 "jz end24 \n\t" 982/* preload "movl mask, %%edx \n\t" */ 983 "sall $24, %%edx \n\t" /* make low byte, high byte */ 984 985 "secondloop24: \n\t" 986 "sall %%edx \n\t" /* move high bit to CF */ 987 "jnc skip24 \n\t" /* if CF = 0 */ 988 "movw (%%esi), %%ax \n\t" 989 "movw %%ax, (%%edi) \n\t" 990 "xorl %%eax, %%eax \n\t" 991 "movb 2(%%esi), %%al \n\t" 992 "movb %%al, 2(%%edi) \n\t" 993 994 "skip24: \n\t" 995 "addl $3, %%esi \n\t" 996 "addl $3, %%edi \n\t" 997 "decl %%ecx \n\t" 998 "jnz secondloop24 \n\t" 999 1000 "end24: \n\t" 1001 "EMMS \n\t" /* DONE */ 1002 1003 : "=a" (dummy_value_a), /* output regs (dummy) */ 1004 "=d" (dummy_value_d), 1005 "=c" (dummy_value_c), 1006 "=S" (dummy_value_S), 1007 "=D" (dummy_value_D) 1008 1009 : "3" (srcptr), /* esi // input regs */ 1010 "4" (dstptr), /* edi */ 1011 "0" (diff), /* eax */ 1012/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */ 1013 "2" (len), /* ecx */ 1014 "1" (mask) /* edx */ 1015 1016#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 1017 : "%mm0", "%mm1", "%mm2" /* clobber list */ 1018 , "%mm4", "%mm5", "%mm6", "%mm7" 1019#endif 1020 ); 1021 } 1022 else /* mmx _not supported - Use modified C routine */ 1023#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 1024 { 1025 register png_uint_32 i; 1026 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass]; 1027 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1028 register int stride = BPP3 * png_pass_inc[png_ptr->pass]; 1029 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1030 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass]; 1031 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1032 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 1033 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1034 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */ 1035 1036 srcptr = png_ptr->row_buf + 1 + initial_val; 1037 dstptr = row + initial_val; 1038 1039 for (i = initial_val; i < final_val; i += stride) 1040 { 1041 png_memcpy(dstptr, srcptr, rep_bytes); 1042 srcptr += stride; 1043 dstptr += stride; 1044 } 1045 if (diff) /* number of leftover pixels: 3 for pngtest */ 1046 { 1047 final_val+=diff*BPP3; 1048 for (; i < final_val; i += stride) 1049 { 1050 if (rep_bytes > (int)(final_val-i)) 1051 rep_bytes = (int)(final_val-i); 1052 png_memcpy(dstptr, srcptr, rep_bytes); 1053 srcptr += stride; 1054 dstptr += stride; 1055 } 1056 } 1057 } /* end of else (_mmx_supported) */ 1058 1059 break; 1060 } /* end 24 bpp */ 1061 1062 case 32: /* png_ptr->row_info.pixel_depth */ 1063 { 1064 png_bytep srcptr; 1065 png_bytep dstptr; 1066 1067#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 1068#if !defined(PNG_1_0_X) 1069 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 1070 /* && _mmx_supported */ ) 1071#else 1072 if (_mmx_supported) 1073#endif 1074 { 1075 png_uint_32 len; 1076 int diff; 1077 int dummy_value_a; /* fix 'forbidden register spilled' error */ 1078 int dummy_value_d; 1079 int dummy_value_c; 1080 int dummy_value_S; 1081 int dummy_value_D; 1082 _unmask = ~mask; /* global variable for -fPIC version */ 1083 srcptr = png_ptr->row_buf + 1; 1084 dstptr = row; 1085 len = png_ptr->width &~7; /* reduce to multiple of 8 */ 1086 diff = (int) (png_ptr->width & 7); /* amount lost // */ 1087 1088 __asm__ __volatile__ ( 1089 "movd _unmask, %%mm7 \n\t" /* load bit pattern */ 1090 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */ 1091 "punpcklbw %%mm7, %%mm7 \n\t" 1092 "punpcklwd %%mm7, %%mm7 \n\t" 1093 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */ 1094 1095 "movq _mask32_0, %%mm0 \n\t" 1096 "movq _mask32_1, %%mm1 \n\t" 1097 "movq _mask32_2, %%mm2 \n\t" 1098 "movq _mask32_3, %%mm3 \n\t" 1099 1100 "pand %%mm7, %%mm0 \n\t" 1101 "pand %%mm7, %%mm1 \n\t" 1102 "pand %%mm7, %%mm2 \n\t" 1103 "pand %%mm7, %%mm3 \n\t" 1104 1105 "pcmpeqb %%mm6, %%mm0 \n\t" 1106 "pcmpeqb %%mm6, %%mm1 \n\t" 1107 "pcmpeqb %%mm6, %%mm2 \n\t" 1108 "pcmpeqb %%mm6, %%mm3 \n\t" 1109 1110/* preload "movl len, %%ecx \n\t" // load length of line */ 1111/* preload "movl srcptr, %%esi \n\t" // load source */ 1112/* preload "movl dstptr, %%edi \n\t" // load dest */ 1113 1114 "cmpl $0, %%ecx \n\t" /* lcr */ 1115 "jz mainloop32end \n\t" 1116 1117 "mainloop32: \n\t" 1118 "movq (%%esi), %%mm4 \n\t" 1119 "pand %%mm0, %%mm4 \n\t" 1120 "movq %%mm0, %%mm6 \n\t" 1121 "movq (%%edi), %%mm7 \n\t" 1122 "pandn %%mm7, %%mm6 \n\t" 1123 "por %%mm6, %%mm4 \n\t" 1124 "movq %%mm4, (%%edi) \n\t" 1125 1126 "movq 8(%%esi), %%mm5 \n\t" 1127 "pand %%mm1, %%mm5 \n\t" 1128 "movq %%mm1, %%mm7 \n\t" 1129 "movq 8(%%edi), %%mm6 \n\t" 1130 "pandn %%mm6, %%mm7 \n\t" 1131 "por %%mm7, %%mm5 \n\t" 1132 "movq %%mm5, 8(%%edi) \n\t" 1133 1134 "movq 16(%%esi), %%mm6 \n\t" 1135 "pand %%mm2, %%mm6 \n\t" 1136 "movq %%mm2, %%mm4 \n\t" 1137 "movq 16(%%edi), %%mm7 \n\t" 1138 "pandn %%mm7, %%mm4 \n\t" 1139 "por %%mm4, %%mm6 \n\t" 1140 "movq %%mm6, 16(%%edi) \n\t" 1141 1142 "movq 24(%%esi), %%mm7 \n\t" 1143 "pand %%mm3, %%mm7 \n\t" 1144 "movq %%mm3, %%mm5 \n\t" 1145 "movq 24(%%edi), %%mm4 \n\t" 1146 "pandn %%mm4, %%mm5 \n\t" 1147 "por %%mm5, %%mm7 \n\t" 1148 "movq %%mm7, 24(%%edi) \n\t" 1149 1150 "addl $32, %%esi \n\t" /* inc by 32 bytes processed */ 1151 "addl $32, %%edi \n\t" 1152 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */ 1153 "ja mainloop32 \n\t" 1154 1155 "mainloop32end: \n\t" 1156/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */ 1157 "movl %%eax, %%ecx \n\t" 1158 "cmpl $0, %%ecx \n\t" 1159 "jz end32 \n\t" 1160/* preload "movl mask, %%edx \n\t" */ 1161 "sall $24, %%edx \n\t" /* low byte => high byte */ 1162 1163 "secondloop32: \n\t" 1164 "sall %%edx \n\t" /* move high bit to CF */ 1165 "jnc skip32 \n\t" /* if CF = 0 */ 1166 "movl (%%esi), %%eax \n\t" 1167 "movl %%eax, (%%edi) \n\t" 1168 1169 "skip32: \n\t" 1170 "addl $4, %%esi \n\t" 1171 "addl $4, %%edi \n\t" 1172 "decl %%ecx \n\t" 1173 "jnz secondloop32 \n\t" 1174 1175 "end32: \n\t" 1176 "EMMS \n\t" /* DONE */ 1177 1178 : "=a" (dummy_value_a), /* output regs (dummy) */ 1179 "=d" (dummy_value_d), 1180 "=c" (dummy_value_c), 1181 "=S" (dummy_value_S), 1182 "=D" (dummy_value_D) 1183 1184 : "3" (srcptr), /* esi // input regs */ 1185 "4" (dstptr), /* edi */ 1186 "0" (diff), /* eax */ 1187/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */ 1188 "2" (len), /* ecx */ 1189 "1" (mask) /* edx */ 1190 1191#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 1192 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */ 1193 , "%mm4", "%mm5", "%mm6", "%mm7" 1194#endif 1195 ); 1196 } 1197 else /* mmx _not supported - Use modified C routine */ 1198#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 1199 { 1200 register png_uint_32 i; 1201 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass]; 1202 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1203 register int stride = BPP4 * png_pass_inc[png_ptr->pass]; 1204 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1205 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass]; 1206 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1207 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 1208 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1209 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */ 1210 1211 srcptr = png_ptr->row_buf + 1 + initial_val; 1212 dstptr = row + initial_val; 1213 1214 for (i = initial_val; i < final_val; i += stride) 1215 { 1216 png_memcpy(dstptr, srcptr, rep_bytes); 1217 srcptr += stride; 1218 dstptr += stride; 1219 } 1220 if (diff) /* number of leftover pixels: 3 for pngtest */ 1221 { 1222 final_val+=diff*BPP4; 1223 for (; i < final_val; i += stride) 1224 { 1225 if (rep_bytes > (int)(final_val-i)) 1226 rep_bytes = (int)(final_val-i); 1227 png_memcpy(dstptr, srcptr, rep_bytes); 1228 srcptr += stride; 1229 dstptr += stride; 1230 } 1231 } 1232 } /* end of else (_mmx_supported) */ 1233 1234 break; 1235 } /* end 32 bpp */ 1236 1237 case 48: /* png_ptr->row_info.pixel_depth */ 1238 { 1239 png_bytep srcptr; 1240 png_bytep dstptr; 1241 1242#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 1243#if !defined(PNG_1_0_X) 1244 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW) 1245 /* && _mmx_supported */ ) 1246#else 1247 if (_mmx_supported) 1248#endif 1249 { 1250 png_uint_32 len; 1251 int diff; 1252 int dummy_value_a; /* fix 'forbidden register spilled' error */ 1253 int dummy_value_d; 1254 int dummy_value_c; 1255 int dummy_value_S; 1256 int dummy_value_D; 1257 _unmask = ~mask; /* global variable for -fPIC version */ 1258 srcptr = png_ptr->row_buf + 1; 1259 dstptr = row; 1260 len = png_ptr->width &~7; /* reduce to multiple of 8 */ 1261 diff = (int) (png_ptr->width & 7); /* amount lost // */ 1262 1263 __asm__ __volatile__ ( 1264 "movd _unmask, %%mm7 \n\t" /* load bit pattern */ 1265 "psubb %%mm6, %%mm6 \n\t" /* zero mm6 */ 1266 "punpcklbw %%mm7, %%mm7 \n\t" 1267 "punpcklwd %%mm7, %%mm7 \n\t" 1268 "punpckldq %%mm7, %%mm7 \n\t" /* fill reg with 8 masks */ 1269 1270 "movq _mask48_0, %%mm0 \n\t" 1271 "movq _mask48_1, %%mm1 \n\t" 1272 "movq _mask48_2, %%mm2 \n\t" 1273 "movq _mask48_3, %%mm3 \n\t" 1274 "movq _mask48_4, %%mm4 \n\t" 1275 "movq _mask48_5, %%mm5 \n\t" 1276 1277 "pand %%mm7, %%mm0 \n\t" 1278 "pand %%mm7, %%mm1 \n\t" 1279 "pand %%mm7, %%mm2 \n\t" 1280 "pand %%mm7, %%mm3 \n\t" 1281 "pand %%mm7, %%mm4 \n\t" 1282 "pand %%mm7, %%mm5 \n\t" 1283 1284 "pcmpeqb %%mm6, %%mm0 \n\t" 1285 "pcmpeqb %%mm6, %%mm1 \n\t" 1286 "pcmpeqb %%mm6, %%mm2 \n\t" 1287 "pcmpeqb %%mm6, %%mm3 \n\t" 1288 "pcmpeqb %%mm6, %%mm4 \n\t" 1289 "pcmpeqb %%mm6, %%mm5 \n\t" 1290 1291/* preload "movl len, %%ecx \n\t" // load length of line */ 1292/* preload "movl srcptr, %%esi \n\t" // load source */ 1293/* preload "movl dstptr, %%edi \n\t" // load dest */ 1294 1295 "cmpl $0, %%ecx \n\t" 1296 "jz mainloop48end \n\t" 1297 1298 "mainloop48: \n\t" 1299 "movq (%%esi), %%mm7 \n\t" 1300 "pand %%mm0, %%mm7 \n\t" 1301 "movq %%mm0, %%mm6 \n\t" 1302 "pandn (%%edi), %%mm6 \n\t" 1303 "por %%mm6, %%mm7 \n\t" 1304 "movq %%mm7, (%%edi) \n\t" 1305 1306 "movq 8(%%esi), %%mm6 \n\t" 1307 "pand %%mm1, %%mm6 \n\t" 1308 "movq %%mm1, %%mm7 \n\t" 1309 "pandn 8(%%edi), %%mm7 \n\t" 1310 "por %%mm7, %%mm6 \n\t" 1311 "movq %%mm6, 8(%%edi) \n\t" 1312 1313 "movq 16(%%esi), %%mm6 \n\t" 1314 "pand %%mm2, %%mm6 \n\t" 1315 "movq %%mm2, %%mm7 \n\t" 1316 "pandn 16(%%edi), %%mm7 \n\t" 1317 "por %%mm7, %%mm6 \n\t" 1318 "movq %%mm6, 16(%%edi) \n\t" 1319 1320 "movq 24(%%esi), %%mm7 \n\t" 1321 "pand %%mm3, %%mm7 \n\t" 1322 "movq %%mm3, %%mm6 \n\t" 1323 "pandn 24(%%edi), %%mm6 \n\t" 1324 "por %%mm6, %%mm7 \n\t" 1325 "movq %%mm7, 24(%%edi) \n\t" 1326 1327 "movq 32(%%esi), %%mm6 \n\t" 1328 "pand %%mm4, %%mm6 \n\t" 1329 "movq %%mm4, %%mm7 \n\t" 1330 "pandn 32(%%edi), %%mm7 \n\t" 1331 "por %%mm7, %%mm6 \n\t" 1332 "movq %%mm6, 32(%%edi) \n\t" 1333 1334 "movq 40(%%esi), %%mm7 \n\t" 1335 "pand %%mm5, %%mm7 \n\t" 1336 "movq %%mm5, %%mm6 \n\t" 1337 "pandn 40(%%edi), %%mm6 \n\t" 1338 "por %%mm6, %%mm7 \n\t" 1339 "movq %%mm7, 40(%%edi) \n\t" 1340 1341 "addl $48, %%esi \n\t" /* inc by 48 bytes processed */ 1342 "addl $48, %%edi \n\t" 1343 "subl $8, %%ecx \n\t" /* dec by 8 pixels processed */ 1344 1345 "ja mainloop48 \n\t" 1346 1347 "mainloop48end: \n\t" 1348/* preload "movl diff, %%ecx \n\t" // (diff is in eax) */ 1349 "movl %%eax, %%ecx \n\t" 1350 "cmpl $0, %%ecx \n\t" 1351 "jz end48 \n\t" 1352/* preload "movl mask, %%edx \n\t" */ 1353 "sall $24, %%edx \n\t" /* make low byte, high byte */ 1354 1355 "secondloop48: \n\t" 1356 "sall %%edx \n\t" /* move high bit to CF */ 1357 "jnc skip48 \n\t" /* if CF = 0 */ 1358 "movl (%%esi), %%eax \n\t" 1359 "movl %%eax, (%%edi) \n\t" 1360 1361 "skip48: \n\t" 1362 "addl $4, %%esi \n\t" 1363 "addl $4, %%edi \n\t" 1364 "decl %%ecx \n\t" 1365 "jnz secondloop48 \n\t" 1366 1367 "end48: \n\t" 1368 "EMMS \n\t" /* DONE */ 1369 1370 : "=a" (dummy_value_a), /* output regs (dummy) */ 1371 "=d" (dummy_value_d), 1372 "=c" (dummy_value_c), 1373 "=S" (dummy_value_S), 1374 "=D" (dummy_value_D) 1375 1376 : "3" (srcptr), /* esi // input regs */ 1377 "4" (dstptr), /* edi */ 1378 "0" (diff), /* eax */ 1379/* was (unmask) "b" RESERVED // ebx // Global Offset Table idx */ 1380 "2" (len), /* ecx */ 1381 "1" (mask) /* edx */ 1382 1383#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 1384 : "%mm0", "%mm1", "%mm2", "%mm3" /* clobber list */ 1385 , "%mm4", "%mm5", "%mm6", "%mm7" 1386#endif 1387 ); 1388 } 1389 else /* mmx _not supported - Use modified C routine */ 1390#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 1391 { 1392 register png_uint_32 i; 1393 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass]; 1394 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1395 register int stride = BPP6 * png_pass_inc[png_ptr->pass]; 1396 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1397 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass]; 1398 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1399 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 1400 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1401 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */ 1402 1403 srcptr = png_ptr->row_buf + 1 + initial_val; 1404 dstptr = row + initial_val; 1405 1406 for (i = initial_val; i < final_val; i += stride) 1407 { 1408 png_memcpy(dstptr, srcptr, rep_bytes); 1409 srcptr += stride; 1410 dstptr += stride; 1411 } 1412 if (diff) /* number of leftover pixels: 3 for pngtest */ 1413 { 1414 final_val+=diff*BPP6; 1415 for (; i < final_val; i += stride) 1416 { 1417 if (rep_bytes > (int)(final_val-i)) 1418 rep_bytes = (int)(final_val-i); 1419 png_memcpy(dstptr, srcptr, rep_bytes); 1420 srcptr += stride; 1421 dstptr += stride; 1422 } 1423 } 1424 } /* end of else (_mmx_supported) */ 1425 1426 break; 1427 } /* end 48 bpp */ 1428 1429 case 64: /* png_ptr->row_info.pixel_depth */ 1430 { 1431 png_bytep srcptr; 1432 png_bytep dstptr; 1433 register png_uint_32 i; 1434 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass]; 1435 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */ 1436 register int stride = BPP8 * png_pass_inc[png_ptr->pass]; 1437 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1438 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass]; 1439 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */ 1440 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */ 1441 int diff = (int) (png_ptr->width & 7); /* amount lost */ 1442 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */ 1443 1444 srcptr = png_ptr->row_buf + 1 + initial_val; 1445 dstptr = row + initial_val; 1446 1447 for (i = initial_val; i < final_val; i += stride) 1448 { 1449 png_memcpy(dstptr, srcptr, rep_bytes); 1450 srcptr += stride; 1451 dstptr += stride; 1452 } 1453 if (diff) /* number of leftover pixels: 3 for pngtest */ 1454 { 1455 final_val+=diff*BPP8; 1456 for (; i < final_val; i += stride) 1457 { 1458 if (rep_bytes > (int)(final_val-i)) 1459 rep_bytes = (int)(final_val-i); 1460 png_memcpy(dstptr, srcptr, rep_bytes); 1461 srcptr += stride; 1462 dstptr += stride; 1463 } 1464 } 1465 1466 break; 1467 } /* end 64 bpp */ 1468 1469 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */ 1470 { 1471 /* this should never happen */ 1472 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd"); 1473 break; 1474 } 1475 } /* end switch (png_ptr->row_info.pixel_depth) */ 1476 1477 } /* end if (non-trivial mask) */ 1478 1479} /* end png_combine_row() */ 1480 1481#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */ 1482 1483 1484 1485 1486/*===========================================================================*/ 1487/* */ 1488/* P N G _ D O _ R E A D _ I N T E R L A C E */ 1489/* */ 1490/*===========================================================================*/ 1491 1492#if defined(PNG_READ_INTERLACING_SUPPORTED) 1493#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE) 1494 1495/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion 1496 * has taken place. [GRR: what other steps come before and/or after?] 1497 */ 1498 1499void /* PRIVATE */ 1500png_do_read_interlace(png_structp png_ptr) 1501{ 1502 png_row_infop row_info = &(png_ptr->row_info); 1503 png_bytep row = png_ptr->row_buf + 1; 1504 int pass = png_ptr->pass; 1505#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1506 png_uint_32 transformations = png_ptr->transformations; 1507#endif 1508 1509 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n"); 1510 1511#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 1512 if (_mmx_supported == 2) { 1513#if !defined(PNG_1_0_X) 1514 /* this should have happened in png_init_mmx_flags() already */ 1515 png_warning(png_ptr, "asm_flags may not have been initialized"); 1516#endif 1517 png_mmx_support(); 1518 } 1519#endif 1520 1521 if (row != NULL && row_info != NULL) 1522 { 1523 png_uint_32 final_width; 1524 1525 final_width = row_info->width * png_pass_inc[pass]; 1526 1527 switch (row_info->pixel_depth) 1528 { 1529 case 1: 1530 { 1531 png_bytep sp, dp; 1532 int sshift, dshift; 1533 int s_start, s_end, s_inc; 1534 png_byte v; 1535 png_uint_32 i; 1536 int j; 1537 1538 sp = row + (png_size_t)((row_info->width - 1) >> 3); 1539 dp = row + (png_size_t)((final_width - 1) >> 3); 1540#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1541 if (transformations & PNG_PACKSWAP) 1542 { 1543 sshift = (int)((row_info->width + 7) & 7); 1544 dshift = (int)((final_width + 7) & 7); 1545 s_start = 7; 1546 s_end = 0; 1547 s_inc = -1; 1548 } 1549 else 1550#endif 1551 { 1552 sshift = 7 - (int)((row_info->width + 7) & 7); 1553 dshift = 7 - (int)((final_width + 7) & 7); 1554 s_start = 0; 1555 s_end = 7; 1556 s_inc = 1; 1557 } 1558 1559 for (i = row_info->width; i; i--) 1560 { 1561 v = (png_byte)((*sp >> sshift) & 0x1); 1562 for (j = 0; j < png_pass_inc[pass]; j++) 1563 { 1564 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff); 1565 *dp |= (png_byte)(v << dshift); 1566 if (dshift == s_end) 1567 { 1568 dshift = s_start; 1569 dp--; 1570 } 1571 else 1572 dshift += s_inc; 1573 } 1574 if (sshift == s_end) 1575 { 1576 sshift = s_start; 1577 sp--; 1578 } 1579 else 1580 sshift += s_inc; 1581 } 1582 break; 1583 } 1584 1585 case 2: 1586 { 1587 png_bytep sp, dp; 1588 int sshift, dshift; 1589 int s_start, s_end, s_inc; 1590 png_uint_32 i; 1591 1592 sp = row + (png_size_t)((row_info->width - 1) >> 2); 1593 dp = row + (png_size_t)((final_width - 1) >> 2); 1594#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1595 if (transformations & PNG_PACKSWAP) 1596 { 1597 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1); 1598 dshift = (png_size_t)(((final_width + 3) & 3) << 1); 1599 s_start = 6; 1600 s_end = 0; 1601 s_inc = -2; 1602 } 1603 else 1604#endif 1605 { 1606 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1); 1607 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1); 1608 s_start = 0; 1609 s_end = 6; 1610 s_inc = 2; 1611 } 1612 1613 for (i = row_info->width; i; i--) 1614 { 1615 png_byte v; 1616 int j; 1617 1618 v = (png_byte)((*sp >> sshift) & 0x3); 1619 for (j = 0; j < png_pass_inc[pass]; j++) 1620 { 1621 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff); 1622 *dp |= (png_byte)(v << dshift); 1623 if (dshift == s_end) 1624 { 1625 dshift = s_start; 1626 dp--; 1627 } 1628 else 1629 dshift += s_inc; 1630 } 1631 if (sshift == s_end) 1632 { 1633 sshift = s_start; 1634 sp--; 1635 } 1636 else 1637 sshift += s_inc; 1638 } 1639 break; 1640 } 1641 1642 case 4: 1643 { 1644 png_bytep sp, dp; 1645 int sshift, dshift; 1646 int s_start, s_end, s_inc; 1647 png_uint_32 i; 1648 1649 sp = row + (png_size_t)((row_info->width - 1) >> 1); 1650 dp = row + (png_size_t)((final_width - 1) >> 1); 1651#if defined(PNG_READ_PACKSWAP_SUPPORTED) 1652 if (transformations & PNG_PACKSWAP) 1653 { 1654 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2); 1655 dshift = (png_size_t)(((final_width + 1) & 1) << 2); 1656 s_start = 4; 1657 s_end = 0; 1658 s_inc = -4; 1659 } 1660 else 1661#endif 1662 { 1663 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2); 1664 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2); 1665 s_start = 0; 1666 s_end = 4; 1667 s_inc = 4; 1668 } 1669 1670 for (i = row_info->width; i; i--) 1671 { 1672 png_byte v; 1673 int j; 1674 1675 v = (png_byte)((*sp >> sshift) & 0xf); 1676 for (j = 0; j < png_pass_inc[pass]; j++) 1677 { 1678 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff); 1679 *dp |= (png_byte)(v << dshift); 1680 if (dshift == s_end) 1681 { 1682 dshift = s_start; 1683 dp--; 1684 } 1685 else 1686 dshift += s_inc; 1687 } 1688 if (sshift == s_end) 1689 { 1690 sshift = s_start; 1691 sp--; 1692 } 1693 else 1694 sshift += s_inc; 1695 } 1696 break; 1697 } 1698 1699 /*====================================================================*/ 1700 1701 default: /* 8-bit or larger (this is where the routine is modified) */ 1702 { 1703#if 0 1704/* static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */ 1705/* static unsigned long long const4 = 0x0000000000FFFFFFLL; no good */ 1706/* unsigned long long _const4 = 0x0000000000FFFFFFLL; no good */ 1707/* unsigned long long const4 = 0x0000000000FFFFFFLL; no good */ 1708#endif 1709 png_bytep sptr, dp; 1710 png_uint_32 i; 1711 png_size_t pixel_bytes; 1712 int width = (int)row_info->width; 1713 1714 pixel_bytes = (row_info->pixel_depth >> 3); 1715 1716 /* point sptr at the last pixel in the pre-expanded row: */ 1717 sptr = row + (width - 1) * pixel_bytes; 1718 1719 /* point dp at the last pixel position in the expanded row: */ 1720 dp = row + (final_width - 1) * pixel_bytes; 1721 1722 /* New code by Nirav Chhatrapati - Intel Corporation */ 1723 1724#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 1725#if !defined(PNG_1_0_X) 1726 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE) 1727 /* && _mmx_supported */ ) 1728#else 1729 if (_mmx_supported) 1730#endif 1731 { 1732 //-------------------------------------------------------------- 1733 if (pixel_bytes == 3) 1734 { 1735 if (((pass == 0) || (pass == 1)) && width) 1736 { 1737 int dummy_value_c; /* fix 'forbidden register spilled' */ 1738 int dummy_value_S; 1739 int dummy_value_D; 1740 1741 __asm__ __volatile__ ( 1742 "subl $21, %%edi \n\t" 1743 /* (png_pass_inc[pass] - 1)*pixel_bytes */ 1744 1745 ".loop3_pass0: \n\t" 1746 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */ 1747 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */ 1748 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */ 1749 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */ 1750 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */ 1751 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */ 1752 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */ 1753 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */ 1754 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */ 1755 "movq %%mm0, %%mm3 \n\t" /* 2 1 0 2 1 0 2 1 */ 1756 "psllq $16, %%mm0 \n\t" /* 0 2 1 0 2 1 z z */ 1757 "movq %%mm3, %%mm4 \n\t" /* 2 1 0 2 1 0 2 1 */ 1758 "punpckhdq %%mm0, %%mm3 \n\t" /* 0 2 1 0 2 1 0 2 */ 1759 "movq %%mm4, 16(%%edi) \n\t" 1760 "psrlq $32, %%mm0 \n\t" /* z z z z 0 2 1 0 */ 1761 "movq %%mm3, 8(%%edi) \n\t" 1762 "punpckldq %%mm4, %%mm0 \n\t" /* 1 0 2 1 0 2 1 0 */ 1763 "subl $3, %%esi \n\t" 1764 "movq %%mm0, (%%edi) \n\t" 1765 "subl $24, %%edi \n\t" 1766 "decl %%ecx \n\t" 1767 "jnz .loop3_pass0 \n\t" 1768 "EMMS \n\t" /* DONE */ 1769 1770 : "=c" (dummy_value_c), /* output regs (dummy) */ 1771 "=S" (dummy_value_S), 1772 "=D" (dummy_value_D) 1773 1774 : "1" (sptr), // esi // input regs 1775 "2" (dp), // edi 1776 "0" (width), // ecx 1777 "rim" (_const4) // %1(?) (0x0000000000FFFFFFLL) 1778 1779#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */ 1780 : "%mm0", "%mm1", "%mm2" /* clobber list */ 1781 , "%mm3", "%mm4" 1782#endif 1783 ); 1784 } 1785 else if (((pass == 2) || (pass == 3)) && width) 1786 { 1787 int dummy_value_c; /* fix 'forbidden register spilled' */ 1788 int dummy_value_S; 1789 int dummy_value_D; 1790 1791 __asm__ __volatile__ ( 1792 "subl $9, %%edi \n\t" 1793 /* (png_pass_inc[pass] - 1)*pixel_bytes */ 1794 1795 ".loop3_pass2: \n\t" 1796 "movd (%%esi), %%mm0 \n\t" /* x x x x x 2 1 0 */ 1797 "pand _const4, %%mm0 \n\t" /* z z z z z 2 1 0 */ 1798 "movq %%mm0, %%mm1 \n\t" /* z z z z z 2 1 0 */ 1799 "psllq $16, %%mm0 \n\t" /* z z z 2 1 0 z z */ 1800 "movq %%mm0, %%mm2 \n\t" /* z z z 2 1 0 z z */ 1801 "psllq $24, %%mm0 \n\t" /* 2 1 0 z z z z z */ 1802 "psrlq $8, %%mm1 \n\t" /* z z z z z z 2 1 */ 1803 "por %%mm2, %%mm0 \n\t" /* 2 1 0 2 1 0 z z */ 1804 "por %%mm1, %%mm0 \n\t" /* 2 1 0 2 1 0 2 1 */ 1805 "movq %%mm0, 4(%%edi) \n\t" 1806 "psrlq $16, %%mm0 \n\t" /* z z 2 1 0 2 1 0 */ 1807 "subl $3, %%esi \n\t" 1808 "movd %%mm0, (%%edi) \n\t" 1809 "subl $12, %%edi \n\t" 1810 "decl %%ecx \n\t" 1811 "jnz .loop3_pass2 \n\t" 1812 "EMMS \n\t" /* DONE */ 1813 1814 : "=c" (dummy_value_c), /* output regs (dummy) */ 1815 "=S" (dummy_value_S), 1816 "=D" (dummy_value_D) 1817 1818 : "1" (sptr), // esi // input regs 1819 "2" (dp), // edi 1820 "0" (width), // ecx 1821 "rim" (_const4) // (0x0000000000FFFFFFLL) 1822 1823#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */ 1824 : "%mm0", "%mm1", "%mm2" /* clobber list */ 1825#endif 1826 ); 1827 } 1828 else if (width) /* && ((pass == 4) || (pass == 5)) */ 1829 { 1830 int width_mmx = ((width >> 1) << 1) - 8; /* GRR: huh? */ 1831 if (width_mmx < 0) 1832 width_mmx = 0; 1833 width -= width_mmx; /* 8 or 9 pix, 24 or 27 bytes */ 1834 if (width_mmx) 1835 { 1836 /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */ 1837 /* sptr points at last pixel in pre-expanded row */ 1838 /* dp points at last pixel position in expanded row */ 1839 int dummy_value_c; /* fix 'forbidden register spilled' */ 1840 int dummy_value_S; 1841 int dummy_value_D; 1842 1843 __asm__ __volatile__ ( 1844 "subl $3, %%esi \n\t" 1845 "subl $9, %%edi \n\t" 1846 /* (png_pass_inc[pass] + 1)*pixel_bytes */ 1847 1848 ".loop3_pass4: \n\t" 1849 "movq (%%esi), %%mm0 \n\t" /* x x 5 4 3 2 1 0 */ 1850 "movq %%mm0, %%mm1 \n\t" /* x x 5 4 3 2 1 0 */ 1851 "movq %%mm0, %%mm2 \n\t" /* x x 5 4 3 2 1 0 */ 1852 "psllq $24, %%mm0 \n\t" /* 4 3 2 1 0 z z z */ 1853 "pand _const4, %%mm1 \n\t" /* z z z z z 2 1 0 */ 1854 "psrlq $24, %%mm2 \n\t" /* z z z x x 5 4 3 */ 1855 "por %%mm1, %%mm0 \n\t" /* 4 3 2 1 0 2 1 0 */ 1856 "movq %%mm2, %%mm3 \n\t" /* z z z x x 5 4 3 */ 1857 "psllq $8, %%mm2 \n\t" /* z z x x 5 4 3 z */ 1858 "movq %%mm0, (%%edi) \n\t" 1859 "psrlq $16, %%mm3 \n\t" /* z z z z z x x 5 */ 1860 "pand _const6, %%mm3 \n\t" /* z z z z z z z 5 */ 1861 "por %%mm3, %%mm2 \n\t" /* z z x x 5 4 3 5 */ 1862 "subl $6, %%esi \n\t" 1863 "movd %%mm2, 8(%%edi) \n\t" 1864 "subl $12, %%edi \n\t" 1865 "subl $2, %%ecx \n\t" 1866 "jnz .loop3_pass4 \n\t" 1867 "EMMS \n\t" /* DONE */ 1868 1869 : "=c" (dummy_value_c), /* output regs (dummy) */ 1870 "=S" (dummy_value_S), 1871 "=D" (dummy_value_D) 1872 1873 : "1" (sptr), // esi // input regs 1874 "2" (dp), // edi 1875 "0" (width_mmx), // ecx 1876 "rim" (_const4), // 0x0000000000FFFFFFLL 1877 "rim" (_const6) // 0x00000000000000FFLL 1878 1879#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */ 1880 : "%mm0", "%mm1" /* clobber list */ 1881 , "%mm2", "%mm3" 1882#endif 1883 ); 1884 } 1885 1886 sptr -= width_mmx*3; 1887 dp -= width_mmx*6; 1888 for (i = width; i; i--) 1889 { 1890 png_byte v[8]; 1891 int j; 1892 1893 png_memcpy(v, sptr, 3); 1894 for (j = 0; j < png_pass_inc[pass]; j++) 1895 { 1896 png_memcpy(dp, v, 3); 1897 dp -= 3; 1898 } 1899 sptr -= 3; 1900 } 1901 } 1902 } /* end of pixel_bytes == 3 */ 1903 1904 //-------------------------------------------------------------- 1905 else if (pixel_bytes == 1) 1906 { 1907 if (((pass == 0) || (pass == 1)) && width) 1908 { 1909 int width_mmx = ((width >> 2) << 2); 1910 width -= width_mmx; /* 0-3 pixels => 0-3 bytes */ 1911 if (width_mmx) 1912 { 1913 int dummy_value_c; /* fix 'forbidden register spilled' */ 1914 int dummy_value_S; 1915 int dummy_value_D; 1916 1917 __asm__ __volatile__ ( 1918 "subl $3, %%esi \n\t" 1919 "subl $31, %%edi \n\t" 1920 1921 ".loop1_pass0: \n\t" 1922 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */ 1923 "movq %%mm0, %%mm1 \n\t" /* x x x x 3 2 1 0 */ 1924 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */ 1925 "movq %%mm0, %%mm2 \n\t" /* 3 3 2 2 1 1 0 0 */ 1926 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */ 1927 "movq %%mm0, %%mm3 \n\t" /* 1 1 1 1 0 0 0 0 */ 1928 "punpckldq %%mm0, %%mm0 \n\t" /* 0 0 0 0 0 0 0 0 */ 1929 "punpckhdq %%mm3, %%mm3 \n\t" /* 1 1 1 1 1 1 1 1 */ 1930 "movq %%mm0, (%%edi) \n\t" 1931 "punpckhwd %%mm2, %%mm2 \n\t" /* 3 3 3 3 2 2 2 2 */ 1932 "movq %%mm3, 8(%%edi) \n\t" 1933 "movq %%mm2, %%mm4 \n\t" /* 3 3 3 3 2 2 2 2 */ 1934 "punpckldq %%mm2, %%mm2 \n\t" /* 2 2 2 2 2 2 2 2 */ 1935 "punpckhdq %%mm4, %%mm4 \n\t" /* 3 3 3 3 3 3 3 3 */ 1936 "movq %%mm2, 16(%%edi) \n\t" 1937 "subl $4, %%esi \n\t" 1938 "movq %%mm4, 24(%%edi) \n\t" 1939 "subl $32, %%edi \n\t" 1940 "subl $4, %%ecx \n\t" 1941 "jnz .loop1_pass0 \n\t" 1942 "EMMS \n\t" /* DONE */ 1943 1944 : "=c" (dummy_value_c), /* output regs (dummy) */ 1945 "=S" (dummy_value_S), 1946 "=D" (dummy_value_D) 1947 1948 : "1" (sptr), /* esi // input regs */ 1949 "2" (dp), /* edi */ 1950 "0" (width_mmx) /* ecx */ 1951 1952#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */ 1953 : "%mm0", "%mm1", "%mm2" /* clobber list */ 1954 , "%mm3", "%mm4" 1955#endif 1956 ); 1957 } 1958 1959 sptr -= width_mmx; 1960 dp -= width_mmx*8; 1961 for (i = width; i; i--) 1962 { 1963 int j; 1964 1965 /* I simplified this part in version 1.0.4e 1966 * here and in several other instances where 1967 * pixel_bytes == 1 -- GR-P 1968 * 1969 * Original code: 1970 * 1971 * png_byte v[8]; 1972 * png_memcpy(v, sptr, pixel_bytes); 1973 * for (j = 0; j < png_pass_inc[pass]; j++) 1974 * { 1975 * png_memcpy(dp, v, pixel_bytes); 1976 * dp -= pixel_bytes; 1977 * } 1978 * sptr -= pixel_bytes; 1979 * 1980 * Replacement code is in the next three lines: 1981 */ 1982 1983 for (j = 0; j < png_pass_inc[pass]; j++) 1984 { 1985 *dp-- = *sptr; 1986 } 1987 --sptr; 1988 } 1989 } 1990 else if (((pass == 2) || (pass == 3)) && width) 1991 { 1992 int width_mmx = ((width >> 2) << 2); 1993 width -= width_mmx; /* 0-3 pixels => 0-3 bytes */ 1994 if (width_mmx) 1995 { 1996 int dummy_value_c; /* fix 'forbidden register spilled' */ 1997 int dummy_value_S; 1998 int dummy_value_D; 1999 2000 __asm__ __volatile__ ( 2001 "subl $3, %%esi \n\t" 2002 "subl $15, %%edi \n\t" 2003 2004 ".loop1_pass2: \n\t" 2005 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */ 2006 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */ 2007 "movq %%mm0, %%mm1 \n\t" /* 3 3 2 2 1 1 0 0 */ 2008 "punpcklwd %%mm0, %%mm0 \n\t" /* 1 1 1 1 0 0 0 0 */ 2009 "punpckhwd %%mm1, %%mm1 \n\t" /* 3 3 3 3 2 2 2 2 */ 2010 "movq %%mm0, (%%edi) \n\t" 2011 "subl $4, %%esi \n\t" 2012 "movq %%mm1, 8(%%edi) \n\t" 2013 "subl $16, %%edi \n\t" 2014 "subl $4, %%ecx \n\t" 2015 "jnz .loop1_pass2 \n\t" 2016 "EMMS \n\t" /* DONE */ 2017 2018 : "=c" (dummy_value_c), /* output regs (dummy) */ 2019 "=S" (dummy_value_S), 2020 "=D" (dummy_value_D) 2021 2022 : "1" (sptr), /* esi // input regs */ 2023 "2" (dp), /* edi */ 2024 "0" (width_mmx) /* ecx */ 2025 2026#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2027 : "%mm0", "%mm1" /* clobber list */ 2028#endif 2029 ); 2030 } 2031 2032 sptr -= width_mmx; 2033 dp -= width_mmx*4; 2034 for (i = width; i; i--) 2035 { 2036 int j; 2037 2038 for (j = 0; j < png_pass_inc[pass]; j++) 2039 { 2040 *dp-- = *sptr; 2041 } 2042 --sptr; 2043 } 2044 } 2045 else if (width) /* && ((pass == 4) || (pass == 5)) */ 2046 { 2047 int width_mmx = ((width >> 3) << 3); 2048 width -= width_mmx; /* 0-3 pixels => 0-3 bytes */ 2049 if (width_mmx) 2050 { 2051 int dummy_value_c; /* fix 'forbidden register spilled' */ 2052 int dummy_value_S; 2053 int dummy_value_D; 2054 2055 __asm__ __volatile__ ( 2056 "subl $7, %%esi \n\t" 2057 "subl $15, %%edi \n\t" 2058 2059 ".loop1_pass4: \n\t" 2060 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2061 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */ 2062 "punpcklbw %%mm0, %%mm0 \n\t" /* 3 3 2 2 1 1 0 0 */ 2063 "punpckhbw %%mm1, %%mm1 \n\t" /* 7 7 6 6 5 5 4 4 */ 2064 "movq %%mm1, 8(%%edi) \n\t" 2065 "subl $8, %%esi \n\t" 2066 "movq %%mm0, (%%edi) \n\t" 2067 "subl $16, %%edi \n\t" 2068 "subl $8, %%ecx \n\t" 2069 "jnz .loop1_pass4 \n\t" 2070 "EMMS \n\t" /* DONE */ 2071 2072 : "=c" (dummy_value_c), /* output regs (none) */ 2073 "=S" (dummy_value_S), 2074 "=D" (dummy_value_D) 2075 2076 : "1" (sptr), /* esi // input regs */ 2077 "2" (dp), /* edi */ 2078 "0" (width_mmx) /* ecx */ 2079 2080#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2081 : "%mm0", "%mm1" /* clobber list */ 2082#endif 2083 ); 2084 } 2085 2086 sptr -= width_mmx; 2087 dp -= width_mmx*2; 2088 for (i = width; i; i--) 2089 { 2090 int j; 2091 2092 for (j = 0; j < png_pass_inc[pass]; j++) 2093 { 2094 *dp-- = *sptr; 2095 } 2096 --sptr; 2097 } 2098 } 2099 } /* end of pixel_bytes == 1 */ 2100 2101 //-------------------------------------------------------------- 2102 else if (pixel_bytes == 2) 2103 { 2104 if (((pass == 0) || (pass == 1)) && width) 2105 { 2106 int width_mmx = ((width >> 1) << 1); 2107 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */ 2108 if (width_mmx) 2109 { 2110 int dummy_value_c; /* fix 'forbidden register spilled' */ 2111 int dummy_value_S; 2112 int dummy_value_D; 2113 2114 __asm__ __volatile__ ( 2115 "subl $2, %%esi \n\t" 2116 "subl $30, %%edi \n\t" 2117 2118 ".loop2_pass0: \n\t" 2119 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */ 2120 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */ 2121 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */ 2122 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */ 2123 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */ 2124 "movq %%mm0, (%%edi) \n\t" 2125 "movq %%mm0, 8(%%edi) \n\t" 2126 "movq %%mm1, 16(%%edi) \n\t" 2127 "subl $4, %%esi \n\t" 2128 "movq %%mm1, 24(%%edi) \n\t" 2129 "subl $32, %%edi \n\t" 2130 "subl $2, %%ecx \n\t" 2131 "jnz .loop2_pass0 \n\t" 2132 "EMMS \n\t" /* DONE */ 2133 2134 : "=c" (dummy_value_c), /* output regs (dummy) */ 2135 "=S" (dummy_value_S), 2136 "=D" (dummy_value_D) 2137 2138 : "1" (sptr), /* esi // input regs */ 2139 "2" (dp), /* edi */ 2140 "0" (width_mmx) /* ecx */ 2141 2142#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2143 : "%mm0", "%mm1" /* clobber list */ 2144#endif 2145 ); 2146 } 2147 2148 sptr -= (width_mmx*2 - 2); /* sign fixed */ 2149 dp -= (width_mmx*16 - 2); /* sign fixed */ 2150 for (i = width; i; i--) 2151 { 2152 png_byte v[8]; 2153 int j; 2154 sptr -= 2; 2155 png_memcpy(v, sptr, 2); 2156 for (j = 0; j < png_pass_inc[pass]; j++) 2157 { 2158 dp -= 2; 2159 png_memcpy(dp, v, 2); 2160 } 2161 } 2162 } 2163 else if (((pass == 2) || (pass == 3)) && width) 2164 { 2165 int width_mmx = ((width >> 1) << 1) ; 2166 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */ 2167 if (width_mmx) 2168 { 2169 int dummy_value_c; /* fix 'forbidden register spilled' */ 2170 int dummy_value_S; 2171 int dummy_value_D; 2172 2173 __asm__ __volatile__ ( 2174 "subl $2, %%esi \n\t" 2175 "subl $14, %%edi \n\t" 2176 2177 ".loop2_pass2: \n\t" 2178 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */ 2179 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */ 2180 "movq %%mm0, %%mm1 \n\t" /* 3 2 3 2 1 0 1 0 */ 2181 "punpckldq %%mm0, %%mm0 \n\t" /* 1 0 1 0 1 0 1 0 */ 2182 "punpckhdq %%mm1, %%mm1 \n\t" /* 3 2 3 2 3 2 3 2 */ 2183 "movq %%mm0, (%%edi) \n\t" 2184 "subl $4, %%esi \n\t" 2185 "movq %%mm1, 8(%%edi) \n\t" 2186 "subl $16, %%edi \n\t" 2187 "subl $2, %%ecx \n\t" 2188 "jnz .loop2_pass2 \n\t" 2189 "EMMS \n\t" /* DONE */ 2190 2191 : "=c" (dummy_value_c), /* output regs (dummy) */ 2192 "=S" (dummy_value_S), 2193 "=D" (dummy_value_D) 2194 2195 : "1" (sptr), /* esi // input regs */ 2196 "2" (dp), /* edi */ 2197 "0" (width_mmx) /* ecx */ 2198 2199#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2200 : "%mm0", "%mm1" /* clobber list */ 2201#endif 2202 ); 2203 } 2204 2205 sptr -= (width_mmx*2 - 2); /* sign fixed */ 2206 dp -= (width_mmx*8 - 2); /* sign fixed */ 2207 for (i = width; i; i--) 2208 { 2209 png_byte v[8]; 2210 int j; 2211 sptr -= 2; 2212 png_memcpy(v, sptr, 2); 2213 for (j = 0; j < png_pass_inc[pass]; j++) 2214 { 2215 dp -= 2; 2216 png_memcpy(dp, v, 2); 2217 } 2218 } 2219 } 2220 else if (width) /* pass == 4 or 5 */ 2221 { 2222 int width_mmx = ((width >> 1) << 1) ; 2223 width -= width_mmx; /* 0,1 pixels => 0,2 bytes */ 2224 if (width_mmx) 2225 { 2226 int dummy_value_c; /* fix 'forbidden register spilled' */ 2227 int dummy_value_S; 2228 int dummy_value_D; 2229 2230 __asm__ __volatile__ ( 2231 "subl $2, %%esi \n\t" 2232 "subl $6, %%edi \n\t" 2233 2234 ".loop2_pass4: \n\t" 2235 "movd (%%esi), %%mm0 \n\t" /* x x x x 3 2 1 0 */ 2236 "punpcklwd %%mm0, %%mm0 \n\t" /* 3 2 3 2 1 0 1 0 */ 2237 "subl $4, %%esi \n\t" 2238 "movq %%mm0, (%%edi) \n\t" 2239 "subl $8, %%edi \n\t" 2240 "subl $2, %%ecx \n\t" 2241 "jnz .loop2_pass4 \n\t" 2242 "EMMS \n\t" /* DONE */ 2243 2244 : "=c" (dummy_value_c), /* output regs (dummy) */ 2245 "=S" (dummy_value_S), 2246 "=D" (dummy_value_D) 2247 2248 : "1" (sptr), /* esi // input regs */ 2249 "2" (dp), /* edi */ 2250 "0" (width_mmx) /* ecx */ 2251 2252#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2253 : "%mm0" /* clobber list */ 2254#endif 2255 ); 2256 } 2257 2258 sptr -= (width_mmx*2 - 2); /* sign fixed */ 2259 dp -= (width_mmx*4 - 2); /* sign fixed */ 2260 for (i = width; i; i--) 2261 { 2262 png_byte v[8]; 2263 int j; 2264 sptr -= 2; 2265 png_memcpy(v, sptr, 2); 2266 for (j = 0; j < png_pass_inc[pass]; j++) 2267 { 2268 dp -= 2; 2269 png_memcpy(dp, v, 2); 2270 } 2271 } 2272 } 2273 } /* end of pixel_bytes == 2 */ 2274 2275 //-------------------------------------------------------------- 2276 else if (pixel_bytes == 4) 2277 { 2278 if (((pass == 0) || (pass == 1)) && width) 2279 { 2280 int width_mmx = ((width >> 1) << 1); 2281 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */ 2282 if (width_mmx) 2283 { 2284 int dummy_value_c; /* fix 'forbidden register spilled' */ 2285 int dummy_value_S; 2286 int dummy_value_D; 2287 2288 __asm__ __volatile__ ( 2289 "subl $4, %%esi \n\t" 2290 "subl $60, %%edi \n\t" 2291 2292 ".loop4_pass0: \n\t" 2293 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2294 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */ 2295 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */ 2296 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */ 2297 "movq %%mm0, (%%edi) \n\t" 2298 "movq %%mm0, 8(%%edi) \n\t" 2299 "movq %%mm0, 16(%%edi) \n\t" 2300 "movq %%mm0, 24(%%edi) \n\t" 2301 "movq %%mm1, 32(%%edi) \n\t" 2302 "movq %%mm1, 40(%%edi) \n\t" 2303 "movq %%mm1, 48(%%edi) \n\t" 2304 "subl $8, %%esi \n\t" 2305 "movq %%mm1, 56(%%edi) \n\t" 2306 "subl $64, %%edi \n\t" 2307 "subl $2, %%ecx \n\t" 2308 "jnz .loop4_pass0 \n\t" 2309 "EMMS \n\t" /* DONE */ 2310 2311 : "=c" (dummy_value_c), /* output regs (dummy) */ 2312 "=S" (dummy_value_S), 2313 "=D" (dummy_value_D) 2314 2315 : "1" (sptr), /* esi // input regs */ 2316 "2" (dp), /* edi */ 2317 "0" (width_mmx) /* ecx */ 2318 2319#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2320 : "%mm0", "%mm1" /* clobber list */ 2321#endif 2322 ); 2323 } 2324 2325 sptr -= (width_mmx*4 - 4); /* sign fixed */ 2326 dp -= (width_mmx*32 - 4); /* sign fixed */ 2327 for (i = width; i; i--) 2328 { 2329 png_byte v[8]; 2330 int j; 2331 sptr -= 4; 2332 png_memcpy(v, sptr, 4); 2333 for (j = 0; j < png_pass_inc[pass]; j++) 2334 { 2335 dp -= 4; 2336 png_memcpy(dp, v, 4); 2337 } 2338 } 2339 } 2340 else if (((pass == 2) || (pass == 3)) && width) 2341 { 2342 int width_mmx = ((width >> 1) << 1); 2343 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */ 2344 if (width_mmx) 2345 { 2346 int dummy_value_c; /* fix 'forbidden register spilled' */ 2347 int dummy_value_S; 2348 int dummy_value_D; 2349 2350 __asm__ __volatile__ ( 2351 "subl $4, %%esi \n\t" 2352 "subl $28, %%edi \n\t" 2353 2354 ".loop4_pass2: \n\t" 2355 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2356 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */ 2357 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */ 2358 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */ 2359 "movq %%mm0, (%%edi) \n\t" 2360 "movq %%mm0, 8(%%edi) \n\t" 2361 "movq %%mm1, 16(%%edi) \n\t" 2362 "movq %%mm1, 24(%%edi) \n\t" 2363 "subl $8, %%esi \n\t" 2364 "subl $32, %%edi \n\t" 2365 "subl $2, %%ecx \n\t" 2366 "jnz .loop4_pass2 \n\t" 2367 "EMMS \n\t" /* DONE */ 2368 2369 : "=c" (dummy_value_c), /* output regs (dummy) */ 2370 "=S" (dummy_value_S), 2371 "=D" (dummy_value_D) 2372 2373 : "1" (sptr), /* esi // input regs */ 2374 "2" (dp), /* edi */ 2375 "0" (width_mmx) /* ecx */ 2376 2377#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2378 : "%mm0", "%mm1" /* clobber list */ 2379#endif 2380 ); 2381 } 2382 2383 sptr -= (width_mmx*4 - 4); /* sign fixed */ 2384 dp -= (width_mmx*16 - 4); /* sign fixed */ 2385 for (i = width; i; i--) 2386 { 2387 png_byte v[8]; 2388 int j; 2389 sptr -= 4; 2390 png_memcpy(v, sptr, 4); 2391 for (j = 0; j < png_pass_inc[pass]; j++) 2392 { 2393 dp -= 4; 2394 png_memcpy(dp, v, 4); 2395 } 2396 } 2397 } 2398 else if (width) /* pass == 4 or 5 */ 2399 { 2400 int width_mmx = ((width >> 1) << 1) ; 2401 width -= width_mmx; /* 0,1 pixels => 0,4 bytes */ 2402 if (width_mmx) 2403 { 2404 int dummy_value_c; /* fix 'forbidden register spilled' */ 2405 int dummy_value_S; 2406 int dummy_value_D; 2407 2408 __asm__ __volatile__ ( 2409 "subl $4, %%esi \n\t" 2410 "subl $12, %%edi \n\t" 2411 2412 ".loop4_pass4: \n\t" 2413 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2414 "movq %%mm0, %%mm1 \n\t" /* 7 6 5 4 3 2 1 0 */ 2415 "punpckldq %%mm0, %%mm0 \n\t" /* 3 2 1 0 3 2 1 0 */ 2416 "punpckhdq %%mm1, %%mm1 \n\t" /* 7 6 5 4 7 6 5 4 */ 2417 "movq %%mm0, (%%edi) \n\t" 2418 "subl $8, %%esi \n\t" 2419 "movq %%mm1, 8(%%edi) \n\t" 2420 "subl $16, %%edi \n\t" 2421 "subl $2, %%ecx \n\t" 2422 "jnz .loop4_pass4 \n\t" 2423 "EMMS \n\t" /* DONE */ 2424 2425 : "=c" (dummy_value_c), /* output regs (dummy) */ 2426 "=S" (dummy_value_S), 2427 "=D" (dummy_value_D) 2428 2429 : "1" (sptr), /* esi // input regs */ 2430 "2" (dp), /* edi */ 2431 "0" (width_mmx) /* ecx */ 2432 2433#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2434 : "%mm0", "%mm1" /* clobber list */ 2435#endif 2436 ); 2437 } 2438 2439 sptr -= (width_mmx*4 - 4); /* sign fixed */ 2440 dp -= (width_mmx*8 - 4); /* sign fixed */ 2441 for (i = width; i; i--) 2442 { 2443 png_byte v[8]; 2444 int j; 2445 sptr -= 4; 2446 png_memcpy(v, sptr, 4); 2447 for (j = 0; j < png_pass_inc[pass]; j++) 2448 { 2449 dp -= 4; 2450 png_memcpy(dp, v, 4); 2451 } 2452 } 2453 } 2454 } /* end of pixel_bytes == 4 */ 2455 2456 //-------------------------------------------------------------- 2457 else if (pixel_bytes == 8) 2458 { 2459/* GRR TEST: should work, but needs testing (special 64-bit version of rpng2?) */ 2460 /* GRR NOTE: no need to combine passes here! */ 2461 if (((pass == 0) || (pass == 1)) && width) 2462 { 2463 int dummy_value_c; /* fix 'forbidden register spilled' */ 2464 int dummy_value_S; 2465 int dummy_value_D; 2466 2467 /* source is 8-byte RRGGBBAA */ 2468 /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */ 2469 __asm__ __volatile__ ( 2470 "subl $56, %%edi \n\t" /* start of last block */ 2471 2472 ".loop8_pass0: \n\t" 2473 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2474 "movq %%mm0, (%%edi) \n\t" 2475 "movq %%mm0, 8(%%edi) \n\t" 2476 "movq %%mm0, 16(%%edi) \n\t" 2477 "movq %%mm0, 24(%%edi) \n\t" 2478 "movq %%mm0, 32(%%edi) \n\t" 2479 "movq %%mm0, 40(%%edi) \n\t" 2480 "movq %%mm0, 48(%%edi) \n\t" 2481 "subl $8, %%esi \n\t" 2482 "movq %%mm0, 56(%%edi) \n\t" 2483 "subl $64, %%edi \n\t" 2484 "decl %%ecx \n\t" 2485 "jnz .loop8_pass0 \n\t" 2486 "EMMS \n\t" /* DONE */ 2487 2488 : "=c" (dummy_value_c), /* output regs (dummy) */ 2489 "=S" (dummy_value_S), 2490 "=D" (dummy_value_D) 2491 2492 : "1" (sptr), /* esi // input regs */ 2493 "2" (dp), /* edi */ 2494 "0" (width) /* ecx */ 2495 2496#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2497 : "%mm0" /* clobber list */ 2498#endif 2499 ); 2500 } 2501 else if (((pass == 2) || (pass == 3)) && width) 2502 { 2503 /* source is 8-byte RRGGBBAA */ 2504 /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */ 2505 /* (recall that expansion is _in place_: sptr and dp */ 2506 /* both point at locations within same row buffer) */ 2507 { 2508 int dummy_value_c; /* fix 'forbidden register spilled' */ 2509 int dummy_value_S; 2510 int dummy_value_D; 2511 2512 __asm__ __volatile__ ( 2513 "subl $24, %%edi \n\t" /* start of last block */ 2514 2515 ".loop8_pass2: \n\t" 2516 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2517 "movq %%mm0, (%%edi) \n\t" 2518 "movq %%mm0, 8(%%edi) \n\t" 2519 "movq %%mm0, 16(%%edi) \n\t" 2520 "subl $8, %%esi \n\t" 2521 "movq %%mm0, 24(%%edi) \n\t" 2522 "subl $32, %%edi \n\t" 2523 "decl %%ecx \n\t" 2524 "jnz .loop8_pass2 \n\t" 2525 "EMMS \n\t" /* DONE */ 2526 2527 : "=c" (dummy_value_c), /* output regs (dummy) */ 2528 "=S" (dummy_value_S), 2529 "=D" (dummy_value_D) 2530 2531 : "1" (sptr), /* esi // input regs */ 2532 "2" (dp), /* edi */ 2533 "0" (width) /* ecx */ 2534 2535#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2536 : "%mm0" /* clobber list */ 2537#endif 2538 ); 2539 } 2540 } 2541 else if (width) /* pass == 4 or 5 */ 2542 { 2543 /* source is 8-byte RRGGBBAA */ 2544 /* dest is 16-byte RRGGBBAA RRGGBBAA */ 2545 { 2546 int dummy_value_c; /* fix 'forbidden register spilled' */ 2547 int dummy_value_S; 2548 int dummy_value_D; 2549 2550 __asm__ __volatile__ ( 2551 "subl $8, %%edi \n\t" /* start of last block */ 2552 2553 ".loop8_pass4: \n\t" 2554 "movq (%%esi), %%mm0 \n\t" /* 7 6 5 4 3 2 1 0 */ 2555 "movq %%mm0, (%%edi) \n\t" 2556 "subl $8, %%esi \n\t" 2557 "movq %%mm0, 8(%%edi) \n\t" 2558 "subl $16, %%edi \n\t" 2559 "decl %%ecx \n\t" 2560 "jnz .loop8_pass4 \n\t" 2561 "EMMS \n\t" /* DONE */ 2562 2563 : "=c" (dummy_value_c), /* output regs (dummy) */ 2564 "=S" (dummy_value_S), 2565 "=D" (dummy_value_D) 2566 2567 : "1" (sptr), /* esi // input regs */ 2568 "2" (dp), /* edi */ 2569 "0" (width) /* ecx */ 2570 2571#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2572 : "%mm0" /* clobber list */ 2573#endif 2574 ); 2575 } 2576 } 2577 2578 } /* end of pixel_bytes == 8 */ 2579 2580 //-------------------------------------------------------------- 2581 else if (pixel_bytes == 6) 2582 { 2583 for (i = width; i; i--) 2584 { 2585 png_byte v[8]; 2586 int j; 2587 png_memcpy(v, sptr, 6); 2588 for (j = 0; j < png_pass_inc[pass]; j++) 2589 { 2590 png_memcpy(dp, v, 6); 2591 dp -= 6; 2592 } 2593 sptr -= 6; 2594 } 2595 } /* end of pixel_bytes == 6 */ 2596 2597 //-------------------------------------------------------------- 2598 else 2599 { 2600 for (i = width; i; i--) 2601 { 2602 png_byte v[8]; 2603 int j; 2604 png_memcpy(v, sptr, pixel_bytes); 2605 for (j = 0; j < png_pass_inc[pass]; j++) 2606 { 2607 png_memcpy(dp, v, pixel_bytes); 2608 dp -= pixel_bytes; 2609 } 2610 sptr-= pixel_bytes; 2611 } 2612 } 2613 } /* end of _mmx_supported ======================================== */ 2614 2615 else /* MMX not supported: use modified C code - takes advantage 2616 * of inlining of png_memcpy for a constant */ 2617 /* GRR 19991007: does it? or should pixel_bytes in each 2618 * block be replaced with immediate value (e.g., 1)? */ 2619 /* GRR 19991017: replaced with constants in each case */ 2620#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 2621 { 2622 if (pixel_bytes == 1) 2623 { 2624 for (i = width; i; i--) 2625 { 2626 int j; 2627 for (j = 0; j < png_pass_inc[pass]; j++) 2628 { 2629 *dp-- = *sptr; 2630 } 2631 --sptr; 2632 } 2633 } 2634 else if (pixel_bytes == 3) 2635 { 2636 for (i = width; i; i--) 2637 { 2638 png_byte v[8]; 2639 int j; 2640 png_memcpy(v, sptr, 3); 2641 for (j = 0; j < png_pass_inc[pass]; j++) 2642 { 2643 png_memcpy(dp, v, 3); 2644 dp -= 3; 2645 } 2646 sptr -= 3; 2647 } 2648 } 2649 else if (pixel_bytes == 2) 2650 { 2651 for (i = width; i; i--) 2652 { 2653 png_byte v[8]; 2654 int j; 2655 png_memcpy(v, sptr, 2); 2656 for (j = 0; j < png_pass_inc[pass]; j++) 2657 { 2658 png_memcpy(dp, v, 2); 2659 dp -= 2; 2660 } 2661 sptr -= 2; 2662 } 2663 } 2664 else if (pixel_bytes == 4) 2665 { 2666 for (i = width; i; i--) 2667 { 2668 png_byte v[8]; 2669 int j; 2670 png_memcpy(v, sptr, 4); 2671 for (j = 0; j < png_pass_inc[pass]; j++) 2672 { 2673#ifdef PNG_DEBUG 2674 if (dp < row || dp+3 > row+png_ptr->row_buf_size) 2675 { 2676 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n", 2677 row, dp, row+png_ptr->row_buf_size); 2678 printf("row_buf=%d\n",png_ptr->row_buf_size); 2679 } 2680#endif 2681 png_memcpy(dp, v, 4); 2682 dp -= 4; 2683 } 2684 sptr -= 4; 2685 } 2686 } 2687 else if (pixel_bytes == 6) 2688 { 2689 for (i = width; i; i--) 2690 { 2691 png_byte v[8]; 2692 int j; 2693 png_memcpy(v, sptr, 6); 2694 for (j = 0; j < png_pass_inc[pass]; j++) 2695 { 2696 png_memcpy(dp, v, 6); 2697 dp -= 6; 2698 } 2699 sptr -= 6; 2700 } 2701 } 2702 else if (pixel_bytes == 8) 2703 { 2704 for (i = width; i; i--) 2705 { 2706 png_byte v[8]; 2707 int j; 2708 png_memcpy(v, sptr, 8); 2709 for (j = 0; j < png_pass_inc[pass]; j++) 2710 { 2711 png_memcpy(dp, v, 8); 2712 dp -= 8; 2713 } 2714 sptr -= 8; 2715 } 2716 } 2717 else /* GRR: should never be reached */ 2718 { 2719 for (i = width; i; i--) 2720 { 2721 png_byte v[8]; 2722 int j; 2723 png_memcpy(v, sptr, pixel_bytes); 2724 for (j = 0; j < png_pass_inc[pass]; j++) 2725 { 2726 png_memcpy(dp, v, pixel_bytes); 2727 dp -= pixel_bytes; 2728 } 2729 sptr -= pixel_bytes; 2730 } 2731 } 2732 2733 } /* end if (MMX not supported) */ 2734 break; 2735 } 2736 } /* end switch (row_info->pixel_depth) */ 2737 2738 row_info->width = final_width; 2739 2740 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width); 2741 } 2742 2743} /* end png_do_read_interlace() */ 2744 2745#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */ 2746#endif /* PNG_READ_INTERLACING_SUPPORTED */ 2747 2748 2749 2750#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW) 2751#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 2752 2753/* These variables are utilized in the functions below. They are declared */ 2754/* globally here to ensure alignment on 8-byte boundaries. */ 2755 2756union uAll { 2757 long long use; 2758 double align; 2759} _LBCarryMask = {0x0101010101010101LL}, 2760 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL}, 2761 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem; 2762 2763#ifdef PNG_THREAD_UNSAFE_OK 2764/*===========================================================================*/ 2765/* */ 2766/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G */ 2767/* */ 2768/*===========================================================================*/ 2769 2770/* Optimized code for PNG Average filter decoder */ 2771 2772static void /* PRIVATE */ 2773png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row, 2774 png_bytep prev_row) 2775{ 2776 int bpp; 2777 int dummy_value_c; /* fix 'forbidden register 2 (cx) was spilled' error */ 2778 int dummy_value_S; 2779 int dummy_value_D; 2780 2781 bpp = (row_info->pixel_depth + 7) >> 3; /* get # bytes per pixel */ 2782 _FullLength = row_info->rowbytes; /* # of bytes to filter */ 2783 2784 __asm__ __volatile__ ( 2785 /* initialize address pointers and offset */ 2786#ifdef __PIC__ 2787 "pushl %%ebx \n\t" /* save index to Global Offset Table */ 2788#endif 2789/*pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */ 2790 "xorl %%ebx, %%ebx \n\t" /* ebx: x */ 2791 "movl %%edi, %%edx \n\t" 2792/*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */ 2793/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */ 2794 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */ 2795 2796 "xorl %%eax,%%eax \n\t" 2797 2798 /* Compute the Raw value for the first bpp bytes */ 2799 /* Raw(x) = Avg(x) + (Prior(x)/2) */ 2800 "avg_rlp: \n\t" 2801 "movb (%%esi,%%ebx,),%%al \n\t" /* load al with Prior(x) */ 2802 "incl %%ebx \n\t" 2803 "shrb %%al \n\t" /* divide by 2 */ 2804 "addb -1(%%edi,%%ebx,),%%al \n\t" /* add Avg(x); -1 to offset inc ebx */ 2805/* pre "cmpl bpp, %%ebx \n\t" */ /* (bpp is preloaded into ecx) */ 2806 "cmpl %%ecx, %%ebx \n\t" 2807 "movb %%al,-1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */ 2808 "jb avg_rlp \n\t" /* mov does not affect flags */ 2809 2810 /* get # of bytes to alignment */ 2811 "movl %%edi, _dif \n\t" /* take start of row */ 2812 "addl %%ebx, _dif \n\t" /* add bpp */ 2813 "addl $0xf, _dif \n\t" /* add 7+8 to incr past alignment bdry */ 2814 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */ 2815 "subl %%edi, _dif \n\t" /* subtract from start => value ebx at */ 2816 "jz avg_go \n\t" /* alignment */ 2817 2818 /* fix alignment */ 2819 /* Compute the Raw value for the bytes up to the alignment boundary */ 2820 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */ 2821 "xorl %%ecx, %%ecx \n\t" 2822 2823 "avg_lp1: \n\t" 2824 "xorl %%eax, %%eax \n\t" 2825 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */ 2826 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */ 2827 "addw %%cx, %%ax \n\t" 2828 "incl %%ebx \n\t" 2829 "shrw %%ax \n\t" /* divide by 2 */ 2830 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */ 2831 "cmpl _dif, %%ebx \n\t" /* check if at alignment boundary */ 2832 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */ 2833 "jb avg_lp1 \n\t" /* repeat until at alignment boundary */ 2834 2835 "avg_go: \n\t" 2836 "movl _FullLength, %%eax \n\t" 2837 "movl %%eax, %%ecx \n\t" 2838 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */ 2839 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */ 2840 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */ 2841 "movl %%ecx, _MMXLength \n\t" 2842#ifdef __PIC__ 2843 "popl %%ebx \n\t" /* restore index to Global Offset Table */ 2844#endif 2845 2846 : "=c" (dummy_value_c), /* output regs (dummy) */ 2847 "=S" (dummy_value_S), 2848 "=D" (dummy_value_D) 2849 2850 : "0" (bpp), /* ecx // input regs */ 2851 "1" (prev_row), /* esi */ 2852 "2" (row) /* edi */ 2853 2854 : "%eax", "%edx" /* clobber list */ 2855#ifndef __PIC__ 2856 , "%ebx" 2857#endif 2858 /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */ 2859 /* (seems to work fine without...) */ 2860 ); 2861 2862 /* now do the math for the rest of the row */ 2863 switch (bpp) 2864 { 2865 case 3: 2866 { 2867 _ActiveMask.use = 0x0000000000ffffffLL; 2868 _ShiftBpp.use = 24; /* == 3 * 8 */ 2869 _ShiftRem.use = 40; /* == 64 - 24 */ 2870 2871 __asm__ __volatile__ ( 2872 /* re-init address pointers and offset */ 2873 "movq _ActiveMask, %%mm7 \n\t" 2874 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */ 2875 "movq _LBCarryMask, %%mm5 \n\t" /* alignment boundary */ 2876/* preload "movl row, %%edi \n\t" // edi: Avg(x) */ 2877 "movq _HBClearMask, %%mm4 \n\t" 2878/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */ 2879 2880 /* prime the pump: load the first Raw(x-bpp) data set */ 2881 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */ 2882 /* (correct pos. in loop below) */ 2883 "avg_3lp: \n\t" 2884 "movq (%%edi,%%ecx,), %%mm0 \n\t" /* load mm0 with Avg(x) */ 2885 "movq %%mm5, %%mm3 \n\t" 2886 "psrlq _ShiftRem, %%mm2 \n\t" /* correct position Raw(x-bpp) */ 2887 /* data */ 2888 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* load mm1 with Prior(x) */ 2889 "movq %%mm7, %%mm6 \n\t" 2890 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */ 2891 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */ 2892 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */ 2893 /* byte */ 2894 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */ 2895 /* each byte */ 2896 /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */ 2897 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 2898 /* LBCarrys */ 2899 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 2900 /* where both */ 2901 /* lsb's were == 1 (only valid for active group) */ 2902 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 2903 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 2904 /* byte */ 2905 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 2906 /* for each byte */ 2907 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */ 2908 /* bytes to add to Avg */ 2909 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 2910 /* Avg for each Active */ 2911 /* byte */ 2912 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */ 2913 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */ 2914 /* bytes 3-5 */ 2915 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 2916 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 2917 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 2918 /* LBCarrys */ 2919 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 2920 /* where both */ 2921 /* lsb's were == 1 (only valid for active group) */ 2922 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 2923 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 2924 /* byte */ 2925 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 2926 /* for each byte */ 2927 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 2928 /* bytes to add to Avg */ 2929 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 2930 /* Avg for each Active */ 2931 /* byte */ 2932 2933 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */ 2934 "psllq _ShiftBpp, %%mm6 \n\t" /* shift mm6 mask to cover last */ 2935 /* two */ 2936 /* bytes */ 2937 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 2938 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 2939 /* Data only needs to be shifted once here to */ 2940 /* get the correct x-bpp offset. */ 2941 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 2942 /* LBCarrys */ 2943 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 2944 /* where both */ 2945 /* lsb's were == 1 (only valid for active group) */ 2946 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 2947 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 2948 /* byte */ 2949 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 2950 /* for each byte */ 2951 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 2952 /* bytes to add to Avg */ 2953 "addl $8, %%ecx \n\t" 2954 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 2955 /* Avg for each Active */ 2956 /* byte */ 2957 /* now ready to write back to memory */ 2958 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" 2959 /* move updated Raw(x) to use as Raw(x-bpp) for next loop */ 2960 "cmpl _MMXLength, %%ecx \n\t" 2961 "movq %%mm0, %%mm2 \n\t" /* mov updated Raw(x) to mm2 */ 2962 "jb avg_3lp \n\t" 2963 2964 : "=S" (dummy_value_S), /* output regs (dummy) */ 2965 "=D" (dummy_value_D) 2966 2967 : "0" (prev_row), /* esi // input regs */ 2968 "1" (row) /* edi */ 2969 2970 : "%ecx" /* clobber list */ 2971#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 2972 , "%mm0", "%mm1", "%mm2", "%mm3" 2973 , "%mm4", "%mm5", "%mm6", "%mm7" 2974#endif 2975 ); 2976 } 2977 break; /* end 3 bpp */ 2978 2979 case 6: 2980 case 4: 2981 //case 7: /* who wrote this? PNG doesn't support 5 or 7 bytes/pixel */ 2982 //case 5: /* GRR BOGUS */ 2983 { 2984 _ActiveMask.use = 0xffffffffffffffffLL; /* use shift below to clear */ 2985 /* appropriate inactive bytes */ 2986 _ShiftBpp.use = bpp << 3; 2987 _ShiftRem.use = 64 - _ShiftBpp.use; 2988 2989 __asm__ __volatile__ ( 2990 "movq _HBClearMask, %%mm4 \n\t" 2991 2992 /* re-init address pointers and offset */ 2993 "movl _dif, %%ecx \n\t" /* ecx: x = offset to */ 2994 /* alignment boundary */ 2995 2996 /* load _ActiveMask and clear all bytes except for 1st active group */ 2997 "movq _ActiveMask, %%mm7 \n\t" 2998/* preload "movl row, %%edi \n\t" // edi: Avg(x) */ 2999 "psrlq _ShiftRem, %%mm7 \n\t" 3000/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */ 3001 "movq %%mm7, %%mm6 \n\t" 3002 "movq _LBCarryMask, %%mm5 \n\t" 3003 "psllq _ShiftBpp, %%mm6 \n\t" /* create mask for 2nd active */ 3004 /* group */ 3005 3006 /* prime the pump: load the first Raw(x-bpp) data set */ 3007 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */ 3008 /* (we correct pos. in loop below) */ 3009 "avg_4lp: \n\t" 3010 "movq (%%edi,%%ecx,), %%mm0 \n\t" 3011 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */ 3012 "movq (%%esi,%%ecx,), %%mm1 \n\t" 3013 /* add (Prev_row/2) to average */ 3014 "movq %%mm5, %%mm3 \n\t" 3015 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */ 3016 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */ 3017 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */ 3018 /* byte */ 3019 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */ 3020 /* each byte */ 3021 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */ 3022 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3023 /* LBCarrys */ 3024 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3025 /* where both */ 3026 /* lsb's were == 1 (only valid for active group) */ 3027 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3028 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3029 /* byte */ 3030 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3031 /* for each byte */ 3032 "pand %%mm7, %%mm2 \n\t" /* leave only Active Group 1 */ 3033 /* bytes to add to Avg */ 3034 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */ 3035 /* for each Active */ 3036 /* byte */ 3037 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */ 3038 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3039 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 3040 "addl $8, %%ecx \n\t" 3041 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3042 /* LBCarrys */ 3043 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3044 /* where both */ 3045 /* lsb's were == 1 (only valid for active group) */ 3046 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3047 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3048 /* byte */ 3049 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3050 /* for each byte */ 3051 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 3052 /* bytes to add to Avg */ 3053 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 3054 /* Avg for each Active */ 3055 /* byte */ 3056 "cmpl _MMXLength, %%ecx \n\t" 3057 /* now ready to write back to memory */ 3058 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" 3059 /* prep Raw(x-bpp) for next loop */ 3060 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3061 "jb avg_4lp \n\t" 3062 3063 : "=S" (dummy_value_S), /* output regs (dummy) */ 3064 "=D" (dummy_value_D) 3065 3066 : "0" (prev_row), /* esi // input regs */ 3067 "1" (row) /* edi */ 3068 3069 : "%ecx" /* clobber list */ 3070#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 3071 , "%mm0", "%mm1", "%mm2", "%mm3" 3072 , "%mm4", "%mm5", "%mm6", "%mm7" 3073#endif 3074 ); 3075 } 3076 break; /* end 4,6 bpp */ 3077 3078 case 2: 3079 { 3080 _ActiveMask.use = 0x000000000000ffffLL; 3081 _ShiftBpp.use = 16; /* == 2 * 8 */ 3082 _ShiftRem.use = 48; /* == 64 - 16 */ 3083 3084 __asm__ __volatile__ ( 3085 /* load _ActiveMask */ 3086 "movq _ActiveMask, %%mm7 \n\t" 3087 /* re-init address pointers and offset */ 3088 "movl _dif, %%ecx \n\t" /* ecx: x = offset to alignment */ 3089 /* boundary */ 3090 "movq _LBCarryMask, %%mm5 \n\t" 3091/* preload "movl row, %%edi \n\t" // edi: Avg(x) */ 3092 "movq _HBClearMask, %%mm4 \n\t" 3093/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */ 3094 3095 /* prime the pump: load the first Raw(x-bpp) data set */ 3096 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */ 3097 /* (we correct pos. in loop below) */ 3098 "avg_2lp: \n\t" 3099 "movq (%%edi,%%ecx,), %%mm0 \n\t" 3100 "psrlq _ShiftRem, %%mm2 \n\t" /* shift data to pos. correctly */ 3101 "movq (%%esi,%%ecx,), %%mm1 \n\t" /* (GRR BUGFIX: was psllq) */ 3102 /* add (Prev_row/2) to average */ 3103 "movq %%mm5, %%mm3 \n\t" 3104 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */ 3105 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */ 3106 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */ 3107 /* byte */ 3108 "movq %%mm7, %%mm6 \n\t" 3109 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */ 3110 /* each byte */ 3111 3112 /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */ 3113 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3114 /* LBCarrys */ 3115 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3116 /* where both */ 3117 /* lsb's were == 1 (only valid */ 3118 /* for active group) */ 3119 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3120 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3121 /* byte */ 3122 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3123 /* for each byte */ 3124 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 1 */ 3125 /* bytes to add to Avg */ 3126 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to Avg */ 3127 /* for each Active byte */ 3128 3129 /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */ 3130 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */ 3131 /* bytes 2 & 3 */ 3132 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3133 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 3134 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3135 /* LBCarrys */ 3136 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3137 /* where both */ 3138 /* lsb's were == 1 (only valid */ 3139 /* for active group) */ 3140 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3141 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3142 /* byte */ 3143 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3144 /* for each byte */ 3145 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 3146 /* bytes to add to Avg */ 3147 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 3148 /* Avg for each Active byte */ 3149 3150 /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */ 3151 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */ 3152 /* bytes 4 & 5 */ 3153 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3154 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 3155 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3156 /* LBCarrys */ 3157 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3158 /* where both lsb's were == 1 */ 3159 /* (only valid for active group) */ 3160 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3161 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3162 /* byte */ 3163 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3164 /* for each byte */ 3165 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 3166 /* bytes to add to Avg */ 3167 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 3168 /* Avg for each Active byte */ 3169 3170 /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */ 3171 "psllq _ShiftBpp, %%mm6 \n\t" /* shift the mm6 mask to cover */ 3172 /* bytes 6 & 7 */ 3173 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3174 "psllq _ShiftBpp, %%mm2 \n\t" /* shift data to pos. correctly */ 3175 "addl $8, %%ecx \n\t" 3176 "movq %%mm3, %%mm1 \n\t" /* now use mm1 for getting */ 3177 /* LBCarrys */ 3178 "pand %%mm2, %%mm1 \n\t" /* get LBCarrys for each byte */ 3179 /* where both */ 3180 /* lsb's were == 1 (only valid */ 3181 /* for active group) */ 3182 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3183 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3184 /* byte */ 3185 "paddb %%mm1, %%mm2 \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */ 3186 /* for each byte */ 3187 "pand %%mm6, %%mm2 \n\t" /* leave only Active Group 2 */ 3188 /* bytes to add to Avg */ 3189 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) + LBCarrys to */ 3190 /* Avg for each Active byte */ 3191 3192 "cmpl _MMXLength, %%ecx \n\t" 3193 /* now ready to write back to memory */ 3194 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" 3195 /* prep Raw(x-bpp) for next loop */ 3196 "movq %%mm0, %%mm2 \n\t" /* mov updated Raws to mm2 */ 3197 "jb avg_2lp \n\t" 3198 3199 : "=S" (dummy_value_S), /* output regs (dummy) */ 3200 "=D" (dummy_value_D) 3201 3202 : "0" (prev_row), /* esi // input regs */ 3203 "1" (row) /* edi */ 3204 3205 : "%ecx" /* clobber list */ 3206#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 3207 , "%mm0", "%mm1", "%mm2", "%mm3" 3208 , "%mm4", "%mm5", "%mm6", "%mm7" 3209#endif 3210 ); 3211 } 3212 break; /* end 2 bpp */ 3213 3214 case 1: 3215 { 3216 __asm__ __volatile__ ( 3217 /* re-init address pointers and offset */ 3218#ifdef __PIC__ 3219 "pushl %%ebx \n\t" /* save Global Offset Table index */ 3220#endif 3221 "movl _dif, %%ebx \n\t" /* ebx: x = offset to alignment */ 3222 /* boundary */ 3223/* preload "movl row, %%edi \n\t" // edi: Avg(x) */ 3224 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */ 3225 "jnb avg_1end \n\t" 3226 /* do Paeth decode for remaining bytes */ 3227/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */ 3228 "movl %%edi, %%edx \n\t" 3229/* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */ 3230 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */ 3231 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */ 3232 /* in loop below */ 3233 "avg_1lp: \n\t" 3234 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */ 3235 "xorl %%eax, %%eax \n\t" 3236 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */ 3237 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */ 3238 "addw %%cx, %%ax \n\t" 3239 "incl %%ebx \n\t" 3240 "shrw %%ax \n\t" /* divide by 2 */ 3241 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset */ 3242 /* inc ebx */ 3243 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */ 3244 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */ 3245 /* mov does not affect flags; -1 to offset inc ebx */ 3246 "jb avg_1lp \n\t" 3247 3248 "avg_1end: \n\t" 3249#ifdef __PIC__ 3250 "popl %%ebx \n\t" /* Global Offset Table index */ 3251#endif 3252 3253 : "=c" (dummy_value_c), /* output regs (dummy) */ 3254 "=S" (dummy_value_S), 3255 "=D" (dummy_value_D) 3256 3257 : "0" (bpp), /* ecx // input regs */ 3258 "1" (prev_row), /* esi */ 3259 "2" (row) /* edi */ 3260 3261 : "%eax", "%edx" /* clobber list */ 3262#ifndef __PIC__ 3263 , "%ebx" 3264#endif 3265 ); 3266 } 3267 return; /* end 1 bpp */ 3268 3269 case 8: 3270 { 3271 __asm__ __volatile__ ( 3272 /* re-init address pointers and offset */ 3273 "movl _dif, %%ecx \n\t" /* ecx: x == offset to alignment */ 3274 "movq _LBCarryMask, %%mm5 \n\t" /* boundary */ 3275/* preload "movl row, %%edi \n\t" // edi: Avg(x) */ 3276 "movq _HBClearMask, %%mm4 \n\t" 3277/* preload "movl prev_row, %%esi \n\t" // esi: Prior(x) */ 3278 3279 /* prime the pump: load the first Raw(x-bpp) data set */ 3280 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */ 3281 /* (NO NEED to correct pos. in loop below) */ 3282 3283 "avg_8lp: \n\t" 3284 "movq (%%edi,%%ecx,), %%mm0 \n\t" 3285 "movq %%mm5, %%mm3 \n\t" 3286 "movq (%%esi,%%ecx,), %%mm1 \n\t" 3287 "addl $8, %%ecx \n\t" 3288 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */ 3289 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */ 3290 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */ 3291 /* where both lsb's were == 1 */ 3292 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3293 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7, each byte */ 3294 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg, each byte */ 3295 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7, each byte */ 3296 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg, each */ 3297 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */ 3298 "cmpl _MMXLength, %%ecx \n\t" 3299 "movq %%mm0, -8(%%edi,%%ecx,) \n\t" 3300 "movq %%mm0, %%mm2 \n\t" /* reuse as Raw(x-bpp) */ 3301 "jb avg_8lp \n\t" 3302 3303 : "=S" (dummy_value_S), /* output regs (dummy) */ 3304 "=D" (dummy_value_D) 3305 3306 : "0" (prev_row), /* esi // input regs */ 3307 "1" (row) /* edi */ 3308 3309 : "%ecx" /* clobber list */ 3310#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */ 3311 , "%mm0", "%mm1", "%mm2" 3312 , "%mm3", "%mm4", "%mm5" 3313#endif 3314 ); 3315 } 3316 break; /* end 8 bpp */ 3317 3318 default: /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */ 3319 { 3320 3321#ifdef PNG_DEBUG 3322 /* GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED */ 3323 png_debug(1, 3324 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n"); 3325#endif 3326 3327#if 0 3328 __asm__ __volatile__ ( 3329 "movq _LBCarryMask, %%mm5 \n\t" 3330 /* re-init address pointers and offset */ 3331 "movl _dif, %%ebx \n\t" /* ebx: x = offset to */ 3332 /* alignment boundary */ 3333 "movl row, %%edi \n\t" /* edi: Avg(x) */ 3334 "movq _HBClearMask, %%mm4 \n\t" 3335 "movl %%edi, %%edx \n\t" 3336 "movl prev_row, %%esi \n\t" /* esi: Prior(x) */ 3337 "subl bpp, %%edx \n\t" /* edx: Raw(x-bpp) */ 3338 "avg_Alp: \n\t" 3339 "movq (%%edi,%%ebx,), %%mm0 \n\t" 3340 "movq %%mm5, %%mm3 \n\t" 3341 "movq (%%esi,%%ebx,), %%mm1 \n\t" 3342 "pand %%mm1, %%mm3 \n\t" /* get lsb for each prev_row byte */ 3343 "movq (%%edx,%%ebx,), %%mm2 \n\t" 3344 "psrlq $1, %%mm1 \n\t" /* divide prev_row bytes by 2 */ 3345 "pand %%mm2, %%mm3 \n\t" /* get LBCarrys for each byte */ 3346 /* where both lsb's were == 1 */ 3347 "psrlq $1, %%mm2 \n\t" /* divide raw bytes by 2 */ 3348 "pand %%mm4, %%mm1 \n\t" /* clear invalid bit 7 of each */ 3349 /* byte */ 3350 "paddb %%mm3, %%mm0 \n\t" /* add LBCarrys to Avg for each */ 3351 /* byte */ 3352 "pand %%mm4, %%mm2 \n\t" /* clear invalid bit 7 of each */ 3353 /* byte */ 3354 "paddb %%mm1, %%mm0 \n\t" /* add (Prev_row/2) to Avg for */ 3355 /* each byte */ 3356 "addl $8, %%ebx \n\t" 3357 "paddb %%mm2, %%mm0 \n\t" /* add (Raw/2) to Avg for each */ 3358 /* byte */ 3359 "cmpl _MMXLength, %%ebx \n\t" 3360 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" 3361 "jb avg_Alp \n\t" 3362 3363 : /* FIXASM: output regs/vars go here, e.g.: "=m" (memory_var) */ 3364 3365 : /* FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest) */ 3366 3367 : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */ 3368 ); 3369#endif /* 0 - NEVER REACHED */ 3370 } 3371 break; 3372 3373 } /* end switch (bpp) */ 3374 3375 __asm__ __volatile__ ( 3376 /* MMX acceleration complete; now do clean-up */ 3377 /* check if any remaining bytes left to decode */ 3378#ifdef __PIC__ 3379 "pushl %%ebx \n\t" /* save index to Global Offset Table */ 3380#endif 3381 "movl _MMXLength, %%ebx \n\t" /* ebx: x == offset bytes after MMX */ 3382/* pre "movl row, %%edi \n\t" */ /* edi: Avg(x) */ 3383 "cmpl _FullLength, %%ebx \n\t" /* test if offset at end of array */ 3384 "jnb avg_end \n\t" 3385 3386 /* do Avg decode for remaining bytes */ 3387/*pre "movl prev_row, %%esi \n\t" */ /* esi: Prior(x) */ 3388 "movl %%edi, %%edx \n\t" 3389/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */ 3390 "subl %%ecx, %%edx \n\t" /* edx: Raw(x-bpp) */ 3391 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */ 3392 3393 "avg_lp2: \n\t" 3394 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */ 3395 "xorl %%eax, %%eax \n\t" 3396 "movb (%%esi,%%ebx,), %%cl \n\t" /* load cl with Prior(x) */ 3397 "movb (%%edx,%%ebx,), %%al \n\t" /* load al with Raw(x-bpp) */ 3398 "addw %%cx, %%ax \n\t" 3399 "incl %%ebx \n\t" 3400 "shrw %%ax \n\t" /* divide by 2 */ 3401 "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */ 3402 "cmpl _FullLength, %%ebx \n\t" /* check if at end of array */ 3403 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */ 3404 "jb avg_lp2 \n\t" /* affect flags; -1 to offset inc ebx] */ 3405 3406 "avg_end: \n\t" 3407 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */ 3408#ifdef __PIC__ 3409 "popl %%ebx \n\t" /* restore index to Global Offset Table */ 3410#endif 3411 3412 : "=c" (dummy_value_c), /* output regs (dummy) */ 3413 "=S" (dummy_value_S), 3414 "=D" (dummy_value_D) 3415 3416 : "0" (bpp), /* ecx // input regs */ 3417 "1" (prev_row), /* esi */ 3418 "2" (row) /* edi */ 3419 3420 : "%eax", "%edx" /* clobber list */ 3421#ifndef __PIC__ 3422 , "%ebx" 3423#endif 3424 ); 3425 3426} /* end png_read_filter_row_mmx_avg() */ 3427#endif 3428 3429 3430 3431#ifdef PNG_THREAD_UNSAFE_OK 3432/*===========================================================================*/ 3433/* */ 3434/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H */ 3435/* */ 3436/*===========================================================================*/ 3437 3438/* Optimized code for PNG Paeth filter decoder */ 3439 3440static void /* PRIVATE */ 3441png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row, 3442 png_bytep prev_row) 3443{ 3444 int bpp; 3445 int dummy_value_c; /* fix 'forbidden register 2 (cx) was spilled' error */ 3446 int dummy_value_S; 3447 int dummy_value_D; 3448 3449 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */ 3450 _FullLength = row_info->rowbytes; /* # of bytes to filter */ 3451 3452 __asm__ __volatile__ ( 3453#ifdef __PIC__ 3454 "pushl %%ebx \n\t" /* save index to Global Offset Table */ 3455#endif 3456 "xorl %%ebx, %%ebx \n\t" /* ebx: x offset */ 3457/*pre "movl row, %%edi \n\t" */ 3458 "xorl %%edx, %%edx \n\t" /* edx: x-bpp offset */ 3459/*pre "movl prev_row, %%esi \n\t" */ 3460 "xorl %%eax, %%eax \n\t" 3461 3462 /* Compute the Raw value for the first bpp bytes */ 3463 /* Note: the formula works out to be always */ 3464 /* Paeth(x) = Raw(x) + Prior(x) where x < bpp */ 3465 "paeth_rlp: \n\t" 3466 "movb (%%edi,%%ebx,), %%al \n\t" 3467 "addb (%%esi,%%ebx,), %%al \n\t" 3468 "incl %%ebx \n\t" 3469/*pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx) */ 3470 "cmpl %%ecx, %%ebx \n\t" 3471 "movb %%al, -1(%%edi,%%ebx,) \n\t" 3472 "jb paeth_rlp \n\t" 3473 /* get # of bytes to alignment */ 3474 "movl %%edi, _dif \n\t" /* take start of row */ 3475 "addl %%ebx, _dif \n\t" /* add bpp */ 3476 "xorl %%ecx, %%ecx \n\t" 3477 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past alignment */ 3478 /* boundary */ 3479 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */ 3480 "subl %%edi, _dif \n\t" /* subtract from start ==> value ebx */ 3481 /* at alignment */ 3482 "jz paeth_go \n\t" 3483 /* fix alignment */ 3484 3485 "paeth_lp1: \n\t" 3486 "xorl %%eax, %%eax \n\t" 3487 /* pav = p - a = (a + b - c) - a = b - c */ 3488 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */ 3489 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 3490 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 3491 "movl %%eax, _patemp \n\t" /* Save pav for later use */ 3492 "xorl %%eax, %%eax \n\t" 3493 /* pbv = p - b = (a + b - c) - b = a - c */ 3494 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */ 3495 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 3496 "movl %%eax, %%ecx \n\t" 3497 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3498 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */ 3499 /* pc = abs(pcv) */ 3500 "testl $0x80000000, %%eax \n\t" 3501 "jz paeth_pca \n\t" 3502 "negl %%eax \n\t" /* reverse sign of neg values */ 3503 3504 "paeth_pca: \n\t" 3505 "movl %%eax, _pctemp \n\t" /* save pc for later use */ 3506 /* pb = abs(pbv) */ 3507 "testl $0x80000000, %%ecx \n\t" 3508 "jz paeth_pba \n\t" 3509 "negl %%ecx \n\t" /* reverse sign of neg values */ 3510 3511 "paeth_pba: \n\t" 3512 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */ 3513 /* pa = abs(pav) */ 3514 "movl _patemp, %%eax \n\t" 3515 "testl $0x80000000, %%eax \n\t" 3516 "jz paeth_paa \n\t" 3517 "negl %%eax \n\t" /* reverse sign of neg values */ 3518 3519 "paeth_paa: \n\t" 3520 "movl %%eax, _patemp \n\t" /* save pa for later use */ 3521 /* test if pa <= pb */ 3522 "cmpl %%ecx, %%eax \n\t" 3523 "jna paeth_abb \n\t" 3524 /* pa > pb; now test if pb <= pc */ 3525 "cmpl _pctemp, %%ecx \n\t" 3526 "jna paeth_bbc \n\t" 3527 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 3528 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 3529 "jmp paeth_paeth \n\t" 3530 3531 "paeth_bbc: \n\t" 3532 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */ 3533 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */ 3534 "jmp paeth_paeth \n\t" 3535 3536 "paeth_abb: \n\t" 3537 /* pa <= pb; now test if pa <= pc */ 3538 "cmpl _pctemp, %%eax \n\t" 3539 "jna paeth_abc \n\t" 3540 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 3541 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 3542 "jmp paeth_paeth \n\t" 3543 3544 "paeth_abc: \n\t" 3545 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */ 3546 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */ 3547 3548 "paeth_paeth: \n\t" 3549 "incl %%ebx \n\t" 3550 "incl %%edx \n\t" 3551 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */ 3552 "addb %%cl, -1(%%edi,%%ebx,) \n\t" 3553 "cmpl _dif, %%ebx \n\t" 3554 "jb paeth_lp1 \n\t" 3555 3556 "paeth_go: \n\t" 3557 "movl _FullLength, %%ecx \n\t" 3558 "movl %%ecx, %%eax \n\t" 3559 "subl %%ebx, %%eax \n\t" /* subtract alignment fix */ 3560 "andl $0x00000007, %%eax \n\t" /* calc bytes over mult of 8 */ 3561 "subl %%eax, %%ecx \n\t" /* drop over bytes from original length */ 3562 "movl %%ecx, _MMXLength \n\t" 3563#ifdef __PIC__ 3564 "popl %%ebx \n\t" /* restore index to Global Offset Table */ 3565#endif 3566 3567 : "=c" (dummy_value_c), /* output regs (dummy) */ 3568 "=S" (dummy_value_S), 3569 "=D" (dummy_value_D) 3570 3571 : "0" (bpp), /* ecx // input regs */ 3572 "1" (prev_row), /* esi */ 3573 "2" (row) /* edi */ 3574 3575 : "%eax", "%edx" /* clobber list */ 3576#ifndef __PIC__ 3577 , "%ebx" 3578#endif 3579 ); 3580 3581 /* now do the math for the rest of the row */ 3582 switch (bpp) 3583 { 3584 case 3: 3585 { 3586 _ActiveMask.use = 0x0000000000ffffffLL; 3587 _ActiveMaskEnd.use = 0xffff000000000000LL; 3588 _ShiftBpp.use = 24; /* == bpp(3) * 8 */ 3589 _ShiftRem.use = 40; /* == 64 - 24 */ 3590 3591 __asm__ __volatile__ ( 3592 "movl _dif, %%ecx \n\t" 3593/* preload "movl row, %%edi \n\t" */ 3594/* preload "movl prev_row, %%esi \n\t" */ 3595 "pxor %%mm0, %%mm0 \n\t" 3596 /* prime the pump: load the first Raw(x-bpp) data set */ 3597 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 3598 "paeth_3lp: \n\t" 3599 "psrlq _ShiftRem, %%mm1 \n\t" /* shift last 3 bytes to 1st */ 3600 /* 3 bytes */ 3601 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 3602 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */ 3603 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */ 3604 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 3605 "psrlq _ShiftRem, %%mm3 \n\t" /* shift last 3 bytes to 1st */ 3606 /* 3 bytes */ 3607 /* pav = p - a = (a + b - c) - a = b - c */ 3608 "movq %%mm2, %%mm4 \n\t" 3609 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 3610 /* pbv = p - b = (a + b - c) - b = a - c */ 3611 "movq %%mm1, %%mm5 \n\t" 3612 "psubw %%mm3, %%mm4 \n\t" 3613 "pxor %%mm7, %%mm7 \n\t" 3614 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3615 "movq %%mm4, %%mm6 \n\t" 3616 "psubw %%mm3, %%mm5 \n\t" 3617 3618 /* pa = abs(p-a) = abs(pav) */ 3619 /* pb = abs(p-b) = abs(pbv) */ 3620 /* pc = abs(p-c) = abs(pcv) */ 3621 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 3622 "paddw %%mm5, %%mm6 \n\t" 3623 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3624 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 3625 "psubw %%mm0, %%mm4 \n\t" 3626 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 3627 "psubw %%mm0, %%mm4 \n\t" 3628 "psubw %%mm7, %%mm5 \n\t" 3629 "pxor %%mm0, %%mm0 \n\t" 3630 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 3631 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3632 "psubw %%mm7, %%mm5 \n\t" 3633 "psubw %%mm0, %%mm6 \n\t" 3634 /* test pa <= pb */ 3635 "movq %%mm4, %%mm7 \n\t" 3636 "psubw %%mm0, %%mm6 \n\t" 3637 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 3638 "movq %%mm7, %%mm0 \n\t" 3639 /* use mm7 mask to merge pa & pb */ 3640 "pand %%mm7, %%mm5 \n\t" 3641 /* use mm0 mask copy to merge a & b */ 3642 "pand %%mm0, %%mm2 \n\t" 3643 "pandn %%mm4, %%mm7 \n\t" 3644 "pandn %%mm1, %%mm0 \n\t" 3645 "paddw %%mm5, %%mm7 \n\t" 3646 "paddw %%mm2, %%mm0 \n\t" 3647 /* test ((pa <= pb)? pa:pb) <= pc */ 3648 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 3649 "pxor %%mm1, %%mm1 \n\t" 3650 "pand %%mm7, %%mm3 \n\t" 3651 "pandn %%mm0, %%mm7 \n\t" 3652 "paddw %%mm3, %%mm7 \n\t" 3653 "pxor %%mm0, %%mm0 \n\t" 3654 "packuswb %%mm1, %%mm7 \n\t" 3655 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */ 3656 "pand _ActiveMask, %%mm7 \n\t" 3657 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */ 3658 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */ 3659 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 3660 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */ 3661 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as */ 3662 /* Raw(x-bpp) */ 3663 /* now do Paeth for 2nd set of bytes (3-5) */ 3664 "psrlq _ShiftBpp, %%mm2 \n\t" /* load b=Prior(x) step 2 */ 3665 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */ 3666 "pxor %%mm7, %%mm7 \n\t" 3667 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 3668 /* pbv = p - b = (a + b - c) - b = a - c */ 3669 "movq %%mm1, %%mm5 \n\t" 3670 /* pav = p - a = (a + b - c) - a = b - c */ 3671 "movq %%mm2, %%mm4 \n\t" 3672 "psubw %%mm3, %%mm5 \n\t" 3673 "psubw %%mm3, %%mm4 \n\t" 3674 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */ 3675 /* pav + pbv = pbv + pav */ 3676 "movq %%mm5, %%mm6 \n\t" 3677 "paddw %%mm4, %%mm6 \n\t" 3678 3679 /* pa = abs(p-a) = abs(pav) */ 3680 /* pb = abs(p-b) = abs(pbv) */ 3681 /* pc = abs(p-c) = abs(pcv) */ 3682 "pcmpgtw %%mm5, %%mm0 \n\t" /* create mask pbv bytes < 0 */ 3683 "pcmpgtw %%mm4, %%mm7 \n\t" /* create mask pav bytes < 0 */ 3684 "pand %%mm5, %%mm0 \n\t" /* only pbv bytes < 0 in mm0 */ 3685 "pand %%mm4, %%mm7 \n\t" /* only pav bytes < 0 in mm7 */ 3686 "psubw %%mm0, %%mm5 \n\t" 3687 "psubw %%mm7, %%mm4 \n\t" 3688 "psubw %%mm0, %%mm5 \n\t" 3689 "psubw %%mm7, %%mm4 \n\t" 3690 "pxor %%mm0, %%mm0 \n\t" 3691 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 3692 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3693 "psubw %%mm0, %%mm6 \n\t" 3694 /* test pa <= pb */ 3695 "movq %%mm4, %%mm7 \n\t" 3696 "psubw %%mm0, %%mm6 \n\t" 3697 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 3698 "movq %%mm7, %%mm0 \n\t" 3699 /* use mm7 mask to merge pa & pb */ 3700 "pand %%mm7, %%mm5 \n\t" 3701 /* use mm0 mask copy to merge a & b */ 3702 "pand %%mm0, %%mm2 \n\t" 3703 "pandn %%mm4, %%mm7 \n\t" 3704 "pandn %%mm1, %%mm0 \n\t" 3705 "paddw %%mm5, %%mm7 \n\t" 3706 "paddw %%mm2, %%mm0 \n\t" 3707 /* test ((pa <= pb)? pa:pb) <= pc */ 3708 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 3709 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 3710 "pand %%mm7, %%mm3 \n\t" 3711 "pandn %%mm0, %%mm7 \n\t" 3712 "pxor %%mm1, %%mm1 \n\t" 3713 "paddw %%mm3, %%mm7 \n\t" 3714 "pxor %%mm0, %%mm0 \n\t" 3715 "packuswb %%mm1, %%mm7 \n\t" 3716 "movq %%mm2, %%mm3 \n\t" /* load c=Prior(x-bpp) step 1 */ 3717 "pand _ActiveMask, %%mm7 \n\t" 3718 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 3719 "psllq _ShiftBpp, %%mm7 \n\t" /* shift bytes to 2nd group of */ 3720 /* 3 bytes */ 3721 /* pav = p - a = (a + b - c) - a = b - c */ 3722 "movq %%mm2, %%mm4 \n\t" 3723 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */ 3724 "psllq _ShiftBpp, %%mm3 \n\t" /* load c=Prior(x-bpp) step 2 */ 3725 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */ 3726 "movq %%mm7, %%mm1 \n\t" 3727 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 3728 "psllq _ShiftBpp, %%mm1 \n\t" /* shift bytes */ 3729 /* now mm1 will be used as Raw(x-bpp) */ 3730 /* now do Paeth for 3rd, and final, set of bytes (6-7) */ 3731 "pxor %%mm7, %%mm7 \n\t" 3732 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */ 3733 "psubw %%mm3, %%mm4 \n\t" 3734 /* pbv = p - b = (a + b - c) - b = a - c */ 3735 "movq %%mm1, %%mm5 \n\t" 3736 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3737 "movq %%mm4, %%mm6 \n\t" 3738 "psubw %%mm3, %%mm5 \n\t" 3739 "pxor %%mm0, %%mm0 \n\t" 3740 "paddw %%mm5, %%mm6 \n\t" 3741 3742 /* pa = abs(p-a) = abs(pav) */ 3743 /* pb = abs(p-b) = abs(pbv) */ 3744 /* pc = abs(p-c) = abs(pcv) */ 3745 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 3746 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 3747 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3748 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 3749 "psubw %%mm0, %%mm4 \n\t" 3750 "psubw %%mm7, %%mm5 \n\t" 3751 "psubw %%mm0, %%mm4 \n\t" 3752 "psubw %%mm7, %%mm5 \n\t" 3753 "pxor %%mm0, %%mm0 \n\t" 3754 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 3755 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3756 "psubw %%mm0, %%mm6 \n\t" 3757 /* test pa <= pb */ 3758 "movq %%mm4, %%mm7 \n\t" 3759 "psubw %%mm0, %%mm6 \n\t" 3760 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 3761 "movq %%mm7, %%mm0 \n\t" 3762 /* use mm0 mask copy to merge a & b */ 3763 "pand %%mm0, %%mm2 \n\t" 3764 /* use mm7 mask to merge pa & pb */ 3765 "pand %%mm7, %%mm5 \n\t" 3766 "pandn %%mm1, %%mm0 \n\t" 3767 "pandn %%mm4, %%mm7 \n\t" 3768 "paddw %%mm2, %%mm0 \n\t" 3769 "paddw %%mm5, %%mm7 \n\t" 3770 /* test ((pa <= pb)? pa:pb) <= pc */ 3771 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 3772 "pand %%mm7, %%mm3 \n\t" 3773 "pandn %%mm0, %%mm7 \n\t" 3774 "paddw %%mm3, %%mm7 \n\t" 3775 "pxor %%mm1, %%mm1 \n\t" 3776 "packuswb %%mm7, %%mm1 \n\t" 3777 /* step ecx to next set of 8 bytes and repeat loop til done */ 3778 "addl $8, %%ecx \n\t" 3779 "pand _ActiveMaskEnd, %%mm1 \n\t" 3780 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */ 3781 /* Raw(x) */ 3782 3783 "cmpl _MMXLength, %%ecx \n\t" 3784 "pxor %%mm0, %%mm0 \n\t" /* pxor does not affect flags */ 3785 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */ 3786 /* mm1 will be used as Raw(x-bpp) next loop */ 3787 /* mm3 ready to be used as Prior(x-bpp) next loop */ 3788 "jb paeth_3lp \n\t" 3789 3790 : "=S" (dummy_value_S), /* output regs (dummy) */ 3791 "=D" (dummy_value_D) 3792 3793 : "0" (prev_row), /* esi // input regs */ 3794 "1" (row) /* edi */ 3795 3796 : "%ecx" /* clobber list */ 3797#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 3798 , "%mm0", "%mm1", "%mm2", "%mm3" 3799 , "%mm4", "%mm5", "%mm6", "%mm7" 3800#endif 3801 ); 3802 } 3803 break; /* end 3 bpp */ 3804 3805 case 6: 3806 //case 7: /* GRR BOGUS */ 3807 //case 5: /* GRR BOGUS */ 3808 { 3809 _ActiveMask.use = 0x00000000ffffffffLL; 3810 _ActiveMask2.use = 0xffffffff00000000LL; 3811 _ShiftBpp.use = bpp << 3; /* == bpp * 8 */ 3812 _ShiftRem.use = 64 - _ShiftBpp.use; 3813 3814 __asm__ __volatile__ ( 3815 "movl _dif, %%ecx \n\t" 3816/* preload "movl row, %%edi \n\t" */ 3817/* preload "movl prev_row, %%esi \n\t" */ 3818 /* prime the pump: load the first Raw(x-bpp) data set */ 3819 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 3820 "pxor %%mm0, %%mm0 \n\t" 3821 3822 "paeth_6lp: \n\t" 3823 /* must shift to position Raw(x-bpp) data */ 3824 "psrlq _ShiftRem, %%mm1 \n\t" 3825 /* do first set of 4 bytes */ 3826 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */ 3827 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */ 3828 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 3829 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */ 3830 /* must shift to position Prior(x-bpp) data */ 3831 "psrlq _ShiftRem, %%mm3 \n\t" 3832 /* pav = p - a = (a + b - c) - a = b - c */ 3833 "movq %%mm2, %%mm4 \n\t" 3834 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */ 3835 /* pbv = p - b = (a + b - c) - b = a - c */ 3836 "movq %%mm1, %%mm5 \n\t" 3837 "psubw %%mm3, %%mm4 \n\t" 3838 "pxor %%mm7, %%mm7 \n\t" 3839 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3840 "movq %%mm4, %%mm6 \n\t" 3841 "psubw %%mm3, %%mm5 \n\t" 3842 /* pa = abs(p-a) = abs(pav) */ 3843 /* pb = abs(p-b) = abs(pbv) */ 3844 /* pc = abs(p-c) = abs(pcv) */ 3845 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 3846 "paddw %%mm5, %%mm6 \n\t" 3847 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3848 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 3849 "psubw %%mm0, %%mm4 \n\t" 3850 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 3851 "psubw %%mm0, %%mm4 \n\t" 3852 "psubw %%mm7, %%mm5 \n\t" 3853 "pxor %%mm0, %%mm0 \n\t" 3854 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 3855 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3856 "psubw %%mm7, %%mm5 \n\t" 3857 "psubw %%mm0, %%mm6 \n\t" 3858 /* test pa <= pb */ 3859 "movq %%mm4, %%mm7 \n\t" 3860 "psubw %%mm0, %%mm6 \n\t" 3861 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 3862 "movq %%mm7, %%mm0 \n\t" 3863 /* use mm7 mask to merge pa & pb */ 3864 "pand %%mm7, %%mm5 \n\t" 3865 /* use mm0 mask copy to merge a & b */ 3866 "pand %%mm0, %%mm2 \n\t" 3867 "pandn %%mm4, %%mm7 \n\t" 3868 "pandn %%mm1, %%mm0 \n\t" 3869 "paddw %%mm5, %%mm7 \n\t" 3870 "paddw %%mm2, %%mm0 \n\t" 3871 /* test ((pa <= pb)? pa:pb) <= pc */ 3872 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 3873 "pxor %%mm1, %%mm1 \n\t" 3874 "pand %%mm7, %%mm3 \n\t" 3875 "pandn %%mm0, %%mm7 \n\t" 3876 "paddw %%mm3, %%mm7 \n\t" 3877 "pxor %%mm0, %%mm0 \n\t" 3878 "packuswb %%mm1, %%mm7 \n\t" 3879 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */ 3880 "pand _ActiveMask, %%mm7 \n\t" 3881 "psrlq _ShiftRem, %%mm3 \n\t" 3882 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) step 1 */ 3883 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */ 3884 "movq %%mm2, %%mm6 \n\t" 3885 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */ 3886 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 3887 "psllq _ShiftBpp, %%mm6 \n\t" 3888 "movq %%mm7, %%mm5 \n\t" 3889 "psrlq _ShiftRem, %%mm1 \n\t" 3890 "por %%mm6, %%mm3 \n\t" 3891 "psllq _ShiftBpp, %%mm5 \n\t" 3892 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 3893 "por %%mm5, %%mm1 \n\t" 3894 /* do second set of 4 bytes */ 3895 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 3896 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */ 3897 /* pav = p - a = (a + b - c) - a = b - c */ 3898 "movq %%mm2, %%mm4 \n\t" 3899 /* pbv = p - b = (a + b - c) - b = a - c */ 3900 "movq %%mm1, %%mm5 \n\t" 3901 "psubw %%mm3, %%mm4 \n\t" 3902 "pxor %%mm7, %%mm7 \n\t" 3903 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3904 "movq %%mm4, %%mm6 \n\t" 3905 "psubw %%mm3, %%mm5 \n\t" 3906 /* pa = abs(p-a) = abs(pav) */ 3907 /* pb = abs(p-b) = abs(pbv) */ 3908 /* pc = abs(p-c) = abs(pcv) */ 3909 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 3910 "paddw %%mm5, %%mm6 \n\t" 3911 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3912 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 3913 "psubw %%mm0, %%mm4 \n\t" 3914 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 3915 "psubw %%mm0, %%mm4 \n\t" 3916 "psubw %%mm7, %%mm5 \n\t" 3917 "pxor %%mm0, %%mm0 \n\t" 3918 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 3919 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 3920 "psubw %%mm7, %%mm5 \n\t" 3921 "psubw %%mm0, %%mm6 \n\t" 3922 /* test pa <= pb */ 3923 "movq %%mm4, %%mm7 \n\t" 3924 "psubw %%mm0, %%mm6 \n\t" 3925 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 3926 "movq %%mm7, %%mm0 \n\t" 3927 /* use mm7 mask to merge pa & pb */ 3928 "pand %%mm7, %%mm5 \n\t" 3929 /* use mm0 mask copy to merge a & b */ 3930 "pand %%mm0, %%mm2 \n\t" 3931 "pandn %%mm4, %%mm7 \n\t" 3932 "pandn %%mm1, %%mm0 \n\t" 3933 "paddw %%mm5, %%mm7 \n\t" 3934 "paddw %%mm2, %%mm0 \n\t" 3935 /* test ((pa <= pb)? pa:pb) <= pc */ 3936 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 3937 "pxor %%mm1, %%mm1 \n\t" 3938 "pand %%mm7, %%mm3 \n\t" 3939 "pandn %%mm0, %%mm7 \n\t" 3940 "pxor %%mm1, %%mm1 \n\t" 3941 "paddw %%mm3, %%mm7 \n\t" 3942 "pxor %%mm0, %%mm0 \n\t" 3943 /* step ecx to next set of 8 bytes and repeat loop til done */ 3944 "addl $8, %%ecx \n\t" 3945 "packuswb %%mm7, %%mm1 \n\t" 3946 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */ 3947 "cmpl _MMXLength, %%ecx \n\t" 3948 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */ 3949 /* mm1 will be used as Raw(x-bpp) next loop */ 3950 "jb paeth_6lp \n\t" 3951 3952 : "=S" (dummy_value_S), /* output regs (dummy) */ 3953 "=D" (dummy_value_D) 3954 3955 : "0" (prev_row), /* esi // input regs */ 3956 "1" (row) /* edi */ 3957 3958 : "%ecx" /* clobber list */ 3959#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 3960 , "%mm0", "%mm1", "%mm2", "%mm3" 3961 , "%mm4", "%mm5", "%mm6", "%mm7" 3962#endif 3963 ); 3964 } 3965 break; /* end 6 bpp */ 3966 3967 case 4: 3968 { 3969 _ActiveMask.use = 0x00000000ffffffffLL; 3970 3971 __asm__ __volatile__ ( 3972 "movl _dif, %%ecx \n\t" 3973/* preload "movl row, %%edi \n\t" */ 3974/* preload "movl prev_row, %%esi \n\t" */ 3975 "pxor %%mm0, %%mm0 \n\t" 3976 /* prime the pump: load the first Raw(x-bpp) data set */ 3977 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */ 3978 /* a=Raw(x-bpp) bytes */ 3979 "paeth_4lp: \n\t" 3980 /* do first set of 4 bytes */ 3981 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */ 3982 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */ 3983 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 3984 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 3985 /* pav = p - a = (a + b - c) - a = b - c */ 3986 "movq %%mm2, %%mm4 \n\t" 3987 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 3988 /* pbv = p - b = (a + b - c) - b = a - c */ 3989 "movq %%mm1, %%mm5 \n\t" 3990 "psubw %%mm3, %%mm4 \n\t" 3991 "pxor %%mm7, %%mm7 \n\t" 3992 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 3993 "movq %%mm4, %%mm6 \n\t" 3994 "psubw %%mm3, %%mm5 \n\t" 3995 /* pa = abs(p-a) = abs(pav) */ 3996 /* pb = abs(p-b) = abs(pbv) */ 3997 /* pc = abs(p-c) = abs(pcv) */ 3998 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 3999 "paddw %%mm5, %%mm6 \n\t" 4000 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4001 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 4002 "psubw %%mm0, %%mm4 \n\t" 4003 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 4004 "psubw %%mm0, %%mm4 \n\t" 4005 "psubw %%mm7, %%mm5 \n\t" 4006 "pxor %%mm0, %%mm0 \n\t" 4007 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 4008 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4009 "psubw %%mm7, %%mm5 \n\t" 4010 "psubw %%mm0, %%mm6 \n\t" 4011 /* test pa <= pb */ 4012 "movq %%mm4, %%mm7 \n\t" 4013 "psubw %%mm0, %%mm6 \n\t" 4014 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 4015 "movq %%mm7, %%mm0 \n\t" 4016 /* use mm7 mask to merge pa & pb */ 4017 "pand %%mm7, %%mm5 \n\t" 4018 /* use mm0 mask copy to merge a & b */ 4019 "pand %%mm0, %%mm2 \n\t" 4020 "pandn %%mm4, %%mm7 \n\t" 4021 "pandn %%mm1, %%mm0 \n\t" 4022 "paddw %%mm5, %%mm7 \n\t" 4023 "paddw %%mm2, %%mm0 \n\t" 4024 /* test ((pa <= pb)? pa:pb) <= pc */ 4025 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 4026 "pxor %%mm1, %%mm1 \n\t" 4027 "pand %%mm7, %%mm3 \n\t" 4028 "pandn %%mm0, %%mm7 \n\t" 4029 "paddw %%mm3, %%mm7 \n\t" 4030 "pxor %%mm0, %%mm0 \n\t" 4031 "packuswb %%mm1, %%mm7 \n\t" 4032 "movq (%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */ 4033 "pand _ActiveMask, %%mm7 \n\t" 4034 "movq %%mm3, %%mm2 \n\t" /* load b=Prior(x) step 1 */ 4035 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */ 4036 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 4037 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */ 4038 "movq %%mm7, %%mm1 \n\t" /* now mm1 will be used as Raw(x-bpp) */ 4039 /* do second set of 4 bytes */ 4040 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */ 4041 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */ 4042 /* pav = p - a = (a + b - c) - a = b - c */ 4043 "movq %%mm2, %%mm4 \n\t" 4044 /* pbv = p - b = (a + b - c) - b = a - c */ 4045 "movq %%mm1, %%mm5 \n\t" 4046 "psubw %%mm3, %%mm4 \n\t" 4047 "pxor %%mm7, %%mm7 \n\t" 4048 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 4049 "movq %%mm4, %%mm6 \n\t" 4050 "psubw %%mm3, %%mm5 \n\t" 4051 /* pa = abs(p-a) = abs(pav) */ 4052 /* pb = abs(p-b) = abs(pbv) */ 4053 /* pc = abs(p-c) = abs(pcv) */ 4054 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 4055 "paddw %%mm5, %%mm6 \n\t" 4056 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4057 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 4058 "psubw %%mm0, %%mm4 \n\t" 4059 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 4060 "psubw %%mm0, %%mm4 \n\t" 4061 "psubw %%mm7, %%mm5 \n\t" 4062 "pxor %%mm0, %%mm0 \n\t" 4063 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 4064 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4065 "psubw %%mm7, %%mm5 \n\t" 4066 "psubw %%mm0, %%mm6 \n\t" 4067 /* test pa <= pb */ 4068 "movq %%mm4, %%mm7 \n\t" 4069 "psubw %%mm0, %%mm6 \n\t" 4070 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 4071 "movq %%mm7, %%mm0 \n\t" 4072 /* use mm7 mask to merge pa & pb */ 4073 "pand %%mm7, %%mm5 \n\t" 4074 /* use mm0 mask copy to merge a & b */ 4075 "pand %%mm0, %%mm2 \n\t" 4076 "pandn %%mm4, %%mm7 \n\t" 4077 "pandn %%mm1, %%mm0 \n\t" 4078 "paddw %%mm5, %%mm7 \n\t" 4079 "paddw %%mm2, %%mm0 \n\t" 4080 /* test ((pa <= pb)? pa:pb) <= pc */ 4081 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 4082 "pxor %%mm1, %%mm1 \n\t" 4083 "pand %%mm7, %%mm3 \n\t" 4084 "pandn %%mm0, %%mm7 \n\t" 4085 "pxor %%mm1, %%mm1 \n\t" 4086 "paddw %%mm3, %%mm7 \n\t" 4087 "pxor %%mm0, %%mm0 \n\t" 4088 /* step ecx to next set of 8 bytes and repeat loop til done */ 4089 "addl $8, %%ecx \n\t" 4090 "packuswb %%mm7, %%mm1 \n\t" 4091 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */ 4092 "cmpl _MMXLength, %%ecx \n\t" 4093 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */ 4094 /* mm1 will be used as Raw(x-bpp) next loop */ 4095 "jb paeth_4lp \n\t" 4096 4097 : "=S" (dummy_value_S), /* output regs (dummy) */ 4098 "=D" (dummy_value_D) 4099 4100 : "0" (prev_row), /* esi // input regs */ 4101 "1" (row) /* edi */ 4102 4103 : "%ecx" /* clobber list */ 4104#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 4105 , "%mm0", "%mm1", "%mm2", "%mm3" 4106 , "%mm4", "%mm5", "%mm6", "%mm7" 4107#endif 4108 ); 4109 } 4110 break; /* end 4 bpp */ 4111 4112 case 8: /* bpp == 8 */ 4113 { 4114 _ActiveMask.use = 0x00000000ffffffffLL; 4115 4116 __asm__ __volatile__ ( 4117 "movl _dif, %%ecx \n\t" 4118/* preload "movl row, %%edi \n\t" */ 4119/* preload "movl prev_row, %%esi \n\t" */ 4120 "pxor %%mm0, %%mm0 \n\t" 4121 /* prime the pump: load the first Raw(x-bpp) data set */ 4122 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */ 4123 /* a=Raw(x-bpp) bytes */ 4124 "paeth_8lp: \n\t" 4125 /* do first set of 4 bytes */ 4126 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */ 4127 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack Low bytes of a */ 4128 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 4129 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack Low bytes of b */ 4130 /* pav = p - a = (a + b - c) - a = b - c */ 4131 "movq %%mm2, %%mm4 \n\t" 4132 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack Low bytes of c */ 4133 /* pbv = p - b = (a + b - c) - b = a - c */ 4134 "movq %%mm1, %%mm5 \n\t" 4135 "psubw %%mm3, %%mm4 \n\t" 4136 "pxor %%mm7, %%mm7 \n\t" 4137 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 4138 "movq %%mm4, %%mm6 \n\t" 4139 "psubw %%mm3, %%mm5 \n\t" 4140 /* pa = abs(p-a) = abs(pav) */ 4141 /* pb = abs(p-b) = abs(pbv) */ 4142 /* pc = abs(p-c) = abs(pcv) */ 4143 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 4144 "paddw %%mm5, %%mm6 \n\t" 4145 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4146 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 4147 "psubw %%mm0, %%mm4 \n\t" 4148 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 4149 "psubw %%mm0, %%mm4 \n\t" 4150 "psubw %%mm7, %%mm5 \n\t" 4151 "pxor %%mm0, %%mm0 \n\t" 4152 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 4153 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4154 "psubw %%mm7, %%mm5 \n\t" 4155 "psubw %%mm0, %%mm6 \n\t" 4156 /* test pa <= pb */ 4157 "movq %%mm4, %%mm7 \n\t" 4158 "psubw %%mm0, %%mm6 \n\t" 4159 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 4160 "movq %%mm7, %%mm0 \n\t" 4161 /* use mm7 mask to merge pa & pb */ 4162 "pand %%mm7, %%mm5 \n\t" 4163 /* use mm0 mask copy to merge a & b */ 4164 "pand %%mm0, %%mm2 \n\t" 4165 "pandn %%mm4, %%mm7 \n\t" 4166 "pandn %%mm1, %%mm0 \n\t" 4167 "paddw %%mm5, %%mm7 \n\t" 4168 "paddw %%mm2, %%mm0 \n\t" 4169 /* test ((pa <= pb)? pa:pb) <= pc */ 4170 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 4171 "pxor %%mm1, %%mm1 \n\t" 4172 "pand %%mm7, %%mm3 \n\t" 4173 "pandn %%mm0, %%mm7 \n\t" 4174 "paddw %%mm3, %%mm7 \n\t" 4175 "pxor %%mm0, %%mm0 \n\t" 4176 "packuswb %%mm1, %%mm7 \n\t" 4177 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */ 4178 "pand _ActiveMask, %%mm7 \n\t" 4179 "movq (%%esi,%%ecx,), %%mm2 \n\t" /* load b=Prior(x) */ 4180 "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */ 4181 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack High bytes of c */ 4182 "movq %%mm7, (%%edi,%%ecx,) \n\t" /* write back updated value */ 4183 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */ 4184 4185 /* do second set of 4 bytes */ 4186 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack High bytes of b */ 4187 "punpckhbw %%mm0, %%mm1 \n\t" /* unpack High bytes of a */ 4188 /* pav = p - a = (a + b - c) - a = b - c */ 4189 "movq %%mm2, %%mm4 \n\t" 4190 /* pbv = p - b = (a + b - c) - b = a - c */ 4191 "movq %%mm1, %%mm5 \n\t" 4192 "psubw %%mm3, %%mm4 \n\t" 4193 "pxor %%mm7, %%mm7 \n\t" 4194 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 4195 "movq %%mm4, %%mm6 \n\t" 4196 "psubw %%mm3, %%mm5 \n\t" 4197 /* pa = abs(p-a) = abs(pav) */ 4198 /* pb = abs(p-b) = abs(pbv) */ 4199 /* pc = abs(p-c) = abs(pcv) */ 4200 "pcmpgtw %%mm4, %%mm0 \n\t" /* create mask pav bytes < 0 */ 4201 "paddw %%mm5, %%mm6 \n\t" 4202 "pand %%mm4, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4203 "pcmpgtw %%mm5, %%mm7 \n\t" /* create mask pbv bytes < 0 */ 4204 "psubw %%mm0, %%mm4 \n\t" 4205 "pand %%mm5, %%mm7 \n\t" /* only pbv bytes < 0 in mm0 */ 4206 "psubw %%mm0, %%mm4 \n\t" 4207 "psubw %%mm7, %%mm5 \n\t" 4208 "pxor %%mm0, %%mm0 \n\t" 4209 "pcmpgtw %%mm6, %%mm0 \n\t" /* create mask pcv bytes < 0 */ 4210 "pand %%mm6, %%mm0 \n\t" /* only pav bytes < 0 in mm7 */ 4211 "psubw %%mm7, %%mm5 \n\t" 4212 "psubw %%mm0, %%mm6 \n\t" 4213 /* test pa <= pb */ 4214 "movq %%mm4, %%mm7 \n\t" 4215 "psubw %%mm0, %%mm6 \n\t" 4216 "pcmpgtw %%mm5, %%mm7 \n\t" /* pa > pb? */ 4217 "movq %%mm7, %%mm0 \n\t" 4218 /* use mm7 mask to merge pa & pb */ 4219 "pand %%mm7, %%mm5 \n\t" 4220 /* use mm0 mask copy to merge a & b */ 4221 "pand %%mm0, %%mm2 \n\t" 4222 "pandn %%mm4, %%mm7 \n\t" 4223 "pandn %%mm1, %%mm0 \n\t" 4224 "paddw %%mm5, %%mm7 \n\t" 4225 "paddw %%mm2, %%mm0 \n\t" 4226 /* test ((pa <= pb)? pa:pb) <= pc */ 4227 "pcmpgtw %%mm6, %%mm7 \n\t" /* pab > pc? */ 4228 "pxor %%mm1, %%mm1 \n\t" 4229 "pand %%mm7, %%mm3 \n\t" 4230 "pandn %%mm0, %%mm7 \n\t" 4231 "pxor %%mm1, %%mm1 \n\t" 4232 "paddw %%mm3, %%mm7 \n\t" 4233 "pxor %%mm0, %%mm0 \n\t" 4234 /* step ecx to next set of 8 bytes and repeat loop til done */ 4235 "addl $8, %%ecx \n\t" 4236 "packuswb %%mm7, %%mm1 \n\t" 4237 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */ 4238 "cmpl _MMXLength, %%ecx \n\t" 4239 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */ 4240 /* mm1 will be used as Raw(x-bpp) next loop */ 4241 "jb paeth_8lp \n\t" 4242 4243 : "=S" (dummy_value_S), /* output regs (dummy) */ 4244 "=D" (dummy_value_D) 4245 4246 : "0" (prev_row), /* esi // input regs */ 4247 "1" (row) /* edi */ 4248 4249 : "%ecx" /* clobber list */ 4250#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */ 4251 , "%mm0", "%mm1", "%mm2", "%mm3" 4252 , "%mm4", "%mm5", "%mm6", "%mm7" 4253#endif 4254 ); 4255 } 4256 break; /* end 8 bpp */ 4257 4258 case 1: /* bpp = 1 */ 4259 case 2: /* bpp = 2 */ 4260 default: /* bpp > 8 */ 4261 { 4262 __asm__ __volatile__ ( 4263#ifdef __PIC__ 4264 "pushl %%ebx \n\t" /* save Global Offset Table index */ 4265#endif 4266 "movl _dif, %%ebx \n\t" 4267 "cmpl _FullLength, %%ebx \n\t" 4268 "jnb paeth_dend \n\t" 4269 4270/* preload "movl row, %%edi \n\t" */ 4271/* preload "movl prev_row, %%esi \n\t" */ 4272 /* do Paeth decode for remaining bytes */ 4273 "movl %%ebx, %%edx \n\t" 4274/* preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx) */ 4275 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */ 4276 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx */ 4277 4278 "paeth_dlp: \n\t" 4279 "xorl %%eax, %%eax \n\t" 4280 /* pav = p - a = (a + b - c) - a = b - c */ 4281 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */ 4282 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4283 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 4284 "movl %%eax, _patemp \n\t" /* Save pav for later use */ 4285 "xorl %%eax, %%eax \n\t" 4286 /* pbv = p - b = (a + b - c) - b = a - c */ 4287 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */ 4288 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 4289 "movl %%eax, %%ecx \n\t" 4290 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 4291 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */ 4292 /* pc = abs(pcv) */ 4293 "testl $0x80000000, %%eax \n\t" 4294 "jz paeth_dpca \n\t" 4295 "negl %%eax \n\t" /* reverse sign of neg values */ 4296 4297 "paeth_dpca: \n\t" 4298 "movl %%eax, _pctemp \n\t" /* save pc for later use */ 4299 /* pb = abs(pbv) */ 4300 "testl $0x80000000, %%ecx \n\t" 4301 "jz paeth_dpba \n\t" 4302 "negl %%ecx \n\t" /* reverse sign of neg values */ 4303 4304 "paeth_dpba: \n\t" 4305 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */ 4306 /* pa = abs(pav) */ 4307 "movl _patemp, %%eax \n\t" 4308 "testl $0x80000000, %%eax \n\t" 4309 "jz paeth_dpaa \n\t" 4310 "negl %%eax \n\t" /* reverse sign of neg values */ 4311 4312 "paeth_dpaa: \n\t" 4313 "movl %%eax, _patemp \n\t" /* save pa for later use */ 4314 /* test if pa <= pb */ 4315 "cmpl %%ecx, %%eax \n\t" 4316 "jna paeth_dabb \n\t" 4317 /* pa > pb; now test if pb <= pc */ 4318 "cmpl _pctemp, %%ecx \n\t" 4319 "jna paeth_dbbc \n\t" 4320 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 4321 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4322 "jmp paeth_dpaeth \n\t" 4323 4324 "paeth_dbbc: \n\t" 4325 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */ 4326 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */ 4327 "jmp paeth_dpaeth \n\t" 4328 4329 "paeth_dabb: \n\t" 4330 /* pa <= pb; now test if pa <= pc */ 4331 "cmpl _pctemp, %%eax \n\t" 4332 "jna paeth_dabc \n\t" 4333 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 4334 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4335 "jmp paeth_dpaeth \n\t" 4336 4337 "paeth_dabc: \n\t" 4338 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */ 4339 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */ 4340 4341 "paeth_dpaeth: \n\t" 4342 "incl %%ebx \n\t" 4343 "incl %%edx \n\t" 4344 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */ 4345 "addb %%cl, -1(%%edi,%%ebx,) \n\t" 4346 "cmpl _FullLength, %%ebx \n\t" 4347 "jb paeth_dlp \n\t" 4348 4349 "paeth_dend: \n\t" 4350#ifdef __PIC__ 4351 "popl %%ebx \n\t" /* index to Global Offset Table */ 4352#endif 4353 4354 : "=c" (dummy_value_c), /* output regs (dummy) */ 4355 "=S" (dummy_value_S), 4356 "=D" (dummy_value_D) 4357 4358 : "0" (bpp), /* ecx // input regs */ 4359 "1" (prev_row), /* esi */ 4360 "2" (row) /* edi */ 4361 4362 : "%eax", "%edx" /* clobber list */ 4363#ifndef __PIC__ 4364 , "%ebx" 4365#endif 4366 ); 4367 } 4368 return; /* No need to go further with this one */ 4369 4370 } /* end switch (bpp) */ 4371 4372 __asm__ __volatile__ ( 4373 /* MMX acceleration complete; now do clean-up */ 4374 /* check if any remaining bytes left to decode */ 4375#ifdef __PIC__ 4376 "pushl %%ebx \n\t" /* save index to Global Offset Table */ 4377#endif 4378 "movl _MMXLength, %%ebx \n\t" 4379 "cmpl _FullLength, %%ebx \n\t" 4380 "jnb paeth_end \n\t" 4381/*pre "movl row, %%edi \n\t" */ 4382/*pre "movl prev_row, %%esi \n\t" */ 4383 /* do Paeth decode for remaining bytes */ 4384 "movl %%ebx, %%edx \n\t" 4385/*pre "subl bpp, %%edx \n\t" */ /* (bpp is preloaded into ecx) */ 4386 "subl %%ecx, %%edx \n\t" /* edx = ebx - bpp */ 4387 "xorl %%ecx, %%ecx \n\t" /* zero ecx before using cl & cx below */ 4388 4389 "paeth_lp2: \n\t" 4390 "xorl %%eax, %%eax \n\t" 4391 /* pav = p - a = (a + b - c) - a = b - c */ 4392 "movb (%%esi,%%ebx,), %%al \n\t" /* load Prior(x) into al */ 4393 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4394 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 4395 "movl %%eax, _patemp \n\t" /* Save pav for later use */ 4396 "xorl %%eax, %%eax \n\t" 4397 /* pbv = p - b = (a + b - c) - b = a - c */ 4398 "movb (%%edi,%%edx,), %%al \n\t" /* load Raw(x-bpp) into al */ 4399 "subl %%ecx, %%eax \n\t" /* subtract Prior(x-bpp) */ 4400 "movl %%eax, %%ecx \n\t" 4401 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */ 4402 "addl _patemp, %%eax \n\t" /* pcv = pav + pbv */ 4403 /* pc = abs(pcv) */ 4404 "testl $0x80000000, %%eax \n\t" 4405 "jz paeth_pca2 \n\t" 4406 "negl %%eax \n\t" /* reverse sign of neg values */ 4407 4408 "paeth_pca2: \n\t" 4409 "movl %%eax, _pctemp \n\t" /* save pc for later use */ 4410 /* pb = abs(pbv) */ 4411 "testl $0x80000000, %%ecx \n\t" 4412 "jz paeth_pba2 \n\t" 4413 "negl %%ecx \n\t" /* reverse sign of neg values */ 4414 4415 "paeth_pba2: \n\t" 4416 "movl %%ecx, _pbtemp \n\t" /* save pb for later use */ 4417 /* pa = abs(pav) */ 4418 "movl _patemp, %%eax \n\t" 4419 "testl $0x80000000, %%eax \n\t" 4420 "jz paeth_paa2 \n\t" 4421 "negl %%eax \n\t" /* reverse sign of neg values */ 4422 4423 "paeth_paa2: \n\t" 4424 "movl %%eax, _patemp \n\t" /* save pa for later use */ 4425 /* test if pa <= pb */ 4426 "cmpl %%ecx, %%eax \n\t" 4427 "jna paeth_abb2 \n\t" 4428 /* pa > pb; now test if pb <= pc */ 4429 "cmpl _pctemp, %%ecx \n\t" 4430 "jna paeth_bbc2 \n\t" 4431 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 4432 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4433 "jmp paeth_paeth2 \n\t" 4434 4435 "paeth_bbc2: \n\t" 4436 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */ 4437 "movb (%%esi,%%ebx,), %%cl \n\t" /* load Prior(x) into cl */ 4438 "jmp paeth_paeth2 \n\t" 4439 4440 "paeth_abb2: \n\t" 4441 /* pa <= pb; now test if pa <= pc */ 4442 "cmpl _pctemp, %%eax \n\t" 4443 "jna paeth_abc2 \n\t" 4444 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */ 4445 "movb (%%esi,%%edx,), %%cl \n\t" /* load Prior(x-bpp) into cl */ 4446 "jmp paeth_paeth2 \n\t" 4447 4448 "paeth_abc2: \n\t" 4449 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */ 4450 "movb (%%edi,%%edx,), %%cl \n\t" /* load Raw(x-bpp) into cl */ 4451 4452 "paeth_paeth2: \n\t" 4453 "incl %%ebx \n\t" 4454 "incl %%edx \n\t" 4455 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */ 4456 "addb %%cl, -1(%%edi,%%ebx,) \n\t" 4457 "cmpl _FullLength, %%ebx \n\t" 4458 "jb paeth_lp2 \n\t" 4459 4460 "paeth_end: \n\t" 4461 "EMMS \n\t" /* end MMX; prep for poss. FP instrs. */ 4462#ifdef __PIC__ 4463 "popl %%ebx \n\t" /* restore index to Global Offset Table */ 4464#endif 4465 4466 : "=c" (dummy_value_c), /* output regs (dummy) */ 4467 "=S" (dummy_value_S), 4468 "=D" (dummy_value_D) 4469 4470 : "0" (bpp), /* ecx // input regs */ 4471 "1" (prev_row), /* esi */ 4472 "2" (row) /* edi */ 4473 4474 : "%eax", "%edx" /* clobber list (no input regs!) */ 4475#ifndef __PIC__ 4476 , "%ebx" 4477#endif 4478 ); 4479 4480} /* end png_read_filter_row_mmx_paeth() */ 4481#endif 4482 4483 4484 4485 4486#ifdef PNG_THREAD_UNSAFE_OK 4487/*===========================================================================*/ 4488/* */ 4489/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B */ 4490/* */ 4491/*===========================================================================*/ 4492 4493/* Optimized code for PNG Sub filter decoder */ 4494 4495static void /* PRIVATE */ 4496png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row) 4497{ 4498 int bpp; 4499 int dummy_value_a; 4500 int dummy_value_D; 4501 4502 bpp = (row_info->pixel_depth + 7) >> 3; /* calc number of bytes per pixel */ 4503 _FullLength = row_info->rowbytes - bpp; /* number of bytes to filter */ 4504 4505 __asm__ __volatile__ ( 4506/*pre "movl row, %%edi \n\t" */ 4507 "movl %%edi, %%esi \n\t" /* lp = row */ 4508/*pre "movl bpp, %%eax \n\t" */ 4509 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4510/*irr "xorl %%eax, %%eax \n\t" */ 4511 /* get # of bytes to alignment */ 4512 "movl %%edi, _dif \n\t" /* take start of row */ 4513 "addl $0xf, _dif \n\t" /* add 7 + 8 to incr past */ 4514 /* alignment boundary */ 4515 "xorl %%ecx, %%ecx \n\t" 4516 "andl $0xfffffff8, _dif \n\t" /* mask to alignment boundary */ 4517 "subl %%edi, _dif \n\t" /* subtract from start ==> value */ 4518 "jz sub_go \n\t" /* ecx at alignment */ 4519 4520 "sub_lp1: \n\t" /* fix alignment */ 4521 "movb (%%esi,%%ecx,), %%al \n\t" 4522 "addb %%al, (%%edi,%%ecx,) \n\t" 4523 "incl %%ecx \n\t" 4524 "cmpl _dif, %%ecx \n\t" 4525 "jb sub_lp1 \n\t" 4526 4527 "sub_go: \n\t" 4528 "movl _FullLength, %%eax \n\t" 4529 "movl %%eax, %%edx \n\t" 4530 "subl %%ecx, %%edx \n\t" /* subtract alignment fix */ 4531 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */ 4532 "subl %%edx, %%eax \n\t" /* drop over bytes from length */ 4533 "movl %%eax, _MMXLength \n\t" 4534 4535 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4536 "=D" (dummy_value_D) /* 1 */ 4537 4538 : "0" (bpp), /* eax // input regs */ 4539 "1" (row) /* edi */ 4540 4541 : "%esi", "%ecx", "%edx" // clobber list 4542 4543#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4544 , "%mm0", "%mm1", "%mm2", "%mm3" 4545 , "%mm4", "%mm5", "%mm6", "%mm7" 4546#endif 4547 ); 4548 4549 /* now do the math for the rest of the row */ 4550 switch (bpp) 4551 { 4552 case 3: 4553 { 4554 _ActiveMask.use = 0x0000ffffff000000LL; 4555 _ShiftBpp.use = 24; /* == 3 * 8 */ 4556 _ShiftRem.use = 40; /* == 64 - 24 */ 4557 4558 __asm__ __volatile__ ( 4559/* preload "movl row, %%edi \n\t" */ 4560 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */ 4561 /* active byte group */ 4562 "movl %%edi, %%esi \n\t" /* lp = row */ 4563/* preload "movl bpp, %%eax \n\t" */ 4564 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4565 "movq %%mm7, %%mm6 \n\t" 4566 "movl _dif, %%edx \n\t" 4567 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */ 4568 /* 3rd active byte group */ 4569 /* prime the pump: load the first Raw(x-bpp) data set */ 4570 "movq -8(%%edi,%%edx,), %%mm1 \n\t" 4571 4572 "sub_3lp: \n\t" /* shift data for adding first */ 4573 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */ 4574 /* shift clears inactive bytes) */ 4575 /* add 1st active group */ 4576 "movq (%%edi,%%edx,), %%mm0 \n\t" 4577 "paddb %%mm1, %%mm0 \n\t" 4578 4579 /* add 2nd active group */ 4580 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4581 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4582 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */ 4583 "paddb %%mm1, %%mm0 \n\t" 4584 4585 /* add 3rd active group */ 4586 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4587 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4588 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */ 4589 "addl $8, %%edx \n\t" 4590 "paddb %%mm1, %%mm0 \n\t" 4591 4592 "cmpl _MMXLength, %%edx \n\t" 4593 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */ 4594 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */ 4595 "jb sub_3lp \n\t" 4596 4597 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4598 "=D" (dummy_value_D) /* 1 */ 4599 4600 : "0" (bpp), /* eax // input regs */ 4601 "1" (row) /* edi */ 4602 4603 : "%edx", "%esi" /* clobber list */ 4604#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4605 , "%mm0", "%mm1", "%mm6", "%mm7" 4606#endif 4607 ); 4608 } 4609 break; 4610 4611 case 1: 4612 { 4613 __asm__ __volatile__ ( 4614 "movl _dif, %%edx \n\t" 4615/* preload "movl row, %%edi \n\t" */ 4616 "cmpl _FullLength, %%edx \n\t" 4617 "jnb sub_1end \n\t" 4618 "movl %%edi, %%esi \n\t" /* lp = row */ 4619 "xorl %%eax, %%eax \n\t" 4620/* preload "movl bpp, %%eax \n\t" */ 4621 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4622 4623 "sub_1lp: \n\t" 4624 "movb (%%esi,%%edx,), %%al \n\t" 4625 "addb %%al, (%%edi,%%edx,) \n\t" 4626 "incl %%edx \n\t" 4627 "cmpl _FullLength, %%edx \n\t" 4628 "jb sub_1lp \n\t" 4629 4630 "sub_1end: \n\t" 4631 4632 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4633 "=D" (dummy_value_D) /* 1 */ 4634 4635 : "0" (bpp), /* eax // input regs */ 4636 "1" (row) /* edi */ 4637 4638 : "%edx", "%esi" /* clobber list */ 4639 ); 4640 } 4641 return; 4642 4643 case 6: 4644 case 4: 4645 //case 7: /* GRR BOGUS */ 4646 //case 5: /* GRR BOGUS */ 4647 { 4648 _ShiftBpp.use = bpp << 3; 4649 _ShiftRem.use = 64 - _ShiftBpp.use; 4650 4651 __asm__ __volatile__ ( 4652/* preload "movl row, %%edi \n\t" */ 4653 "movl _dif, %%edx \n\t" 4654 "movl %%edi, %%esi \n\t" /* lp = row */ 4655/* preload "movl bpp, %%eax \n\t" */ 4656 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4657 4658 /* prime the pump: load the first Raw(x-bpp) data set */ 4659 "movq -8(%%edi,%%edx,), %%mm1 \n\t" 4660 4661 "sub_4lp: \n\t" /* shift data for adding first */ 4662 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */ 4663 /* shift clears inactive bytes) */ 4664 "movq (%%edi,%%edx,), %%mm0 \n\t" 4665 "paddb %%mm1, %%mm0 \n\t" 4666 4667 /* add 2nd active group */ 4668 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4669 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4670 "addl $8, %%edx \n\t" 4671 "paddb %%mm1, %%mm0 \n\t" 4672 4673 "cmpl _MMXLength, %%edx \n\t" 4674 "movq %%mm0, -8(%%edi,%%edx,) \n\t" 4675 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */ 4676 "jb sub_4lp \n\t" 4677 4678 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4679 "=D" (dummy_value_D) /* 1 */ 4680 4681 : "0" (bpp), /* eax // input regs */ 4682 "1" (row) /* edi */ 4683 4684 : "%edx", "%esi" /* clobber list */ 4685#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4686 , "%mm0", "%mm1" 4687#endif 4688 ); 4689 } 4690 break; 4691 4692 case 2: 4693 { 4694 _ActiveMask.use = 0x00000000ffff0000LL; 4695 _ShiftBpp.use = 16; /* == 2 * 8 */ 4696 _ShiftRem.use = 48; /* == 64 - 16 */ 4697 4698 __asm__ __volatile__ ( 4699 "movq _ActiveMask, %%mm7 \n\t" /* load _ActiveMask for 2nd */ 4700 /* active byte group */ 4701 "movl _dif, %%edx \n\t" 4702 "movq %%mm7, %%mm6 \n\t" 4703/* preload "movl row, %%edi \n\t" */ 4704 "psllq _ShiftBpp, %%mm6 \n\t" /* move mask in mm6 to cover */ 4705 /* 3rd active byte group */ 4706 "movl %%edi, %%esi \n\t" /* lp = row */ 4707 "movq %%mm6, %%mm5 \n\t" 4708/* preload "movl bpp, %%eax \n\t" */ 4709 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4710 "psllq _ShiftBpp, %%mm5 \n\t" /* move mask in mm5 to cover */ 4711 /* 4th active byte group */ 4712 /* prime the pump: load the first Raw(x-bpp) data set */ 4713 "movq -8(%%edi,%%edx,), %%mm1 \n\t" 4714 4715 "sub_2lp: \n\t" /* shift data for adding first */ 4716 "psrlq _ShiftRem, %%mm1 \n\t" /* bpp bytes (no need for mask; */ 4717 /* shift clears inactive bytes) */ 4718 /* add 1st active group */ 4719 "movq (%%edi,%%edx,), %%mm0 \n\t" 4720 "paddb %%mm1, %%mm0 \n\t" 4721 4722 /* add 2nd active group */ 4723 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4724 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4725 "pand %%mm7, %%mm1 \n\t" /* mask to use 2nd active group */ 4726 "paddb %%mm1, %%mm0 \n\t" 4727 4728 /* add 3rd active group */ 4729 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4730 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4731 "pand %%mm6, %%mm1 \n\t" /* mask to use 3rd active group */ 4732 "paddb %%mm1, %%mm0 \n\t" 4733 4734 /* add 4th active group */ 4735 "movq %%mm0, %%mm1 \n\t" /* mov updated Raws to mm1 */ 4736 "psllq _ShiftBpp, %%mm1 \n\t" /* shift data to pos. correctly */ 4737 "pand %%mm5, %%mm1 \n\t" /* mask to use 4th active group */ 4738 "addl $8, %%edx \n\t" 4739 "paddb %%mm1, %%mm0 \n\t" 4740 "cmpl _MMXLength, %%edx \n\t" 4741 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */ 4742 "movq %%mm0, %%mm1 \n\t" /* prep 1st add at top of loop */ 4743 "jb sub_2lp \n\t" 4744 4745 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4746 "=D" (dummy_value_D) /* 1 */ 4747 4748 : "0" (bpp), /* eax // input regs */ 4749 "1" (row) /* edi */ 4750 4751 : "%edx", "%esi" /* clobber list */ 4752#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4753 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7" 4754#endif 4755 ); 4756 } 4757 break; 4758 4759 case 8: 4760 { 4761 __asm__ __volatile__ ( 4762/* preload "movl row, %%edi \n\t" */ 4763 "movl _dif, %%edx \n\t" 4764 "movl %%edi, %%esi \n\t" /* lp = row */ 4765/* preload "movl bpp, %%eax \n\t" */ 4766 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4767 "movl _MMXLength, %%ecx \n\t" 4768 4769 /* prime the pump: load the first Raw(x-bpp) data set */ 4770 "movq -8(%%edi,%%edx,), %%mm7 \n\t" 4771 "andl $0x0000003f, %%ecx \n\t" /* calc bytes over mult of 64 */ 4772 4773 "sub_8lp: \n\t" 4774 "movq (%%edi,%%edx,), %%mm0 \n\t" /* load Sub(x) for 1st 8 bytes */ 4775 "paddb %%mm7, %%mm0 \n\t" 4776 "movq 8(%%edi,%%edx,), %%mm1 \n\t" /* load Sub(x) for 2nd 8 bytes */ 4777 "movq %%mm0, (%%edi,%%edx,) \n\t" /* write Raw(x) for 1st 8 bytes */ 4778 4779 /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */ 4780 /* This will be repeated for each group of 8 bytes with the 8th */ 4781 /* group being used as the Raw(x-bpp) for the 1st group of the */ 4782 /* next loop. */ 4783 4784 "paddb %%mm0, %%mm1 \n\t" 4785 "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */ 4786 "movq %%mm1, 8(%%edi,%%edx,) \n\t" /* write Raw(x) for 2nd 8 bytes */ 4787 "paddb %%mm1, %%mm2 \n\t" 4788 "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */ 4789 "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */ 4790 "paddb %%mm2, %%mm3 \n\t" 4791 "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */ 4792 "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */ 4793 "paddb %%mm3, %%mm4 \n\t" 4794 "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */ 4795 "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */ 4796 "paddb %%mm4, %%mm5 \n\t" 4797 "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */ 4798 "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */ 4799 "paddb %%mm5, %%mm6 \n\t" 4800 "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */ 4801 "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */ 4802 "addl $64, %%edx \n\t" 4803 "paddb %%mm6, %%mm7 \n\t" 4804 "cmpl %%ecx, %%edx \n\t" 4805 "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */ 4806 "jb sub_8lp \n\t" 4807 4808 "cmpl _MMXLength, %%edx \n\t" 4809 "jnb sub_8lt8 \n\t" 4810 4811 "sub_8lpA: \n\t" 4812 "movq (%%edi,%%edx,), %%mm0 \n\t" 4813 "addl $8, %%edx \n\t" 4814 "paddb %%mm7, %%mm0 \n\t" 4815 "cmpl _MMXLength, %%edx \n\t" 4816 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */ 4817 "movq %%mm0, %%mm7 \n\t" /* move calculated Raw(x) data */ 4818 /* to mm1 to be new Raw(x-bpp) */ 4819 /* for next loop */ 4820 "jb sub_8lpA \n\t" 4821 4822 "sub_8lt8: \n\t" 4823 4824 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4825 "=D" (dummy_value_D) /* 1 */ 4826 4827 : "0" (bpp), /* eax // input regs */ 4828 "1" (row) /* edi */ 4829 4830 : "%ecx", "%edx", "%esi" /* clobber list */ 4831#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4832 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" 4833#endif 4834 ); 4835 } 4836 break; 4837 4838 default: /* bpp greater than 8 bytes GRR BOGUS */ 4839 { 4840 __asm__ __volatile__ ( 4841 "movl _dif, %%edx \n\t" 4842/* preload "movl row, %%edi \n\t" */ 4843 "movl %%edi, %%esi \n\t" /* lp = row */ 4844/* preload "movl bpp, %%eax \n\t" */ 4845 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4846 4847 "sub_Alp: \n\t" 4848 "movq (%%edi,%%edx,), %%mm0 \n\t" 4849 "movq (%%esi,%%edx,), %%mm1 \n\t" 4850 "addl $8, %%edx \n\t" 4851 "paddb %%mm1, %%mm0 \n\t" 4852 "cmpl _MMXLength, %%edx \n\t" 4853 "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */ 4854 /* -8 to offset addl edx */ 4855 "jb sub_Alp \n\t" 4856 4857 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4858 "=D" (dummy_value_D) /* 1 */ 4859 4860 : "0" (bpp), /* eax // input regs */ 4861 "1" (row) /* edi */ 4862 4863 : "%edx", "%esi" /* clobber list */ 4864#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 4865 , "%mm0", "%mm1" 4866#endif 4867 ); 4868 } 4869 break; 4870 4871 } /* end switch (bpp) */ 4872 4873 __asm__ __volatile__ ( 4874 "movl _MMXLength, %%edx \n\t" 4875/* pre "movl row, %%edi \n\t" */ 4876 "cmpl _FullLength, %%edx \n\t" 4877 "jnb sub_end \n\t" 4878 4879 "movl %%edi, %%esi \n\t" /* lp = row */ 4880/* pre "movl bpp, %%eax \n\t" */ 4881 "addl %%eax, %%edi \n\t" /* rp = row + bpp */ 4882 "xorl %%eax, %%eax \n\t" 4883 4884 "sub_lp2: \n\t" 4885 "movb (%%esi,%%edx,), %%al \n\t" 4886 "addb %%al, (%%edi,%%edx,) \n\t" 4887 "incl %%edx \n\t" 4888 "cmpl _FullLength, %%edx \n\t" 4889 "jb sub_lp2 \n\t" 4890 4891 "sub_end: \n\t" 4892 "EMMS \n\t" /* end MMX instructions */ 4893 4894 : "=a" (dummy_value_a), /* 0 // output regs (dummy) */ 4895 "=D" (dummy_value_D) /* 1 */ 4896 4897 : "0" (bpp), /* eax // input regs */ 4898 "1" (row) /* edi */ 4899 4900 : "%edx", "%esi" /* clobber list */ 4901 ); 4902 4903} /* end of png_read_filter_row_mmx_sub() */ 4904#endif 4905 4906 4907 4908 4909/*===========================================================================*/ 4910/* */ 4911/* P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P */ 4912/* */ 4913/*===========================================================================*/ 4914 4915/* Optimized code for PNG Up filter decoder */ 4916 4917static void /* PRIVATE */ 4918png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row, 4919 png_bytep prev_row) 4920{ 4921 png_uint_32 len; 4922 int dummy_value_d; /* fix 'forbidden register 3 (dx) was spilled' error */ 4923 int dummy_value_S; 4924 int dummy_value_D; 4925 4926 len = row_info->rowbytes; /* number of bytes to filter */ 4927 4928 __asm__ __volatile__ ( 4929/* pre "movl row, %%edi \n\t" */ 4930 /* get # of bytes to alignment */ 4931#ifdef __PIC__ 4932 "pushl %%ebx \n\t" 4933#endif 4934 "movl %%edi, %%ecx \n\t" 4935 "xorl %%ebx, %%ebx \n\t" 4936 "addl $0x7, %%ecx \n\t" 4937 "xorl %%eax, %%eax \n\t" 4938 "andl $0xfffffff8, %%ecx \n\t" 4939/* pre "movl prev_row, %%esi \n\t" */ 4940 "subl %%edi, %%ecx \n\t" 4941 "jz up_go \n\t" 4942 4943 "up_lp1: \n\t" /* fix alignment */ 4944 "movb (%%edi,%%ebx,), %%al \n\t" 4945 "addb (%%esi,%%ebx,), %%al \n\t" 4946 "incl %%ebx \n\t" 4947 "cmpl %%ecx, %%ebx \n\t" 4948 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */ 4949 "jb up_lp1 \n\t" /* offset incl ebx */ 4950 4951 "up_go: \n\t" 4952/* pre "movl len, %%edx \n\t" */ 4953 "movl %%edx, %%ecx \n\t" 4954 "subl %%ebx, %%edx \n\t" /* subtract alignment fix */ 4955 "andl $0x0000003f, %%edx \n\t" /* calc bytes over mult of 64 */ 4956 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */ 4957 4958 /* unrolled loop - use all MMX registers and interleave to reduce */ 4959 /* number of branch instructions (loops) and reduce partial stalls */ 4960 "up_loop: \n\t" 4961 "movq (%%esi,%%ebx,), %%mm1 \n\t" 4962 "movq (%%edi,%%ebx,), %%mm0 \n\t" 4963 "movq 8(%%esi,%%ebx,), %%mm3 \n\t" 4964 "paddb %%mm1, %%mm0 \n\t" 4965 "movq 8(%%edi,%%ebx,), %%mm2 \n\t" 4966 "movq %%mm0, (%%edi,%%ebx,) \n\t" 4967 "paddb %%mm3, %%mm2 \n\t" 4968 "movq 16(%%esi,%%ebx,), %%mm5 \n\t" 4969 "movq %%mm2, 8(%%edi,%%ebx,) \n\t" 4970 "movq 16(%%edi,%%ebx,), %%mm4 \n\t" 4971 "movq 24(%%esi,%%ebx,), %%mm7 \n\t" 4972 "paddb %%mm5, %%mm4 \n\t" 4973 "movq 24(%%edi,%%ebx,), %%mm6 \n\t" 4974 "movq %%mm4, 16(%%edi,%%ebx,) \n\t" 4975 "paddb %%mm7, %%mm6 \n\t" 4976 "movq 32(%%esi,%%ebx,), %%mm1 \n\t" 4977 "movq %%mm6, 24(%%edi,%%ebx,) \n\t" 4978 "movq 32(%%edi,%%ebx,), %%mm0 \n\t" 4979 "movq 40(%%esi,%%ebx,), %%mm3 \n\t" 4980 "paddb %%mm1, %%mm0 \n\t" 4981 "movq 40(%%edi,%%ebx,), %%mm2 \n\t" 4982 "movq %%mm0, 32(%%edi,%%ebx,) \n\t" 4983 "paddb %%mm3, %%mm2 \n\t" 4984 "movq 48(%%esi,%%ebx,), %%mm5 \n\t" 4985 "movq %%mm2, 40(%%edi,%%ebx,) \n\t" 4986 "movq 48(%%edi,%%ebx,), %%mm4 \n\t" 4987 "movq 56(%%esi,%%ebx,), %%mm7 \n\t" 4988 "paddb %%mm5, %%mm4 \n\t" 4989 "movq 56(%%edi,%%ebx,), %%mm6 \n\t" 4990 "movq %%mm4, 48(%%edi,%%ebx,) \n\t" 4991 "addl $64, %%ebx \n\t" 4992 "paddb %%mm7, %%mm6 \n\t" 4993 "cmpl %%ecx, %%ebx \n\t" 4994 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */ 4995 "jb up_loop \n\t" /* -8 to offset addl ebx */ 4996 4997 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 64 */ 4998 "jz up_end \n\t" 4999 5000 "cmpl $8, %%edx \n\t" /* test for less than 8 bytes */ 5001 "jb up_lt8 \n\t" /* [added by lcreeve@netins.net] */ 5002 5003 "addl %%edx, %%ecx \n\t" 5004 "andl $0x00000007, %%edx \n\t" /* calc bytes over mult of 8 */ 5005 "subl %%edx, %%ecx \n\t" /* drop over bytes from length */ 5006 "jz up_lt8 \n\t" 5007 5008 "up_lpA: \n\t" /* use MMX regs to update 8 bytes sim. */ 5009 "movq (%%esi,%%ebx,), %%mm1 \n\t" 5010 "movq (%%edi,%%ebx,), %%mm0 \n\t" 5011 "addl $8, %%ebx \n\t" 5012 "paddb %%mm1, %%mm0 \n\t" 5013 "cmpl %%ecx, %%ebx \n\t" 5014 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */ 5015 "jb up_lpA \n\t" /* offset add ebx */ 5016 "cmpl $0, %%edx \n\t" /* test for bytes over mult of 8 */ 5017 "jz up_end \n\t" 5018 5019 "up_lt8: \n\t" 5020 "xorl %%eax, %%eax \n\t" 5021 "addl %%edx, %%ecx \n\t" /* move over byte count into counter */ 5022 5023 "up_lp2: \n\t" /* use x86 regs for remaining bytes */ 5024 "movb (%%edi,%%ebx,), %%al \n\t" 5025 "addb (%%esi,%%ebx,), %%al \n\t" 5026 "incl %%ebx \n\t" 5027 "cmpl %%ecx, %%ebx \n\t" 5028 "movb %%al, -1(%%edi,%%ebx,) \n\t" /* mov does not affect flags; -1 to */ 5029 "jb up_lp2 \n\t" /* offset inc ebx */ 5030 5031 "up_end: \n\t" 5032 "EMMS \n\t" /* conversion of filtered row complete */ 5033#ifdef __PIC__ 5034 "popl %%ebx \n\t" 5035#endif 5036 5037 : "=d" (dummy_value_d), /* 0 // output regs (dummy) */ 5038 "=S" (dummy_value_S), /* 1 */ 5039 "=D" (dummy_value_D) /* 2 */ 5040 5041 : "0" (len), /* edx // input regs */ 5042 "1" (prev_row), /* esi */ 5043 "2" (row) /* edi */ 5044 5045 : "%eax", "%ecx" // clobber list (no input regs!) 5046#ifndef __PIC__ 5047 , "%ebx" 5048#endif 5049 5050#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */ 5051 , "%mm0", "%mm1", "%mm2", "%mm3" 5052 , "%mm4", "%mm5", "%mm6", "%mm7" 5053#endif 5054 ); 5055 5056} /* end of png_read_filter_row_mmx_up() */ 5057 5058#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5059 5060 5061 5062 5063/*===========================================================================*/ 5064/* */ 5065/* P N G _ R E A D _ F I L T E R _ R O W */ 5066/* */ 5067/*===========================================================================*/ 5068 5069 5070/* Optimized png_read_filter_row routines */ 5071 5072void /* PRIVATE */ 5073png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep 5074 row, png_bytep prev_row, int filter) 5075{ 5076#ifdef PNG_DEBUG 5077 char filnm[10]; 5078#endif 5079 5080#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 5081/* GRR: these are superseded by png_ptr->asm_flags: */ 5082#define UseMMX_sub 1 /* GRR: converted 20000730 */ 5083#define UseMMX_up 1 /* GRR: converted 20000729 */ 5084#define UseMMX_avg 1 /* GRR: converted 20000828 (+ 16-bit bugfix 20000916) */ 5085#define UseMMX_paeth 1 /* GRR: converted 20000828 */ 5086 5087 if (_mmx_supported == 2) { 5088 /* this should have happened in png_init_mmx_flags() already */ 5089#if !defined(PNG_1_0_X) 5090 png_warning(png_ptr, "asm_flags may not have been initialized"); 5091#endif 5092 png_mmx_support(); 5093 } 5094#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5095 5096#ifdef PNG_DEBUG 5097 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n"); 5098 switch (filter) 5099 { 5100 case 0: sprintf(filnm, "none"); 5101 break; 5102 case 1: sprintf(filnm, "sub-%s", 5103#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5104#if !defined(PNG_1_0_X) 5105 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : 5106#endif 5107#endif 5108"x86"); 5109 break; 5110 case 2: sprintf(filnm, "up-%s", 5111#ifdef PNG_ASSEMBLER_CODE_SUPPORTED 5112#if !defined(PNG_1_0_X) 5113 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : 5114#endif 5115#endif 5116 "x86"); 5117 break; 5118 case 3: sprintf(filnm, "avg-%s", 5119#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5120#if !defined(PNG_1_0_X) 5121 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : 5122#endif 5123#endif 5124 "x86"); 5125 break; 5126 case 4: sprintf(filnm, "Paeth-%s", 5127#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5128#if !defined(PNG_1_0_X) 5129 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX": 5130#endif 5131#endif 5132"x86"); 5133 break; 5134 default: sprintf(filnm, "unknw"); 5135 break; 5136 } 5137 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm); 5138 png_debug1(0, "row=0x%08lx, ", (unsigned long)row); 5139 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth, 5140 (int)((row_info->pixel_depth + 7) >> 3)); 5141 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes); 5142#endif /* PNG_DEBUG */ 5143 5144 switch (filter) 5145 { 5146 case PNG_FILTER_VALUE_NONE: 5147 break; 5148 5149 case PNG_FILTER_VALUE_SUB: 5150#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5151#if !defined(PNG_1_0_X) 5152 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) && 5153 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5154 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5155#else 5156 if (_mmx_supported) 5157#endif 5158 { 5159 png_read_filter_row_mmx_sub(row_info, row); 5160 } 5161 else 5162#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5163 { 5164 png_uint_32 i; 5165 png_uint_32 istop = row_info->rowbytes; 5166 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 5167 png_bytep rp = row + bpp; 5168 png_bytep lp = row; 5169 5170 for (i = bpp; i < istop; i++) 5171 { 5172 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); 5173 rp++; 5174 } 5175 } /* end !UseMMX_sub */ 5176 break; 5177 5178 case PNG_FILTER_VALUE_UP: 5179#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) 5180#if !defined(PNG_1_0_X) 5181 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) && 5182 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5183 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5184#else 5185 if (_mmx_supported) 5186#endif 5187 { 5188 png_read_filter_row_mmx_up(row_info, row, prev_row); 5189 } 5190 else 5191#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5192 { 5193 png_uint_32 i; 5194 png_uint_32 istop = row_info->rowbytes; 5195 png_bytep rp = row; 5196 png_bytep pp = prev_row; 5197 5198 for (i = 0; i < istop; ++i) 5199 { 5200 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 5201 rp++; 5202 } 5203 } /* end !UseMMX_up */ 5204 break; 5205 5206 case PNG_FILTER_VALUE_AVG: 5207#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5208#if !defined(PNG_1_0_X) 5209 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) && 5210 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5211 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5212#else 5213 if (_mmx_supported) 5214#endif 5215 { 5216 png_read_filter_row_mmx_avg(row_info, row, prev_row); 5217 } 5218 else 5219#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5220 { 5221 png_uint_32 i; 5222 png_bytep rp = row; 5223 png_bytep pp = prev_row; 5224 png_bytep lp = row; 5225 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 5226 png_uint_32 istop = row_info->rowbytes - bpp; 5227 5228 for (i = 0; i < bpp; i++) 5229 { 5230 *rp = (png_byte)(((int)(*rp) + 5231 ((int)(*pp++) >> 1)) & 0xff); 5232 rp++; 5233 } 5234 5235 for (i = 0; i < istop; i++) 5236 { 5237 *rp = (png_byte)(((int)(*rp) + 5238 ((int)(*pp++ + *lp++) >> 1)) & 0xff); 5239 rp++; 5240 } 5241 } /* end !UseMMX_avg */ 5242 break; 5243 5244 case PNG_FILTER_VALUE_PAETH: 5245#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK) 5246#if !defined(PNG_1_0_X) 5247 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) && 5248 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) && 5249 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold)) 5250#else 5251 if (_mmx_supported) 5252#endif 5253 { 5254 png_read_filter_row_mmx_paeth(row_info, row, prev_row); 5255 } 5256 else 5257#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */ 5258 { 5259 png_uint_32 i; 5260 png_bytep rp = row; 5261 png_bytep pp = prev_row; 5262 png_bytep lp = row; 5263 png_bytep cp = prev_row; 5264 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; 5265 png_uint_32 istop = row_info->rowbytes - bpp; 5266 5267 for (i = 0; i < bpp; i++) 5268 { 5269 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 5270 rp++; 5271 } 5272 5273 for (i = 0; i < istop; i++) /* use leftover rp,pp */ 5274 { 5275 int a, b, c, pa, pb, pc, p; 5276 5277 a = *lp++; 5278 b = *pp++; 5279 c = *cp++; 5280 5281 p = b - c; 5282 pc = a - c; 5283 5284#ifdef PNG_USE_ABS 5285 pa = abs(p); 5286 pb = abs(pc); 5287 pc = abs(p + pc); 5288#else 5289 pa = p < 0 ? -p : p; 5290 pb = pc < 0 ? -pc : pc; 5291 pc = (p + pc) < 0 ? -(p + pc) : p + pc; 5292#endif 5293 5294 /* 5295 if (pa <= pb && pa <= pc) 5296 p = a; 5297 else if (pb <= pc) 5298 p = b; 5299 else 5300 p = c; 5301 */ 5302 5303 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c; 5304 5305 *rp = (png_byte)(((int)(*rp) + p) & 0xff); 5306 rp++; 5307 } 5308 } /* end !UseMMX_paeth */ 5309 break; 5310 5311 default: 5312 png_warning(png_ptr, "Ignoring bad row-filter type"); 5313 *row=0; 5314 break; 5315 } 5316} 5317 5318#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */ 5319 5320 5321/*===========================================================================*/ 5322/* */ 5323/* P N G _ M M X _ S U P P O R T */ 5324/* */ 5325/*===========================================================================*/ 5326 5327/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl) 5328 * (2) all instructions compile with gcc 2.7.2.3 and later 5329 * (3) the function is moved down here to prevent gcc from 5330 * inlining it in multiple places and then barfing be- 5331 * cause the ".NOT_SUPPORTED" label is multiply defined 5332 * [is there a way to signal that a *single* function should 5333 * not be inlined? is there a way to modify the label for 5334 * each inlined instance, e.g., by appending _1, _2, etc.? 5335 * maybe if don't use leading "." in label name? (nope...sigh)] 5336 */ 5337 5338int PNGAPI 5339png_mmx_support(void) 5340{ 5341#if defined(PNG_MMX_CODE_SUPPORTED) 5342 __asm__ __volatile__ ( 5343 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction 5344 "pushl %%ecx \n\t" // so does ecx... 5345 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux) 5346// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd 5347// "pushf \n\t" // 16-bit pushf 5348 "pushfl \n\t" // save Eflag to stack 5349 "popl %%eax \n\t" // get Eflag from stack into eax 5350 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx 5351 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21) 5352 "pushl %%eax \n\t" // save modified Eflag back to stack 5353// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd 5354// "popf \n\t" // 16-bit popf 5355 "popfl \n\t" // restore modified value to Eflag reg 5356 "pushfl \n\t" // save Eflag to stack 5357 "popl %%eax \n\t" // get Eflag from stack 5358 "pushl %%ecx \n\t" // save original Eflag to stack 5359 "popfl \n\t" // restore original Eflag 5360 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag 5361 "jz 0f \n\t" // if same, CPUID instr. is not supported 5362 5363 "xorl %%eax, %%eax \n\t" // set eax to zero 5364// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode) 5365 "cpuid \n\t" // get the CPU identification info 5366 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value 5367 "jl 0f \n\t" // if eax is zero, MMX is not supported 5368 5369 "xorl %%eax, %%eax \n\t" // set eax to zero and... 5370 "incl %%eax \n\t" // ...increment eax to 1. This pair is 5371 // faster than the instruction "mov eax, 1" 5372 "cpuid \n\t" // get the CPU identification info again 5373 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23) 5374 "cmpl $0, %%edx \n\t" // 0 = MMX not supported 5375 "jz 0f \n\t" // non-zero = yes, MMX IS supported 5376 5377 "movl $1, %%eax \n\t" // set return value to 1 5378 "jmp 1f \n\t" // DONE: have MMX support 5379 5380 "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions 5381 "movl $0, %%eax \n\t" // set return value to 0 5382 "1: \n\t" // .RETURN: target label for jump instructions 5383 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too 5384 "popl %%edx \n\t" // restore edx 5385 "popl %%ecx \n\t" // restore ecx 5386 "popl %%ebx \n\t" // restore ebx 5387 5388// "ret \n\t" // DONE: no MMX support 5389 // (fall through to standard C "ret") 5390 5391 : // output list (none) 5392 5393 : // any variables used on input (none) 5394 5395 : "%eax" // clobber list 5396// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually 5397// , "memory" // if write to a variable gcc thought was in a reg 5398// , "cc" // "condition codes" (flag bits) 5399 ); 5400#else 5401 _mmx_supported = 0; 5402#endif /* PNG_MMX_CODE_SUPPORTED */ 5403 5404 return _mmx_supported; 5405} 5406 5407 5408#endif /* PNG_USE_PNGGCCRD */ 5409