1/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4 *
5 *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 *     for Intel's performance analysis of the MMX vs. non-MMX code.
8 *
9 * libpng version 1.2.7 - September 12, 2004
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
13 *
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
17 *
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19 *
20 *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21 *
22 * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23 *
24 * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 *        is required to assemble the newer MMX instructions such as movq.
26 *        For djgpp, see
27 *
28 *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29 *
30 *        (or a later version in the same directory).  For Linux, check your
31 *        distribution's web site(s) or try these links:
32 *
33 *           http://rufus.w3.org/linux/RPM/binutils.html
34 *           http://www.debian.org/Packages/stable/devel/binutils.html
35 *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36 *             binutils.tgz
37 *
38 *        For other platforms, see the main GNU site:
39 *
40 *           ftp://ftp.gnu.org/pub/gnu/binutils/
41 *
42 *        Version 2.5.2l.15 is definitely too old...
43 */
44
45/*
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
48 *
49 * 19991006:
50 *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51 *
52 * 19991007:
53 *  - additional optimizations (possible or definite):
54 *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 *     - write MMX code for 48-bit case (pixel_bytes == 6)
56 *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57 *        why subtract 8 from width_mmx in the pass 4/5 case?
58 *        (only width_mmx case) (near line 1606)
59 *     x [DONE] replace pixel_bytes within each block with the true
60 *        constant value (or are compilers smart enough to do that?)
61 *     - rewrite all MMX interlacing code so it's aligned with
62 *        the *beginning* of the row buffer, not the end.  This
63 *        would not only allow one to eliminate half of the memory
64 *        writes for odd passes (that is, pass == odd), it may also
65 *        eliminate some unaligned-data-access exceptions (assuming
66 *        there's a penalty for not aligning 64-bit accesses on
67 *        64-bit boundaries).  The only catch is that the "leftover"
68 *        pixel(s) at the end of the row would have to be saved,
69 *        but there are enough unused MMX registers in every case,
70 *        so this is not a problem.  A further benefit is that the
71 *        post-MMX cleanup code (C code) in at least some of the
72 *        cases could be done within the assembler block.
73 *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 *     inconsistent, and don't match the MMX Programmer's Reference
75 *     Manual conventions anyway.  They should be changed to
76 *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 *     was lowest in memory (e.g., corresponding to a left pixel)
78 *     and b7 is the byte that was highest (e.g., a right pixel).
79 *
80 * 19991016:
81 *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 *     want globals prefixed by underscores when referencing them--
83 *     i.e., if the variable is const4, then refer to it as const4,
84 *     not _const4.  This seems to be a djgpp-specific requirement.
85 *     Also, such variables apparently *must* be declared outside
86 *     of functions; neither static nor automatic variables work if
87 *     defined within the scope of a single function, but both
88 *     static and truly global (multi-module) variables work fine.
89 *
90 * 19991023:
91 *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 *  - switched from string-concatenation-with-macros to cleaner method of
93 *     renaming global variables for djgpp--i.e., always use prefixes in
94 *     inlined assembler code (== strings) and conditionally rename the
95 *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96 *
97 * 19991024:
98 *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
99 *     This one was severely weird:  even though mmxsupport() doesn't touch
100 *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101 *     the register (even in static/non-fPIC code--see below), which in turn
102 *     caused png_do_read_interlace() to return prematurely on the first row of
103 *     interlaced images (i.e., without expanding the interlaced pixels).
104 *     Inspection of the generated assembly code didn't turn up any clues,
105 *     although it did point at a minor optimization (i.e., get rid of
106 *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107 *     instruction is more destructive than it looks?  (Not yet checked.)
108 *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 *     listings...  Apparently register spillage has to do with ebx, since
110 *     it's used to index the global offset table.  Commenting it out of the
111 *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112 *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113 *
114 * 19991107:
115 *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116 *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117 *
118 * 19991120:
119 *  - made "diff" variable (now "_dif") global to simplify conversion of
120 *     filtering routines (running out of regs, sigh).  "diff" is still used
121 *     in interlacing routines, however.
122 *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 *     macro determines which is used); original not yet tested.
124 *
125 * 20000213:
126 *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127 *
128 * 20000319:
129 *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 *     pass == 4 or 5, that caused visible corruption of interlaced images
131 *
132 * 20000623:
133 *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 *     Chuck Wilson supplied a patch involving dummy output registers.  See
137 *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 *     for the original (anonymous) SourceForge bug report.
139 *
140 * 20000706:
141 *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 *       pnggccrd.c: In function `png_combine_row':
143 *       pnggccrd.c:525: more than 10 operands in `asm'
144 *       pnggccrd.c:669: more than 10 operands in `asm'
145 *       pnggccrd.c:828: more than 10 operands in `asm'
146 *       pnggccrd.c:994: more than 10 operands in `asm'
147 *       pnggccrd.c:1177: more than 10 operands in `asm'
148 *     They are all the same problem and can be worked around by using the
149 *     global _unmask variable unconditionally, not just in the -fPIC case.
150 *     Reportedly earlier versions of gcc also have the problem with more than
151 *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152 *
153 * 20000729:
154 *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 *     MMX routine); began converting png_read_filter_row_mmx_sub()
156 *  - to finish remaining sections:
157 *     - clean up indentation and comments
158 *     - preload local variables
159 *     - add output and input regs (order of former determines numerical
160 *        mapping of latter)
161 *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 *     - remove "$" from addressing of Shift and Mask variables [20000823]
163 *
164 * 20000731:
165 *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166 *
167 * 20000822:
168 *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 *     shared-library (-fPIC) version!  Code works just fine as part of static
170 *     library.  Damn damn damn damn damn, should have tested that sooner.
171 *     ebx is getting clobbered again (explicitly this time); need to save it
172 *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173 *
174 * 20000823:
175 *  - first section was trickiest; all remaining sections have ebx -> edx now.
176 *     (-fPIC works again.)  Also added missing underscores to various Shift*
177 *     and *Mask* globals and got rid of leading "$" signs.
178 *
179 * 20000826:
180 *  - added visual separators to help navigate microscopic printed copies
181 *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 *     on png_read_filter_row_mmx_avg()
183 *
184 * 20000828:
185 *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186 *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187 *     cleaned up/shortened in either routine, but functionality is complete
188 *     and seems to be working fine.
189 *
190 * 20000829:
191 *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192 *     as an input reg (with dummy output variables, etc.), then it *cannot*
193 *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194 *     is simple enough...
195 *
196 * 20000914:
197 *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198 *     correctly (but 48-bit RGB just fine)
199 *
200 * 20000916:
201 *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203 *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204 *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205 *
206 * 20010101:
207 *  - added new png_init_mmx_flags() function (here only because it needs to
208 *     call mmxsupport(), which should probably become global png_mmxsupport());
209 *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
210 *
211 * 20010103:
212 *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213 *     and made it public; moved png_init_mmx_flags() to png.c as internal func
214 *
215 * 20010104:
216 *  - removed dependency on png_read_filter_row_c() (C code already duplicated
217 *     within MMX version of png_read_filter_row()) so no longer necessary to
218 *     compile it into pngrutil.o
219 *
220 * 20010310:
221 *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222 *
223 * 20020304:
224 *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
225 *
226 * 20040724:
227 *   - more tinkering with clobber list at lines 4529 and 5033, to get
228 *     it to compile on gcc-3.4.
229 *
230 * STILL TO DO:
231 *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232 *     - write MMX code for 48-bit case (pixel_bytes == 6)
233 *     - figure out what's up with 24-bit case (pixel_bytes == 3):
234 *        why subtract 8 from width_mmx in the pass 4/5 case?
235 *        (only width_mmx case) (near line 1606)
236 *     - rewrite all MMX interlacing code so it's aligned with beginning
237 *        of the row buffer, not the end (see 19991007 for details)
238 *     x pick one version of mmxsupport() and get rid of the other
239 *     - add error messages to any remaining bogus default cases
240 *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241 *     x add support for runtime enable/disable/query of various MMX routines
242 */
243
/* PNG_INTERNAL is defined before png.h is included; presumably this is what
 * makes png.h expose libpng's internal (non-API) declarations to this file
 * -- TODO confirm in png.h. */
244#define PNG_INTERNAL
245#include "png.h"
246
/* The code below is compiled only when PNG_USE_PNGGCCRD is defined. */
247#if defined(PNG_USE_PNGGCCRD)
248
/* Forward declaration.  png_combine_row() calls this routine when
 * _mmx_supported still holds its initial value of 2. */
249int PNGAPI png_mmx_support(void);
250
/* Per-interlace-pass tables with seven entries each, indexed by
 * png_ptr->pass.  The non-MMX 8-bit branch of png_combine_row() uses them as:
 *   png_pass_start - offset of the first position copied within the row
 *   png_pass_inc   - stride between successive copies
 *   png_pass_width - number of bytes copied at each position
 * They are defined locally only when PNG_USE_LOCAL_ARRAYS is set; otherwise
 * they must come from elsewhere (the notes in png_combine_row() cite png.c).
 */
251#ifdef PNG_USE_LOCAL_ARRAYS
252static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
253static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
254static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
255#endif
256
/* Symbol-name plumbing for the inline assembler.  The asm strings in this
 * file refer to globals by their underscore-prefixed names (for example
 * "_unmask" and "_mask8_0").  On toolchains that prepend an underscore to C
 * symbols themselves, each C-level identifier is remapped to the spelling
 * without the underscore so that the emitted symbol matches the asm text. */
257#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258/* djgpp, Win32, and Cygwin add their own underscores to global variables,
259 * so define them without: */
260#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261#  define _mmx_supported  mmx_supported
262#  define _const4         const4
263#  define _const6         const6
264#  define _mask8_0        mask8_0
265#  define _mask16_1       mask16_1
266#  define _mask16_0       mask16_0
267#  define _mask24_2       mask24_2
268#  define _mask24_1       mask24_1
269#  define _mask24_0       mask24_0
270#  define _mask32_3       mask32_3
271#  define _mask32_2       mask32_2
272#  define _mask32_1       mask32_1
273#  define _mask32_0       mask32_0
274#  define _mask48_5       mask48_5
275#  define _mask48_4       mask48_4
276#  define _mask48_3       mask48_3
277#  define _mask48_2       mask48_2
278#  define _mask48_1       mask48_1
279#  define _mask48_0       mask48_0
/* NOTE(review): the variables behind the next seven names are not defined
 * near the other constants; presumably they are declared alongside the
 * row-filter routines further down the file -- confirm. */
280#  define _LBCarryMask    LBCarryMask
281#  define _HBClearMask    HBClearMask
282#  define _ActiveMask     ActiveMask
283#  define _ActiveMask2    ActiveMask2
284#  define _ActiveMaskEnd  ActiveMaskEnd
285#  define _ShiftBpp       ShiftBpp
286#  define _ShiftRem       ShiftRem
/* The remaining names belong to the mutable globals that are defined only in
 * PNG_THREAD_UNSAFE_OK builds, so they are renamed under the same guard. */
287#ifdef PNG_THREAD_UNSAFE_OK
288#  define _unmask         unmask
289#  define _FullLength     FullLength
290#  define _MMXLength      MMXLength
291#  define _dif            dif
292#  define _patemp         patemp
293#  define _pbtemp         pbtemp
294#  define _pctemp         pctemp
295#endif
296#endif
297
298
299/* These constants are used in the inlined MMX assembly code.
300   Ignore gcc's "At top level: defined but not used" warnings. */
301
302/* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
303 *  since that case uses the %ebx register for indexing the Global Offset Table
304 *  and there were no other registers available.  But gcc 2.95 and later emit
305 *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306 *  in the non-PIC case, so we'll just use the global unconditionally now.
307 */
308#ifdef PNG_THREAD_UNSAFE_OK
/* png_combine_row() stores ~mask here immediately before each MMX block,
 * and the asm reads it back with "movd _unmask, %%mm7".  The variable exists
 * only in PNG_THREAD_UNSAFE_OK builds, the same condition that guards the
 * MMX branches that use it. */
309static int _unmask;
310#endif
311
/* Byte-lane selector tables for png_combine_row().  _mask<depth>_<k> is the
 * k-th quadword (k = 0 comes first in memory) covering a group of eight
 * pixels of <depth> bits each.  Every byte holds the single bit of the 8-bit
 * pass mask that governs the pixel the byte belongs to:  0x80 for the first
 * pixel of the group down to 0x01 for the eighth (the lowest-order byte of
 * each constant corresponds to the lowest address).  The 8- and 16-bit asm
 * ANDs these tables with the byte-replicated _unmask and compares the result
 * against zero, which yields an all-ones byte wherever the new row's byte is
 * to be taken and an all-zeros byte wherever the old byte is kept.
 * NOTE(review): the 24-, 32- and 48-bit tables follow the same value pattern;
 * their use is presumed to be analogous -- confirm in the matching cases. */
312static unsigned long long _mask8_0  = 0x0102040810204080LL;
313
314static unsigned long long _mask16_1 = 0x0101020204040808LL;
315static unsigned long long _mask16_0 = 0x1010202040408080LL;
316
317static unsigned long long _mask24_2 = 0x0101010202020404LL;
318static unsigned long long _mask24_1 = 0x0408080810101020LL;
319static unsigned long long _mask24_0 = 0x2020404040808080LL;
320
321static unsigned long long _mask32_3 = 0x0101010102020202LL;
322static unsigned long long _mask32_2 = 0x0404040408080808LL;
323static unsigned long long _mask32_1 = 0x1010101020202020LL;
324static unsigned long long _mask32_0 = 0x4040404080808080LL;
325
326static unsigned long long _mask48_5 = 0x0101010101010202LL;
327static unsigned long long _mask48_4 = 0x0202020204040404LL;
328static unsigned long long _mask48_3 = 0x0404080808080808LL;
329static unsigned long long _mask48_2 = 0x1010101010102020LL;
330static unsigned long long _mask48_1 = 0x2020202040404040LL;
331static unsigned long long _mask48_0 = 0x4040808080808080LL;
332
/* Low-order byte masks:  _const4 keeps the three lowest bytes of a quadword
 * and _const6 keeps only the lowest byte. */
333static unsigned long long _const4   = 0x0000000000FFFFFFLL;
334/* static unsigned long long _const5 = 0x000000FFFFFF0000LL; */  /* NOT USED */
335static unsigned long long _const6   = 0x00000000000000FFLL;
336
337/* These are used in the row-filter routines and should/would be local */
338/*  variables if not for gcc addressing limitations. */
339/* WARNING: Their presence probably defeats the thread safety of libpng. */
340
/* Because of that warning, the variables below are compiled in only when the
 * builder has explicitly accepted it by defining PNG_THREAD_UNSAFE_OK. */
341#ifdef PNG_THREAD_UNSAFE_OK
342static png_uint_32  _FullLength;
343static png_uint_32  _MMXLength;
344static int          _dif;
345static int          _patemp; /* temp variables for Paeth routine */
346static int          _pbtemp;
347static int          _pctemp;
348#endif
349
350void /* PRIVATE */
351png_squelch_warnings(void)
352{
353#ifdef PNG_THREAD_UNSAFE_OK
354   _dif = _dif;
355   _patemp = _patemp;
356   _pbtemp = _pbtemp;
357   _pctemp = _pctemp;
358   _MMXLength = _MMXLength;
359#endif
360   _const4  = _const4;
361   _const6  = _const6;
362   _mask8_0  = _mask8_0;
363   _mask16_1 = _mask16_1;
364   _mask16_0 = _mask16_0;
365   _mask24_2 = _mask24_2;
366   _mask24_1 = _mask24_1;
367   _mask24_0 = _mask24_0;
368   _mask32_3 = _mask32_3;
369   _mask32_2 = _mask32_2;
370   _mask32_1 = _mask32_1;
371   _mask32_0 = _mask32_0;
372   _mask48_5 = _mask48_5;
373   _mask48_4 = _mask48_4;
374   _mask48_3 = _mask48_3;
375   _mask48_2 = _mask48_2;
376   _mask48_1 = _mask48_1;
377   _mask48_0 = _mask48_0;
378}
379#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
380
381
/* Cached MMX-capability state.  The initial value 2 acts as a "not yet
 * determined" marker:  png_combine_row() tests for it and, when it is still
 * set, calls png_mmx_support().
 * NOTE(review): the changelog says png_mmx_support() auto-sets this variable;
 * the other values are presumed to be 0 (no MMX) and 1 (MMX available),
 * since the PNG_1_0_X path tests it as a plain boolean -- confirm against
 * png_mmx_support(). */
382static int _mmx_supported = 2;
383
384/*===========================================================================*/
385/*                                                                           */
386/*                       P N G _ C O M B I N E _ R O W                       */
387/*                                                                           */
388/*===========================================================================*/
389
/* The definition of png_combine_row() that follows is compiled only when
 * PNG_HAVE_ASSEMBLER_COMBINE_ROW is defined. */
390#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
391
/* Named bytes-per-pixel constants, one per multi-byte pixel size.  Per the
 * note on BPP6, they exist so that a value copied between the
 * pixel-size-specific code paths without being updated is easier to spot.
 * NOTE(review): presumably used by the 16/24/32/48/64-bit cases of
 * png_combine_row() -- confirm. */
392#define BPP2  2
393#define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
394#define BPP4  4
395#define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
396#define BPP8  8
397
398/* Combines the row recently read in with the previous row.
399   This routine takes care of alpha and transparency if requested.
400   This routine also handles the two methods of progressive display
401   of interlaced images, depending on the mask value.
402   The mask value describes which pixels are to be combined with
403   the row.  The pattern always repeats every 8 pixels, so just 8
404   bits are needed.  A one indicates the pixel is to be combined; a
405   zero indicates the pixel is to be skipped.  This is in addition
406   to any alpha or transparency value associated with the pixel.
407   If you want all pixels to be combined, pass 0xff (255) in mask. */
408
409/* Use this routine for the x86 platform - it uses a faster MMX routine
410   if the machine supports MMX. */
411
412void /* PRIVATE */
413png_combine_row(png_structp png_ptr, png_bytep row, int mask)
414{
415   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
416
417#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418   if (_mmx_supported == 2) {
419#if !defined(PNG_1_0_X)
420       /* this should have happened in png_init_mmx_flags() already */
421       png_warning(png_ptr, "asm_flags may not have been initialized");
422#endif
423       png_mmx_support();
424   }
425#endif
426
427   if (mask == 0xff)
428   {
429      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
430      png_memcpy(row, png_ptr->row_buf + 1,
431       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
432   }
433   else   /* (png_combine_row() is never called with mask == 0) */
434   {
435      switch (png_ptr->row_info.pixel_depth)
436      {
437         case 1:        /* png_ptr->row_info.pixel_depth */
438         {
439            png_bytep sp;
440            png_bytep dp;
441            int s_inc, s_start, s_end;
442            int m;
443            int shift;
444            png_uint_32 i;
445
446            sp = png_ptr->row_buf + 1;
447            dp = row;
448            m = 0x80;
449#if defined(PNG_READ_PACKSWAP_SUPPORTED)
450            if (png_ptr->transformations & PNG_PACKSWAP)
451            {
452                s_start = 0;
453                s_end = 7;
454                s_inc = 1;
455            }
456            else
457#endif
458            {
459                s_start = 7;
460                s_end = 0;
461                s_inc = -1;
462            }
463
464            shift = s_start;
465
466            for (i = 0; i < png_ptr->width; i++)
467            {
468               if (m & mask)
469               {
470                  int value;
471
472                  value = (*sp >> shift) & 0x1;
473                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
474                  *dp |= (png_byte)(value << shift);
475               }
476
477               if (shift == s_end)
478               {
479                  shift = s_start;
480                  sp++;
481                  dp++;
482               }
483               else
484                  shift += s_inc;
485
486               if (m == 1)
487                  m = 0x80;
488               else
489                  m >>= 1;
490            }
491            break;
492         }
493
494         case 2:        /* png_ptr->row_info.pixel_depth */
495         {
496            png_bytep sp;
497            png_bytep dp;
498            int s_start, s_end, s_inc;
499            int m;
500            int shift;
501            png_uint_32 i;
502            int value;
503
504            sp = png_ptr->row_buf + 1;
505            dp = row;
506            m = 0x80;
507#if defined(PNG_READ_PACKSWAP_SUPPORTED)
508            if (png_ptr->transformations & PNG_PACKSWAP)
509            {
510               s_start = 0;
511               s_end = 6;
512               s_inc = 2;
513            }
514            else
515#endif
516            {
517               s_start = 6;
518               s_end = 0;
519               s_inc = -2;
520            }
521
522            shift = s_start;
523
524            for (i = 0; i < png_ptr->width; i++)
525            {
526               if (m & mask)
527               {
528                  value = (*sp >> shift) & 0x3;
529                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
530                  *dp |= (png_byte)(value << shift);
531               }
532
533               if (shift == s_end)
534               {
535                  shift = s_start;
536                  sp++;
537                  dp++;
538               }
539               else
540                  shift += s_inc;
541               if (m == 1)
542                  m = 0x80;
543               else
544                  m >>= 1;
545            }
546            break;
547         }
548
549         case 4:        /* png_ptr->row_info.pixel_depth */
550         {
551            png_bytep sp;
552            png_bytep dp;
553            int s_start, s_end, s_inc;
554            int m;
555            int shift;
556            png_uint_32 i;
557            int value;
558
559            sp = png_ptr->row_buf + 1;
560            dp = row;
561            m = 0x80;
562#if defined(PNG_READ_PACKSWAP_SUPPORTED)
563            if (png_ptr->transformations & PNG_PACKSWAP)
564            {
565               s_start = 0;
566               s_end = 4;
567               s_inc = 4;
568            }
569            else
570#endif
571            {
572               s_start = 4;
573               s_end = 0;
574               s_inc = -4;
575            }
576            shift = s_start;
577
578            for (i = 0; i < png_ptr->width; i++)
579            {
580               if (m & mask)
581               {
582                  value = (*sp >> shift) & 0xf;
583                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
584                  *dp |= (png_byte)(value << shift);
585               }
586
587               if (shift == s_end)
588               {
589                  shift = s_start;
590                  sp++;
591                  dp++;
592               }
593               else
594                  shift += s_inc;
595               if (m == 1)
596                  m = 0x80;
597               else
598                  m >>= 1;
599            }
600            break;
601         }
602
603         case 8:        /* png_ptr->row_info.pixel_depth */
604         {
605            png_bytep srcptr;
606            png_bytep dstptr;
607
608#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609#if !defined(PNG_1_0_X)
610            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
611                /* && _mmx_supported */ )
612#else
613            if (_mmx_supported)
614#endif
615            {
616               png_uint_32 len;
617               int diff;
618               int dummy_value_a;   /* fix 'forbidden register spilled' error */
619               int dummy_value_d;
620               int dummy_value_c;
621               int dummy_value_S;
622               int dummy_value_D;
623               _unmask = ~mask;            /* global variable for -fPIC version */
624               srcptr = png_ptr->row_buf + 1;
625               dstptr = row;
626               len  = png_ptr->width &~7;  /* reduce to multiple of 8 */
627               diff = (int) (png_ptr->width & 7);  /* amount lost */
628
629               __asm__ __volatile__ (
630                  "movd      _unmask, %%mm7  \n\t" /* load bit pattern */
631                  "psubb     %%mm6, %%mm6    \n\t" /* zero mm6 */
632                  "punpcklbw %%mm7, %%mm7    \n\t"
633                  "punpcklwd %%mm7, %%mm7    \n\t"
634                  "punpckldq %%mm7, %%mm7    \n\t" /* fill reg with 8 masks */
635
636                  "movq      _mask8_0, %%mm0 \n\t"
637                  "pand      %%mm7, %%mm0    \n\t" /* nonzero if keep byte */
638                  "pcmpeqb   %%mm6, %%mm0    \n\t" /* zeros->1s, v versa */
639
640/* preload        "movl      len, %%ecx      \n\t" // load length of line */
641/* preload        "movl      srcptr, %%esi   \n\t" // load source */
642/* preload        "movl      dstptr, %%edi   \n\t" // load dest */
643
644                  "cmpl      $0, %%ecx       \n\t" /* len == 0 ? */
645                  "je        mainloop8end    \n\t"
646
647                "mainloop8:                  \n\t"
648                  "movq      (%%esi), %%mm4  \n\t" /* *srcptr */
649                  "pand      %%mm0, %%mm4    \n\t"
650                  "movq      %%mm0, %%mm6    \n\t"
651                  "pandn     (%%edi), %%mm6  \n\t" /* *dstptr */
652                  "por       %%mm6, %%mm4    \n\t"
653                  "movq      %%mm4, (%%edi)  \n\t"
654                  "addl      $8, %%esi       \n\t" /* inc by 8 bytes processed */
655                  "addl      $8, %%edi       \n\t"
656                  "subl      $8, %%ecx       \n\t" /* dec by 8 pixels processed */
657                  "ja        mainloop8       \n\t"
658
659                "mainloop8end:               \n\t"
660/* preload        "movl      diff, %%ecx     \n\t" // (diff is in eax) */
661                  "movl      %%eax, %%ecx    \n\t"
662                  "cmpl      $0, %%ecx       \n\t"
663                  "jz        end8            \n\t"
664/* preload        "movl      mask, %%edx     \n\t" */
665                  "sall      $24, %%edx      \n\t" /* make low byte, high byte */
666
667                "secondloop8:                \n\t"
668                  "sall      %%edx           \n\t" /* move high bit to CF */
669                  "jnc       skip8           \n\t" /* if CF = 0 */
670                  "movb      (%%esi), %%al   \n\t"
671                  "movb      %%al, (%%edi)   \n\t"
672
673                "skip8:                      \n\t"
674                  "incl      %%esi           \n\t"
675                  "incl      %%edi           \n\t"
676                  "decl      %%ecx           \n\t"
677                  "jnz       secondloop8     \n\t"
678
679                "end8:                       \n\t"
680                  "EMMS                      \n\t"  /* DONE */
681
682                  : "=a" (dummy_value_a),           /* output regs (dummy) */
683                    "=d" (dummy_value_d),
684                    "=c" (dummy_value_c),
685                    "=S" (dummy_value_S),
686                    "=D" (dummy_value_D)
687
688                  : "3" (srcptr),      /* esi       // input regs */
689                    "4" (dstptr),      /* edi */
690                    "0" (diff),        /* eax */
691/* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
692                    "2" (len),         /* ecx */
693                    "1" (mask)         /* edx */
694
695#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696                  : "%mm0", "%mm4", "%mm6", "%mm7"  /* clobber list */
697#endif
698               );
699            }
700            else /* mmx _not supported - Use modified C routine */
701#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
702            {
703               register png_uint_32 i;
704               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
705                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706               register int stride = png_pass_inc[png_ptr->pass];
707                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708               register int rep_bytes = png_pass_width[png_ptr->pass];
709                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
711               int diff = (int) (png_ptr->width & 7); /* amount lost */
712               register png_uint_32 final_val = len;  /* GRR bugfix */
713
714               srcptr = png_ptr->row_buf + 1 + initial_val;
715               dstptr = row + initial_val;
716
717               for (i = initial_val; i < final_val; i += stride)
718               {
719                  png_memcpy(dstptr, srcptr, rep_bytes);
720                  srcptr += stride;
721                  dstptr += stride;
722               }
723               if (diff)  /* number of leftover pixels:  3 for pngtest */
724               {
725                  final_val+=diff /* *BPP1 */ ;
726                  for (; i < final_val; i += stride)
727                  {
728                     if (rep_bytes > (int)(final_val-i))
729                        rep_bytes = (int)(final_val-i);
730                     png_memcpy(dstptr, srcptr, rep_bytes);
731                     srcptr += stride;
732                     dstptr += stride;
733                  }
734               }
735
736            } /* end of else (_mmx_supported) */
737
738            break;
739         }       /* end 8 bpp */
740
741         case 16:       /* png_ptr->row_info.pixel_depth */
742         {
               /* 16 bits per pixel.  Copies pixels from the row stored at
                * png_ptr->row_buf + 1 into `row`.  The MMX branch selects the
                * pixels to copy from the bits of `mask`; the C branch does not
                * read `mask` and instead walks the row using the png_pass_start,
                * png_pass_inc and png_pass_width entries for png_ptr->pass.
                */
743            png_bytep srcptr;
744            png_bytep dstptr;
745
746#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747#if !defined(PNG_1_0_X)
748            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
749                /* && _mmx_supported */ )
750#else
751            if (_mmx_supported)
752#endif
753            {
754               png_uint_32 len;
755               int diff;
                  /* The dummy_value_* variables are never read.  They exist so
                   * that eax/ecx/edx/esi/edi can be declared as asm outputs and
                   * the real inputs tied to them with matching constraints.
                   */
756               int dummy_value_a;   /* fix 'forbidden register spilled' error */
757               int dummy_value_d;
758               int dummy_value_c;
759               int dummy_value_S;
760               int dummy_value_D;
761               _unmask = ~mask;            /* global variable for -fPIC version */
762               srcptr = png_ptr->row_buf + 1;
763               dstptr = row;
764               len  = png_ptr->width &~7;  /* pixels handled 8 at a time below */
765               diff = (int) (png_ptr->width & 7); /* 0..7 pixels left over */
766
                  /* Registers on entry (see the constraints after the code):
                   *   eax = diff, ecx = len, edx = mask, esi = srcptr, edi = dstptr.
                   * mm0/mm1 are turned into per-byte select masks for bytes 0-7
                   * and 8-15 of every 16-byte (8-pixel) group; the main loop then
                   * stores (source & select) | (destination & ~select).
                   * _mask16_0/_mask16_1 are defined elsewhere in the file;
                   * presumably one selector bit per pixel -- TODO confirm.
                   * NOTE(review): no "memory" clobber is listed although the asm
                   * stores through edi, and the labels are plain (non-local)
                   * names -- confirm both are acceptable for the target compilers.
                   */
767               __asm__ __volatile__ (
768                  "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
769                  "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
770                  "punpcklbw %%mm7, %%mm7     \n\t" /* duplicate the low bytes, */
771                  "punpcklwd %%mm7, %%mm7     \n\t" /* then the low words, */
772                  "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
773
774                  "movq      _mask16_0, %%mm0 \n\t" /* constants for bytes 0-7 */
775                  "movq      _mask16_1, %%mm1 \n\t" /* constants for bytes 8-15 */
776
777                  "pand      %%mm7, %%mm0     \n\t" /* AND with replicated ~mask */
778                  "pand      %%mm7, %%mm1     \n\t"
779
780                  "pcmpeqb   %%mm6, %%mm0     \n\t" /* 0xFF where the AND gave 0 */
781                  "pcmpeqb   %%mm6, %%mm1     \n\t"
782
783/* preload        "movl      len, %%ecx       \n\t" // load length of line */
784/* preload        "movl      srcptr, %%esi    \n\t" // load source */
785/* preload        "movl      dstptr, %%edi    \n\t" // load dest */
786
787                  "cmpl      $0, %%ecx        \n\t" /* fewer than 8 pixels? */
788                  "jz        mainloop16end    \n\t"
789
790                "mainloop16:                  \n\t"
791                  "movq      (%%esi), %%mm4   \n\t" /* source bytes 0-7 */
792                  "pand      %%mm0, %%mm4     \n\t" /* keep selected source bytes */
793                  "movq      %%mm0, %%mm6     \n\t"
794                  "movq      (%%edi), %%mm7   \n\t" /* destination bytes 0-7 */
795                  "pandn     %%mm7, %%mm6     \n\t" /* unselected dest bytes */
796                  "por       %%mm6, %%mm4     \n\t" /* merge */
797                  "movq      %%mm4, (%%edi)   \n\t"
798
799                  "movq      8(%%esi), %%mm5  \n\t" /* same for bytes 8-15 */
800                  "pand      %%mm1, %%mm5     \n\t"
801                  "movq      %%mm1, %%mm7     \n\t"
802                  "movq      8(%%edi), %%mm6  \n\t"
803                  "pandn     %%mm6, %%mm7     \n\t"
804                  "por       %%mm7, %%mm5     \n\t"
805                  "movq      %%mm5, 8(%%edi)  \n\t"
806
807                  "addl      $16, %%esi       \n\t" /* inc by 16 bytes processed */
808                  "addl      $16, %%edi       \n\t"
809                  "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
810                  "ja        mainloop16       \n\t"
811
812                "mainloop16end:               \n\t"
813/* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
814                  "movl      %%eax, %%ecx     \n\t" /* ecx = leftover pixel count */
815                  "cmpl      $0, %%ecx        \n\t"
816                  "jz        end16            \n\t" /* nothing left over */
817/* preload        "movl      mask, %%edx      \n\t" */
818                  "sall      $24, %%edx       \n\t" /* make low byte, high byte */
819
820                "secondloop16:                \n\t"
821                  "sall      %%edx            \n\t" /* move high bit to CF */
822                  "jnc       skip16           \n\t" /* if CF = 0 */
823                  "movw      (%%esi), %%ax    \n\t" /* copy one 2-byte pixel */
824                  "movw      %%ax, (%%edi)    \n\t"
825
826                "skip16:                      \n\t"
827                  "addl      $2, %%esi        \n\t" /* next pixel */
828                  "addl      $2, %%edi        \n\t"
829                  "decl      %%ecx            \n\t"
830                  "jnz       secondloop16     \n\t"
831
832                "end16:                       \n\t"
833                  "EMMS                       \n\t" /* DONE */
834
835                  : "=a" (dummy_value_a),           /* output regs (dummy) */
836                    "=c" (dummy_value_c),
837                    "=d" (dummy_value_d),
838                    "=S" (dummy_value_S),
839                    "=D" (dummy_value_D)
840
841                  : "0" (diff),        /* eax       // input regs */
842/* was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx */
843                    "1" (len),         /* ecx */
844                    "2" (mask),        /* edx */
845                    "3" (srcptr),      /* esi */
846                    "4" (dstptr)       /* edi */
847
848#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849                  : "%mm0", "%mm1", "%mm4"          /* clobber list */
850                  , "%mm5", "%mm6", "%mm7"
851#endif
852               );
853            }
854            else /* mmx _not supported - Use modified C routine */
855#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
856            {
                  /* All quantities below are byte counts; BPP2 (defined elsewhere,
                   * presumably 2 for this depth) scales pixels to bytes.
                   */
857               register png_uint_32 i;
858               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
859                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
861                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
863                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
865               int diff = (int) (png_ptr->width & 7); /* amount lost */
866               register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
867
868               srcptr = png_ptr->row_buf + 1 + initial_val;
869               dstptr = row + initial_val;
870
                  /* Whole 8-pixel groups: copy rep_bytes at every stride. */
871               for (i = initial_val; i < final_val; i += stride)
872               {
873                  png_memcpy(dstptr, srcptr, rep_bytes);
874                  srcptr += stride;
875                  dstptr += stride;
876               }
                  /* Trailing pixels: continue the same walk up to the true end
                   * of the row, shrinking the copy so it never runs past it.
                   */
877               if (diff)  /* number of leftover pixels:  3 for pngtest */
878               {
879                  final_val+=diff*BPP2;
880                  for (; i < final_val; i += stride)
881                  {
882                     if (rep_bytes > (int)(final_val-i))
883                        rep_bytes = (int)(final_val-i);
884                     png_memcpy(dstptr, srcptr, rep_bytes);
885                     srcptr += stride;
886                     dstptr += stride;
887                  }
888               }
889            } /* end of else (_mmx_supported) */
890
891            break;
892         }       /* end 16 bpp */
893
894         case 24:       /* png_ptr->row_info.pixel_depth */
895         {
               /* 24 bits per pixel.  Copies pixels from the row stored at
                * png_ptr->row_buf + 1 into `row`.  The MMX branch selects the
                * pixels to copy from the bits of `mask`; the C branch does not
                * read `mask` and instead walks the row using the png_pass_start,
                * png_pass_inc and png_pass_width entries for png_ptr->pass.
                */
896            png_bytep srcptr;
897            png_bytep dstptr;
898
899#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900#if !defined(PNG_1_0_X)
901            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
902                /* && _mmx_supported */ )
903#else
904            if (_mmx_supported)
905#endif
906            {
907               png_uint_32 len;
908               int diff;
                  /* The dummy_value_* variables are never read.  They exist so
                   * that eax/edx/ecx/esi/edi can be declared as asm outputs and
                   * the real inputs tied to them with matching constraints.
                   */
909               int dummy_value_a;   /* fix 'forbidden register spilled' error */
910               int dummy_value_d;
911               int dummy_value_c;
912               int dummy_value_S;
913               int dummy_value_D;
914               _unmask = ~mask;            /* global variable for -fPIC version */
915               srcptr = png_ptr->row_buf + 1;
916               dstptr = row;
917               len  = png_ptr->width &~7;  /* pixels handled 8 at a time below */
918               diff = (int) (png_ptr->width & 7); /* 0..7 pixels left over */
919
                  /* Registers on entry (see the constraints after the code):
                   *   eax = diff, ecx = len, edx = mask, esi = srcptr, edi = dstptr.
                   * mm0/mm1/mm2 are turned into per-byte select masks for bytes
                   * 0-7, 8-15 and 16-23 of every 24-byte (8-pixel) group; the main
                   * loop stores (source & select) | (destination & ~select).
                   * _mask24_0.._mask24_2 are defined elsewhere in the file;
                   * presumably one selector bit per pixel -- TODO confirm.
                   * NOTE(review): no "memory" clobber is listed although the asm
                   * stores through edi, and the labels are plain (non-local)
                   * names -- confirm both are acceptable for the target compilers.
                   */
920               __asm__ __volatile__ (
921                  "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
922                  "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
923                  "punpcklbw %%mm7, %%mm7     \n\t" /* duplicate the low bytes, */
924                  "punpcklwd %%mm7, %%mm7     \n\t" /* then the low words, */
925                  "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
926
927                  "movq      _mask24_0, %%mm0 \n\t" /* constants for bytes 0-7 */
928                  "movq      _mask24_1, %%mm1 \n\t" /* constants for bytes 8-15 */
929                  "movq      _mask24_2, %%mm2 \n\t" /* constants for bytes 16-23 */
930
931                  "pand      %%mm7, %%mm0     \n\t" /* AND with replicated ~mask */
932                  "pand      %%mm7, %%mm1     \n\t"
933                  "pand      %%mm7, %%mm2     \n\t"
934
935                  "pcmpeqb   %%mm6, %%mm0     \n\t" /* 0xFF where the AND gave 0 */
936                  "pcmpeqb   %%mm6, %%mm1     \n\t"
937                  "pcmpeqb   %%mm6, %%mm2     \n\t"
938
939/* preload        "movl      len, %%ecx       \n\t" // load length of line */
940/* preload        "movl      srcptr, %%esi    \n\t" // load source */
941/* preload        "movl      dstptr, %%edi    \n\t" // load dest */
942
943                  "cmpl      $0, %%ecx        \n\t" /* fewer than 8 pixels? */
944                  "jz        mainloop24end    \n\t"
945
946                "mainloop24:                  \n\t"
947                  "movq      (%%esi), %%mm4   \n\t" /* source bytes 0-7 */
948                  "pand      %%mm0, %%mm4     \n\t" /* keep selected source bytes */
949                  "movq      %%mm0, %%mm6     \n\t"
950                  "movq      (%%edi), %%mm7   \n\t" /* destination bytes 0-7 */
951                  "pandn     %%mm7, %%mm6     \n\t" /* unselected dest bytes */
952                  "por       %%mm6, %%mm4     \n\t" /* merge */
953                  "movq      %%mm4, (%%edi)   \n\t"
954
955                  "movq      8(%%esi), %%mm5  \n\t" /* same for bytes 8-15 */
956                  "pand      %%mm1, %%mm5     \n\t"
957                  "movq      %%mm1, %%mm7     \n\t"
958                  "movq      8(%%edi), %%mm6  \n\t"
959                  "pandn     %%mm6, %%mm7     \n\t"
960                  "por       %%mm7, %%mm5     \n\t"
961                  "movq      %%mm5, 8(%%edi)  \n\t"
962
963                  "movq      16(%%esi), %%mm6 \n\t" /* same for bytes 16-23 */
964                  "pand      %%mm2, %%mm6     \n\t"
965                  "movq      %%mm2, %%mm4     \n\t"
966                  "movq      16(%%edi), %%mm7 \n\t"
967                  "pandn     %%mm7, %%mm4     \n\t"
968                  "por       %%mm4, %%mm6     \n\t"
969                  "movq      %%mm6, 16(%%edi) \n\t"
970
971                  "addl      $24, %%esi       \n\t" /* inc by 24 bytes processed */
972                  "addl      $24, %%edi       \n\t"
973                  "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
974
975                  "ja        mainloop24       \n\t"
976
977                "mainloop24end:               \n\t"
978/* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
979                  "movl      %%eax, %%ecx     \n\t" /* ecx = leftover pixel count */
980                  "cmpl      $0, %%ecx        \n\t"
981                  "jz        end24            \n\t" /* nothing left over */
982/* preload        "movl      mask, %%edx      \n\t" */
983                  "sall      $24, %%edx       \n\t" /* make low byte, high byte */
984
985                "secondloop24:                \n\t"
986                  "sall      %%edx            \n\t" /* move high bit to CF */
987                  "jnc       skip24           \n\t" /* if CF = 0 */
988                  "movw      (%%esi), %%ax    \n\t" /* copy bytes 0-1 of the pixel */
989                  "movw      %%ax, (%%edi)    \n\t"
990                  "xorl      %%eax, %%eax     \n\t" /* only al is used afterwards */
991                  "movb      2(%%esi), %%al   \n\t" /* copy byte 2 of the pixel */
992                  "movb      %%al, 2(%%edi)   \n\t"
993
994                "skip24:                      \n\t"
995                  "addl      $3, %%esi        \n\t" /* next pixel */
996                  "addl      $3, %%edi        \n\t"
997                  "decl      %%ecx            \n\t"
998                  "jnz       secondloop24     \n\t"
999
1000                "end24:                       \n\t"
1001                  "EMMS                       \n\t" /* DONE */
1002
1003                  : "=a" (dummy_value_a),           /* output regs (dummy) */
1004                    "=d" (dummy_value_d),
1005                    "=c" (dummy_value_c),
1006                    "=S" (dummy_value_S),
1007                    "=D" (dummy_value_D)
1008
1009                  : "3" (srcptr),      /* esi       // input regs */
1010                    "4" (dstptr),      /* edi */
1011                    "0" (diff),        /* eax */
1012/* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1013                    "2" (len),         /* ecx */
1014                    "1" (mask)         /* edx */
1015
1016#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017                  : "%mm0", "%mm1", "%mm2"          /* clobber list */
1018                  , "%mm4", "%mm5", "%mm6", "%mm7"
1019#endif
1020               );
1021            }
1022            else /* mmx _not supported - Use modified C routine */
1023#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1024            {
                   /* All quantities below are byte counts; BPP3 (defined elsewhere,
                    * presumably 3 for this depth) scales pixels to bytes.
                    */
1025               register png_uint_32 i;
1026               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1027                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1029                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1031                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1033               int diff = (int) (png_ptr->width & 7); /* amount lost */
1034               register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1035
1036               srcptr = png_ptr->row_buf + 1 + initial_val;
1037               dstptr = row + initial_val;
1038
                   /* Whole 8-pixel groups: copy rep_bytes at every stride. */
1039               for (i = initial_val; i < final_val; i += stride)
1040               {
1041                  png_memcpy(dstptr, srcptr, rep_bytes);
1042                  srcptr += stride;
1043                  dstptr += stride;
1044               }
                   /* Trailing pixels: continue the same walk up to the true end
                    * of the row, shrinking the copy so it never runs past it.
                    */
1045               if (diff)  /* number of leftover pixels:  3 for pngtest */
1046               {
1047                  final_val+=diff*BPP3;
1048                  for (; i < final_val; i += stride)
1049                  {
1050                     if (rep_bytes > (int)(final_val-i))
1051                        rep_bytes = (int)(final_val-i);
1052                     png_memcpy(dstptr, srcptr, rep_bytes);
1053                     srcptr += stride;
1054                     dstptr += stride;
1055                  }
1056               }
1057            } /* end of else (_mmx_supported) */
1058
1059            break;
1060         }       /* end 24 bpp */
1061
1062         case 32:       /* png_ptr->row_info.pixel_depth */
1063         {
                /* 32 bits per pixel.  Copies pixels from the row stored at
                 * png_ptr->row_buf + 1 into `row`.  The MMX branch selects the
                 * pixels to copy from the bits of `mask`; the C branch does not
                 * read `mask` and instead walks the row using the png_pass_start,
                 * png_pass_inc and png_pass_width entries for png_ptr->pass.
                 */
1064            png_bytep srcptr;
1065            png_bytep dstptr;
1066
1067#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068#if !defined(PNG_1_0_X)
1069            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1070                /* && _mmx_supported */ )
1071#else
1072            if (_mmx_supported)
1073#endif
1074            {
1075               png_uint_32 len;
1076               int diff;
                   /* The dummy_value_* variables are never read.  They exist so
                    * that eax/edx/ecx/esi/edi can be declared as asm outputs and
                    * the real inputs tied to them with matching constraints.
                    */
1077               int dummy_value_a;   /* fix 'forbidden register spilled' error */
1078               int dummy_value_d;
1079               int dummy_value_c;
1080               int dummy_value_S;
1081               int dummy_value_D;
1082               _unmask = ~mask;            /* global variable for -fPIC version */
1083               srcptr = png_ptr->row_buf + 1;
1084               dstptr = row;
1085               len  = png_ptr->width &~7;  /* pixels handled 8 at a time below */
1086               diff = (int) (png_ptr->width & 7); /* 0..7 pixels left over */
1087
                   /* Registers on entry (see the constraints after the code):
                    *   eax = diff, ecx = len, edx = mask, esi = srcptr, edi = dstptr.
                    * mm0..mm3 are turned into per-byte select masks for the four
                    * 8-byte quarters of every 32-byte (8-pixel) group; the main
                    * loop stores (source & select) | (destination & ~select).
                    * _mask32_0.._mask32_3 are defined elsewhere in the file;
                    * presumably one selector bit per pixel -- TODO confirm.
                    * NOTE(review): no "memory" clobber is listed although the asm
                    * stores through edi, and the labels are plain (non-local)
                    * names -- confirm both are acceptable for the target compilers.
                    */
1088               __asm__ __volatile__ (
1089                  "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
1090                  "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
1091                  "punpcklbw %%mm7, %%mm7     \n\t" /* duplicate the low bytes, */
1092                  "punpcklwd %%mm7, %%mm7     \n\t" /* then the low words, */
1093                  "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
1094
1095                  "movq      _mask32_0, %%mm0 \n\t" /* constants for bytes 0-7 */
1096                  "movq      _mask32_1, %%mm1 \n\t" /* constants for bytes 8-15 */
1097                  "movq      _mask32_2, %%mm2 \n\t" /* constants for bytes 16-23 */
1098                  "movq      _mask32_3, %%mm3 \n\t" /* constants for bytes 24-31 */
1099
1100                  "pand      %%mm7, %%mm0     \n\t" /* AND with replicated ~mask */
1101                  "pand      %%mm7, %%mm1     \n\t"
1102                  "pand      %%mm7, %%mm2     \n\t"
1103                  "pand      %%mm7, %%mm3     \n\t"
1104
1105                  "pcmpeqb   %%mm6, %%mm0     \n\t" /* 0xFF where the AND gave 0 */
1106                  "pcmpeqb   %%mm6, %%mm1     \n\t"
1107                  "pcmpeqb   %%mm6, %%mm2     \n\t"
1108                  "pcmpeqb   %%mm6, %%mm3     \n\t"
1109
1110/* preload        "movl      len, %%ecx       \n\t" // load length of line */
1111/* preload        "movl      srcptr, %%esi    \n\t" // load source */
1112/* preload        "movl      dstptr, %%edi    \n\t" // load dest */
1113
1114                  "cmpl      $0, %%ecx        \n\t" /* lcr */
1115                  "jz        mainloop32end    \n\t" /* fewer than 8 pixels */
1116
1117                "mainloop32:                  \n\t"
1118                  "movq      (%%esi), %%mm4   \n\t" /* source bytes 0-7 */
1119                  "pand      %%mm0, %%mm4     \n\t" /* keep selected source bytes */
1120                  "movq      %%mm0, %%mm6     \n\t"
1121                  "movq      (%%edi), %%mm7   \n\t" /* destination bytes 0-7 */
1122                  "pandn     %%mm7, %%mm6     \n\t" /* unselected dest bytes */
1123                  "por       %%mm6, %%mm4     \n\t" /* merge */
1124                  "movq      %%mm4, (%%edi)   \n\t"
1125
1126                  "movq      8(%%esi), %%mm5  \n\t" /* same for bytes 8-15 */
1127                  "pand      %%mm1, %%mm5     \n\t"
1128                  "movq      %%mm1, %%mm7     \n\t"
1129                  "movq      8(%%edi), %%mm6  \n\t"
1130                  "pandn     %%mm6, %%mm7     \n\t"
1131                  "por       %%mm7, %%mm5     \n\t"
1132                  "movq      %%mm5, 8(%%edi)  \n\t"
1133
1134                  "movq      16(%%esi), %%mm6 \n\t" /* same for bytes 16-23 */
1135                  "pand      %%mm2, %%mm6     \n\t"
1136                  "movq      %%mm2, %%mm4     \n\t"
1137                  "movq      16(%%edi), %%mm7 \n\t"
1138                  "pandn     %%mm7, %%mm4     \n\t"
1139                  "por       %%mm4, %%mm6     \n\t"
1140                  "movq      %%mm6, 16(%%edi) \n\t"
1141
1142                  "movq      24(%%esi), %%mm7 \n\t" /* same for bytes 24-31 */
1143                  "pand      %%mm3, %%mm7     \n\t"
1144                  "movq      %%mm3, %%mm5     \n\t"
1145                  "movq      24(%%edi), %%mm4 \n\t"
1146                  "pandn     %%mm4, %%mm5     \n\t"
1147                  "por       %%mm5, %%mm7     \n\t"
1148                  "movq      %%mm7, 24(%%edi) \n\t"
1149
1150                  "addl      $32, %%esi       \n\t" /* inc by 32 bytes processed */
1151                  "addl      $32, %%edi       \n\t"
1152                  "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
1153                  "ja        mainloop32       \n\t"
1154
1155                "mainloop32end:               \n\t"
1156/* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
1157                  "movl      %%eax, %%ecx     \n\t" /* ecx = leftover pixel count */
1158                  "cmpl      $0, %%ecx        \n\t"
1159                  "jz        end32            \n\t" /* nothing left over */
1160/* preload        "movl      mask, %%edx      \n\t" */
1161                  "sall      $24, %%edx       \n\t" /* low byte => high byte */
1162
1163                "secondloop32:                \n\t"
1164                  "sall      %%edx            \n\t" /* move high bit to CF */
1165                  "jnc       skip32           \n\t" /* if CF = 0 */
1166                  "movl      (%%esi), %%eax   \n\t" /* copy one 4-byte pixel */
1167                  "movl      %%eax, (%%edi)   \n\t"
1168
1169                "skip32:                      \n\t"
1170                  "addl      $4, %%esi        \n\t" /* next pixel */
1171                  "addl      $4, %%edi        \n\t"
1172                  "decl      %%ecx            \n\t"
1173                  "jnz       secondloop32     \n\t"
1174
1175                "end32:                       \n\t"
1176                  "EMMS                       \n\t" /* DONE */
1177
1178                  : "=a" (dummy_value_a),           /* output regs (dummy) */
1179                    "=d" (dummy_value_d),
1180                    "=c" (dummy_value_c),
1181                    "=S" (dummy_value_S),
1182                    "=D" (dummy_value_D)
1183
1184                  : "3" (srcptr),      /* esi       // input regs */
1185                    "4" (dstptr),      /* edi */
1186                    "0" (diff),        /* eax */
1187/* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1188                    "2" (len),         /* ecx */
1189                    "1" (mask)         /* edx */
1190
1191#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192                  : "%mm0", "%mm1", "%mm2", "%mm3"  /* clobber list */
1193                  , "%mm4", "%mm5", "%mm6", "%mm7"
1194#endif
1195               );
1196            }
1197            else /* mmx _not supported - Use modified C routine */
1198#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1199            {
                   /* All quantities below are byte counts; BPP4 (defined elsewhere,
                    * presumably 4 for this depth) scales pixels to bytes.
                    */
1200               register png_uint_32 i;
1201               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1202                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1204                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1206                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1208               int diff = (int) (png_ptr->width & 7); /* amount lost */
1209               register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1210
1211               srcptr = png_ptr->row_buf + 1 + initial_val;
1212               dstptr = row + initial_val;
1213
                   /* Whole 8-pixel groups: copy rep_bytes at every stride. */
1214               for (i = initial_val; i < final_val; i += stride)
1215               {
1216                  png_memcpy(dstptr, srcptr, rep_bytes);
1217                  srcptr += stride;
1218                  dstptr += stride;
1219               }
                   /* Trailing pixels: continue the same walk up to the true end
                    * of the row, shrinking the copy so it never runs past it.
                    */
1220               if (diff)  /* number of leftover pixels:  3 for pngtest */
1221               {
1222                  final_val+=diff*BPP4;
1223                  for (; i < final_val; i += stride)
1224                  {
1225                     if (rep_bytes > (int)(final_val-i))
1226                        rep_bytes = (int)(final_val-i);
1227                     png_memcpy(dstptr, srcptr, rep_bytes);
1228                     srcptr += stride;
1229                     dstptr += stride;
1230                  }
1231               }
1232            } /* end of else (_mmx_supported) */
1233
1234            break;
1235         }       /* end 32 bpp */
1236
1237         case 48:       /* png_ptr->row_info.pixel_depth */
1238         {
                /* 48 bits per pixel (6 bytes per pixel).  Copies pixels from the
                 * row stored at png_ptr->row_buf + 1 into `row`.  The MMX branch
                 * selects the pixels to copy from the bits of `mask`; the C branch
                 * does not read `mask` and instead walks the row using the
                 * png_pass_start, png_pass_inc and png_pass_width entries for
                 * png_ptr->pass.
                 */
1239            png_bytep srcptr;
1240            png_bytep dstptr;
1241
1242#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243#if !defined(PNG_1_0_X)
1244            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1245                /* && _mmx_supported */ )
1246#else
1247            if (_mmx_supported)
1248#endif
1249            {
1250               png_uint_32 len;
1251               int diff;
                   /* The dummy_value_* variables are never read.  They exist so
                    * that eax/edx/ecx/esi/edi can be declared as asm outputs and
                    * the real inputs tied to them with matching constraints.
                    */
1252               int dummy_value_a;   /* fix 'forbidden register spilled' error */
1253               int dummy_value_d;
1254               int dummy_value_c;
1255               int dummy_value_S;
1256               int dummy_value_D;
1257               _unmask = ~mask;            /* global variable for -fPIC version */
1258               srcptr = png_ptr->row_buf + 1;
1259               dstptr = row;
1260               len  = png_ptr->width &~7;  /* pixels handled 8 at a time below */
1261               diff = (int) (png_ptr->width & 7); /* 0..7 pixels left over */
1262
                   /* Registers on entry (see the constraints after the code):
                    *   eax = diff, ecx = len, edx = mask, esi = srcptr, edi = dstptr.
                    * mm0..mm5 are turned into per-byte select masks for the six
                    * 8-byte parts of every 48-byte (8-pixel) group; the main loop
                    * stores (source & select) | (destination & ~select), using
                    * mm6/mm7 as scratch.
                    * _mask48_0.._mask48_5 are defined elsewhere in the file;
                    * presumably one selector bit per pixel -- TODO confirm.
                    * NOTE(review): no "memory" clobber is listed although the asm
                    * stores through edi, and the labels are plain (non-local)
                    * names -- confirm both are acceptable for the target compilers.
                    */
1263               __asm__ __volatile__ (
1264                  "movd      _unmask, %%mm7   \n\t" /* load bit pattern */
1265                  "psubb     %%mm6, %%mm6     \n\t" /* zero mm6 */
1266                  "punpcklbw %%mm7, %%mm7     \n\t" /* duplicate the low bytes, */
1267                  "punpcklwd %%mm7, %%mm7     \n\t" /* then the low words, */
1268                  "punpckldq %%mm7, %%mm7     \n\t" /* fill reg with 8 masks */
1269
1270                  "movq      _mask48_0, %%mm0 \n\t" /* constants for bytes 0-7 */
1271                  "movq      _mask48_1, %%mm1 \n\t" /* constants for bytes 8-15 */
1272                  "movq      _mask48_2, %%mm2 \n\t" /* constants for bytes 16-23 */
1273                  "movq      _mask48_3, %%mm3 \n\t" /* constants for bytes 24-31 */
1274                  "movq      _mask48_4, %%mm4 \n\t" /* constants for bytes 32-39 */
1275                  "movq      _mask48_5, %%mm5 \n\t" /* constants for bytes 40-47 */
1276
1277                  "pand      %%mm7, %%mm0     \n\t" /* AND with replicated ~mask */
1278                  "pand      %%mm7, %%mm1     \n\t"
1279                  "pand      %%mm7, %%mm2     \n\t"
1280                  "pand      %%mm7, %%mm3     \n\t"
1281                  "pand      %%mm7, %%mm4     \n\t"
1282                  "pand      %%mm7, %%mm5     \n\t"
1283
1284                  "pcmpeqb   %%mm6, %%mm0     \n\t" /* 0xFF where the AND gave 0 */
1285                  "pcmpeqb   %%mm6, %%mm1     \n\t"
1286                  "pcmpeqb   %%mm6, %%mm2     \n\t"
1287                  "pcmpeqb   %%mm6, %%mm3     \n\t"
1288                  "pcmpeqb   %%mm6, %%mm4     \n\t"
1289                  "pcmpeqb   %%mm6, %%mm5     \n\t"
1290
1291/* preload        "movl      len, %%ecx       \n\t" // load length of line */
1292/* preload        "movl      srcptr, %%esi    \n\t" // load source */
1293/* preload        "movl      dstptr, %%edi    \n\t" // load dest */
1294
1295                  "cmpl      $0, %%ecx        \n\t" /* fewer than 8 pixels? */
1296                  "jz        mainloop48end    \n\t"
1297
1298                "mainloop48:                  \n\t"
1299                  "movq      (%%esi), %%mm7   \n\t" /* source bytes 0-7 */
1300                  "pand      %%mm0, %%mm7     \n\t" /* keep selected source bytes */
1301                  "movq      %%mm0, %%mm6     \n\t"
1302                  "pandn     (%%edi), %%mm6   \n\t" /* unselected dest bytes */
1303                  "por       %%mm6, %%mm7     \n\t" /* merge */
1304                  "movq      %%mm7, (%%edi)   \n\t"
1305
1306                  "movq      8(%%esi), %%mm6  \n\t" /* same for bytes 8-15 */
1307                  "pand      %%mm1, %%mm6     \n\t"
1308                  "movq      %%mm1, %%mm7     \n\t"
1309                  "pandn     8(%%edi), %%mm7  \n\t"
1310                  "por       %%mm7, %%mm6     \n\t"
1311                  "movq      %%mm6, 8(%%edi)  \n\t"
1312
1313                  "movq      16(%%esi), %%mm6 \n\t" /* same for bytes 16-23 */
1314                  "pand      %%mm2, %%mm6     \n\t"
1315                  "movq      %%mm2, %%mm7     \n\t"
1316                  "pandn     16(%%edi), %%mm7 \n\t"
1317                  "por       %%mm7, %%mm6     \n\t"
1318                  "movq      %%mm6, 16(%%edi) \n\t"
1319
1320                  "movq      24(%%esi), %%mm7 \n\t" /* same for bytes 24-31 */
1321                  "pand      %%mm3, %%mm7     \n\t"
1322                  "movq      %%mm3, %%mm6     \n\t"
1323                  "pandn     24(%%edi), %%mm6 \n\t"
1324                  "por       %%mm6, %%mm7     \n\t"
1325                  "movq      %%mm7, 24(%%edi) \n\t"
1326
1327                  "movq      32(%%esi), %%mm6 \n\t" /* same for bytes 32-39 */
1328                  "pand      %%mm4, %%mm6     \n\t"
1329                  "movq      %%mm4, %%mm7     \n\t"
1330                  "pandn     32(%%edi), %%mm7 \n\t"
1331                  "por       %%mm7, %%mm6     \n\t"
1332                  "movq      %%mm6, 32(%%edi) \n\t"
1333
1334                  "movq      40(%%esi), %%mm7 \n\t" /* same for bytes 40-47 */
1335                  "pand      %%mm5, %%mm7     \n\t"
1336                  "movq      %%mm5, %%mm6     \n\t"
1337                  "pandn     40(%%edi), %%mm6 \n\t"
1338                  "por       %%mm6, %%mm7     \n\t"
1339                  "movq      %%mm7, 40(%%edi) \n\t"
1340
1341                  "addl      $48, %%esi       \n\t" /* inc by 48 bytes processed */
1342                  "addl      $48, %%edi       \n\t"
1343                  "subl      $8, %%ecx        \n\t" /* dec by 8 pixels processed */
1344
1345                  "ja        mainloop48       \n\t"
1346
1347                "mainloop48end:               \n\t"
1348/* preload        "movl      diff, %%ecx      \n\t" // (diff is in eax) */
1349                  "movl      %%eax, %%ecx     \n\t" /* ecx = leftover pixel count */
1350                  "cmpl      $0, %%ecx        \n\t"
1351                  "jz        end48            \n\t" /* nothing left over */
1352/* preload        "movl      mask, %%edx      \n\t" */
1353                  "sall      $24, %%edx       \n\t" /* make low byte, high byte */
1354
                      /* Leftover pixels are 6 bytes each: copy a dword plus a
                       * word and step both pointers by 6, so that every pixel
                       * is copied whole and the next one starts on a pixel
                       * boundary.
                       */
1355                "secondloop48:                \n\t"
1356                  "sall      %%edx            \n\t" /* move high bit to CF */
1357                  "jnc       skip48           \n\t" /* if CF = 0 */
1358                  "movl      (%%esi), %%eax   \n\t" /* copy bytes 0-3 of the pixel */
1359                  "movl      %%eax, (%%edi)   \n\t"
                      "movw      4(%%esi), %%ax   \n\t" /* copy bytes 4-5 of the pixel */
                      "movw      %%ax, 4(%%edi)   \n\t"
1360
1361                "skip48:                      \n\t"
1362                  "addl      $6, %%esi        \n\t" /* next 6-byte pixel */
1363                  "addl      $6, %%edi        \n\t"
1364                  "decl      %%ecx            \n\t"
1365                  "jnz       secondloop48     \n\t"
1366
1367                "end48:                       \n\t"
1368                  "EMMS                       \n\t" /* DONE */
1369
1370                  : "=a" (dummy_value_a),           /* output regs (dummy) */
1371                    "=d" (dummy_value_d),
1372                    "=c" (dummy_value_c),
1373                    "=S" (dummy_value_S),
1374                    "=D" (dummy_value_D)
1375
1376                  : "3" (srcptr),      /* esi       // input regs */
1377                    "4" (dstptr),      /* edi */
1378                    "0" (diff),        /* eax */
1379/* was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx */
1380                    "2" (len),         /* ecx */
1381                    "1" (mask)         /* edx */
1382
1383#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384                  : "%mm0", "%mm1", "%mm2", "%mm3"  /* clobber list */
1385                  , "%mm4", "%mm5", "%mm6", "%mm7"
1386#endif
1387               );
1388            }
1389            else /* mmx _not supported - Use modified C routine */
1390#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1391            {
                   /* All quantities below are byte counts; BPP6 (defined elsewhere,
                    * presumably 6 for this depth) scales pixels to bytes.
                    */
1392               register png_uint_32 i;
1393               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1394                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1396                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1398                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399               png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1400               int diff = (int) (png_ptr->width & 7); /* amount lost */
1401               register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1402
1403               srcptr = png_ptr->row_buf + 1 + initial_val;
1404               dstptr = row + initial_val;
1405
                   /* Whole 8-pixel groups: copy rep_bytes at every stride. */
1406               for (i = initial_val; i < final_val; i += stride)
1407               {
1408                  png_memcpy(dstptr, srcptr, rep_bytes);
1409                  srcptr += stride;
1410                  dstptr += stride;
1411               }
                   /* Trailing pixels: continue the same walk up to the true end
                    * of the row, shrinking the copy so it never runs past it.
                    */
1412               if (diff)  /* number of leftover pixels:  3 for pngtest */
1413               {
1414                  final_val+=diff*BPP6;
1415                  for (; i < final_val; i += stride)
1416                  {
1417                     if (rep_bytes > (int)(final_val-i))
1418                        rep_bytes = (int)(final_val-i);
1419                     png_memcpy(dstptr, srcptr, rep_bytes);
1420                     srcptr += stride;
1421                     dstptr += stride;
1422                  }
1423               }
1424            } /* end of else (_mmx_supported) */
1425
1426            break;
1427         }       /* end 48 bpp */
1428
1429         case 64:       /* png_ptr->row_info.pixel_depth */
1430         {
1431            /* 64-bit pixels are handled in plain C only.  Starting `first`
1432             * bytes into the row, `chunk` bytes are copied from the row at
1433             * png_ptr->row_buf + 1 into `row` every `step` bytes.
1434             */
1435            const png_uint_32 first = BPP8 * png_pass_start[png_ptr->pass];
1436            const int step = BPP8 * png_pass_inc[png_ptr->pass];
1437            int chunk = BPP8 * png_pass_width[png_ptr->pass];
1438            const png_uint_32 whole = png_ptr->width &~7;  /* pixels in full groups of 8 */
1439            const int tail = (int) (png_ptr->width & 7);   /* pixels after the last group */
1440            png_uint_32 limit = BPP8 * whole;              /* byte offset where we stop */
1441            png_uint_32 pos = first;                       /* current byte offset */
1442            png_bytep from = png_ptr->row_buf + 1 + first;
1443            png_bytep to = row + first;
1444
1445            while (pos < limit)   /* the full 8-pixel groups */
1446            {
1447               png_memcpy(to, from, chunk);
1448               from += step;
1449               to += step;
1450               pos += step;
1451            }
1452
1453            if (tail != 0)   /* leftover pixels: extend the limit, clamp the copy */
1454            {
1455               for (limit += tail*BPP8; pos < limit; pos += step)
1456               {
1457                  const int room = (int)(limit-pos);
1458                  if (chunk > room)
1459                     chunk = room;
1460                  png_memcpy(to, from, chunk);
1461                  from += step;
1462                  to += step;
1463               }
1464            }
1465
1466            break;
1467         }       /* end 64 bpp */
1468
1469         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1470         {
1471            /* this should never happen */
1472            png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1473            break;
1474         }
1475      } /* end switch (png_ptr->row_info.pixel_depth) */
1476
1477   } /* end if (non-trivial mask) */
1478
1479} /* end png_combine_row() */
1480
1481#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1482
1483
1484
1485
1486/*===========================================================================*/
1487/*                                                                           */
1488/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1489/*                                                                           */
1490/*===========================================================================*/
1491
1492#if defined(PNG_READ_INTERLACING_SUPPORTED)
1493#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1494
1495/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496 * has taken place.  [GRR: what other steps come before and/or after?]
1497 */
1498
1499void /* PRIVATE */
1500png_do_read_interlace(png_structp png_ptr)
1501{
1502   png_row_infop row_info = &(png_ptr->row_info);
1503   png_bytep row = png_ptr->row_buf + 1;
1504   int pass = png_ptr->pass;
1505#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506   png_uint_32 transformations = png_ptr->transformations;
1507#endif
1508
1509   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1510
1511#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512   if (_mmx_supported == 2) {
1513#if !defined(PNG_1_0_X)
1514       /* this should have happened in png_init_mmx_flags() already */
1515       png_warning(png_ptr, "asm_flags may not have been initialized");
1516#endif
1517       png_mmx_support();
1518   }
1519#endif
1520
1521   if (row != NULL && row_info != NULL)
1522   {
1523      png_uint_32 final_width;
1524
1525      final_width = row_info->width * png_pass_inc[pass];
1526
1527      switch (row_info->pixel_depth)
1528      {
1529         case 1:
1530         {
1531            png_bytep sp, dp;
1532            int sshift, dshift;
1533            int s_start, s_end, s_inc;
1534            png_byte v;
1535            png_uint_32 i;
1536            int j;
1537
1538            sp = row + (png_size_t)((row_info->width - 1) >> 3);
1539            dp = row + (png_size_t)((final_width - 1) >> 3);
1540#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541            if (transformations & PNG_PACKSWAP)
1542            {
1543               sshift = (int)((row_info->width + 7) & 7);
1544               dshift = (int)((final_width + 7) & 7);
1545               s_start = 7;
1546               s_end = 0;
1547               s_inc = -1;
1548            }
1549            else
1550#endif
1551            {
1552               sshift = 7 - (int)((row_info->width + 7) & 7);
1553               dshift = 7 - (int)((final_width + 7) & 7);
1554               s_start = 0;
1555               s_end = 7;
1556               s_inc = 1;
1557            }
1558
1559            for (i = row_info->width; i; i--)
1560            {
1561               v = (png_byte)((*sp >> sshift) & 0x1);
1562               for (j = 0; j < png_pass_inc[pass]; j++)
1563               {
1564                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1565                  *dp |= (png_byte)(v << dshift);
1566                  if (dshift == s_end)
1567                  {
1568                     dshift = s_start;
1569                     dp--;
1570                  }
1571                  else
1572                     dshift += s_inc;
1573               }
1574               if (sshift == s_end)
1575               {
1576                  sshift = s_start;
1577                  sp--;
1578               }
1579               else
1580                  sshift += s_inc;
1581            }
1582            break;
1583         }
1584
1585         case 2:
1586         {
1587            png_bytep sp, dp;
1588            int sshift, dshift;
1589            int s_start, s_end, s_inc;
1590            png_uint_32 i;
1591
1592            sp = row + (png_size_t)((row_info->width - 1) >> 2);
1593            dp = row + (png_size_t)((final_width - 1) >> 2);
1594#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595            if (transformations & PNG_PACKSWAP)
1596            {
1597               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1598               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1599               s_start = 6;
1600               s_end = 0;
1601               s_inc = -2;
1602            }
1603            else
1604#endif
1605            {
1606               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1607               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1608               s_start = 0;
1609               s_end = 6;
1610               s_inc = 2;
1611            }
1612
1613            for (i = row_info->width; i; i--)
1614            {
1615               png_byte v;
1616               int j;
1617
1618               v = (png_byte)((*sp >> sshift) & 0x3);
1619               for (j = 0; j < png_pass_inc[pass]; j++)
1620               {
1621                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1622                  *dp |= (png_byte)(v << dshift);
1623                  if (dshift == s_end)
1624                  {
1625                     dshift = s_start;
1626                     dp--;
1627                  }
1628                  else
1629                     dshift += s_inc;
1630               }
1631               if (sshift == s_end)
1632               {
1633                  sshift = s_start;
1634                  sp--;
1635               }
1636               else
1637                  sshift += s_inc;
1638            }
1639            break;
1640         }
1641
1642         case 4:
1643         {
1644            png_bytep sp, dp;
1645            int sshift, dshift;
1646            int s_start, s_end, s_inc;
1647            png_uint_32 i;
1648
1649            sp = row + (png_size_t)((row_info->width - 1) >> 1);
1650            dp = row + (png_size_t)((final_width - 1) >> 1);
1651#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652            if (transformations & PNG_PACKSWAP)
1653            {
1654               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1655               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1656               s_start = 4;
1657               s_end = 0;
1658               s_inc = -4;
1659            }
1660            else
1661#endif
1662            {
1663               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1664               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1665               s_start = 0;
1666               s_end = 4;
1667               s_inc = 4;
1668            }
1669
1670            for (i = row_info->width; i; i--)
1671            {
1672               png_byte v;
1673               int j;
1674
1675               v = (png_byte)((*sp >> sshift) & 0xf);
1676               for (j = 0; j < png_pass_inc[pass]; j++)
1677               {
1678                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1679                  *dp |= (png_byte)(v << dshift);
1680                  if (dshift == s_end)
1681                  {
1682                     dshift = s_start;
1683                     dp--;
1684                  }
1685                  else
1686                     dshift += s_inc;
1687               }
1688               if (sshift == s_end)
1689               {
1690                  sshift = s_start;
1691                  sp--;
1692               }
1693               else
1694                  sshift += s_inc;
1695            }
1696            break;
1697         }
1698
1699       /*====================================================================*/
1700
1701         default: /* 8-bit or larger (this is where the routine is modified) */
1702         {
1703#if 0
1704/*          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good */
1705/*          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good */
1706/*          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good */
1707/*          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good */
1708#endif
1709            png_bytep sptr, dp;
1710            png_uint_32 i;
1711            png_size_t pixel_bytes;
1712            int width = (int)row_info->width;
1713
1714            pixel_bytes = (row_info->pixel_depth >> 3);
1715
1716            /* point sptr at the last pixel in the pre-expanded row: */
1717            sptr = row + (width - 1) * pixel_bytes;
1718
1719            /* point dp at the last pixel position in the expanded row: */
1720            dp = row + (final_width - 1) * pixel_bytes;
1721
1722            /* New code by Nirav Chhatrapati - Intel Corporation */
1723
1724#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725#if !defined(PNG_1_0_X)
1726            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1727                /* && _mmx_supported */ )
1728#else
1729            if (_mmx_supported)
1730#endif
1731            {
1732               //--------------------------------------------------------------
1733               if (pixel_bytes == 3)
1734               {
1735                  if (((pass == 0) || (pass == 1)) && width)
1736                  {
1737                     int dummy_value_c;   /* fix 'forbidden register spilled' */
1738                     int dummy_value_S;
1739                     int dummy_value_D;
1740
1741                     __asm__ __volatile__ (
1742                        "subl $21, %%edi         \n\t"
1743                                     /* (png_pass_inc[pass] - 1)*pixel_bytes */
1744
1745                     ".loop3_pass0:              \n\t"
1746                        "movd (%%esi), %%mm0     \n\t" /* x x x x x 2 1 0 */
1747                        "pand _const4, %%mm0     \n\t" /* z z z z z 2 1 0 */
1748                        "movq %%mm0, %%mm1       \n\t" /* z z z z z 2 1 0 */
1749                        "psllq $16, %%mm0        \n\t" /* z z z 2 1 0 z z */
1750                        "movq %%mm0, %%mm2       \n\t" /* z z z 2 1 0 z z */
1751                        "psllq $24, %%mm0        \n\t" /* 2 1 0 z z z z z */
1752                        "psrlq $8, %%mm1         \n\t" /* z z z z z z 2 1 */
1753                        "por %%mm2, %%mm0        \n\t" /* 2 1 0 2 1 0 z z */
1754                        "por %%mm1, %%mm0        \n\t" /* 2 1 0 2 1 0 2 1 */
1755                        "movq %%mm0, %%mm3       \n\t" /* 2 1 0 2 1 0 2 1 */
1756                        "psllq $16, %%mm0        \n\t" /* 0 2 1 0 2 1 z z */
1757                        "movq %%mm3, %%mm4       \n\t" /* 2 1 0 2 1 0 2 1 */
1758                        "punpckhdq %%mm0, %%mm3  \n\t" /* 0 2 1 0 2 1 0 2 */
1759                        "movq %%mm4, 16(%%edi)   \n\t"
1760                        "psrlq $32, %%mm0        \n\t" /* z z z z 0 2 1 0 */
1761                        "movq %%mm3, 8(%%edi)    \n\t"
1762                        "punpckldq %%mm4, %%mm0  \n\t" /* 1 0 2 1 0 2 1 0 */
1763                        "subl $3, %%esi          \n\t"
1764                        "movq %%mm0, (%%edi)     \n\t"
1765                        "subl $24, %%edi         \n\t"
1766                        "decl %%ecx              \n\t"
1767                        "jnz .loop3_pass0        \n\t"
1768                        "EMMS                    \n\t" /* DONE */
1769
1770                        : "=c" (dummy_value_c),        /* output regs (dummy) */
1771                          "=S" (dummy_value_S),
1772                          "=D" (dummy_value_D)
1773
1774                        : "1" (sptr),      // esi      // input regs
1775                          "2" (dp),        // edi
1776                          "0" (width),     // ecx
1777                          "rim" (_const4)  // %1(?)  (0x0000000000FFFFFFLL)
1778
1779#if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780                        : "%mm0", "%mm1", "%mm2"       /* clobber list */
1781                        , "%mm3", "%mm4"
1782#endif
1783                     );
1784                  }
1785                  else if (((pass == 2) || (pass == 3)) && width)
1786                  {
1787                     int dummy_value_c;   /* fix 'forbidden register spilled' */
1788                     int dummy_value_S;
1789                     int dummy_value_D;
1790
1791                     __asm__ __volatile__ (
1792                        "subl $9, %%edi          \n\t"
1793                                     /* (png_pass_inc[pass] - 1)*pixel_bytes */
1794
1795                     ".loop3_pass2:              \n\t"
1796                        "movd (%%esi), %%mm0     \n\t" /* x x x x x 2 1 0 */
1797                        "pand _const4, %%mm0     \n\t" /* z z z z z 2 1 0 */
1798                        "movq %%mm0, %%mm1       \n\t" /* z z z z z 2 1 0 */
1799                        "psllq $16, %%mm0        \n\t" /* z z z 2 1 0 z z */
1800                        "movq %%mm0, %%mm2       \n\t" /* z z z 2 1 0 z z */
1801                        "psllq $24, %%mm0        \n\t" /* 2 1 0 z z z z z */
1802                        "psrlq $8, %%mm1         \n\t" /* z z z z z z 2 1 */
1803                        "por %%mm2, %%mm0        \n\t" /* 2 1 0 2 1 0 z z */
1804                        "por %%mm1, %%mm0        \n\t" /* 2 1 0 2 1 0 2 1 */
1805                        "movq %%mm0, 4(%%edi)    \n\t"
1806                        "psrlq $16, %%mm0        \n\t" /* z z 2 1 0 2 1 0 */
1807                        "subl $3, %%esi          \n\t"
1808                        "movd %%mm0, (%%edi)     \n\t"
1809                        "subl $12, %%edi         \n\t"
1810                        "decl %%ecx              \n\t"
1811                        "jnz .loop3_pass2        \n\t"
1812                        "EMMS                    \n\t" /* DONE */
1813
1814                        : "=c" (dummy_value_c),        /* output regs (dummy) */
1815                          "=S" (dummy_value_S),
1816                          "=D" (dummy_value_D)
1817
1818                        : "1" (sptr),      // esi      // input regs
1819                          "2" (dp),        // edi
1820                          "0" (width),     // ecx
1821                          "rim" (_const4)  // (0x0000000000FFFFFFLL)
1822
1823#if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824                        : "%mm0", "%mm1", "%mm2"       /* clobber list */
1825#endif
1826                     );
1827                  }
1828                  else if (width) /* && ((pass == 4) || (pass == 5)) */
1829                  {
1830                     int width_mmx = ((width >> 1) << 1) - 8;   /* GRR:  huh? */
1831                     if (width_mmx < 0)
1832                         width_mmx = 0;
1833                     width -= width_mmx;        /* 8 or 9 pix, 24 or 27 bytes */
1834                     if (width_mmx)
1835                     {
1836                        /* png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1837                        /* sptr points at last pixel in pre-expanded row */
1838                        /* dp points at last pixel position in expanded row */
1839                        int dummy_value_c;  /* fix 'forbidden register spilled' */
1840                        int dummy_value_S;
1841                        int dummy_value_D;
1842
1843                        __asm__ __volatile__ (
1844                           "subl $3, %%esi          \n\t"
1845                           "subl $9, %%edi          \n\t"
1846                                        /* (png_pass_inc[pass] + 1)*pixel_bytes */
1847
1848                        ".loop3_pass4:              \n\t"
1849                           "movq (%%esi), %%mm0     \n\t" /* x x 5 4 3 2 1 0 */
1850                           "movq %%mm0, %%mm1       \n\t" /* x x 5 4 3 2 1 0 */
1851                           "movq %%mm0, %%mm2       \n\t" /* x x 5 4 3 2 1 0 */
1852                           "psllq $24, %%mm0        \n\t" /* 4 3 2 1 0 z z z */
1853                           "pand _const4, %%mm1     \n\t" /* z z z z z 2 1 0 */
1854                           "psrlq $24, %%mm2        \n\t" /* z z z x x 5 4 3 */
1855                           "por %%mm1, %%mm0        \n\t" /* 4 3 2 1 0 2 1 0 */
1856                           "movq %%mm2, %%mm3       \n\t" /* z z z x x 5 4 3 */
1857                           "psllq $8, %%mm2         \n\t" /* z z x x 5 4 3 z */
1858                           "movq %%mm0, (%%edi)     \n\t"
1859                           "psrlq $16, %%mm3        \n\t" /* z z z z z x x 5 */
1860                           "pand _const6, %%mm3     \n\t" /* z z z z z z z 5 */
1861                           "por %%mm3, %%mm2        \n\t" /* z z x x 5 4 3 5 */
1862                           "subl $6, %%esi          \n\t"
1863                           "movd %%mm2, 8(%%edi)    \n\t"
1864                           "subl $12, %%edi         \n\t"
1865                           "subl $2, %%ecx          \n\t"
1866                           "jnz .loop3_pass4        \n\t"
1867                           "EMMS                    \n\t" /* DONE */
1868
1869                           : "=c" (dummy_value_c),        /* output regs (dummy) */
1870                             "=S" (dummy_value_S),
1871                             "=D" (dummy_value_D)
1872
1873                           : "1" (sptr),      // esi      // input regs
1874                             "2" (dp),        // edi
1875                             "0" (width_mmx), // ecx
1876                             "rim" (_const4), // 0x0000000000FFFFFFLL
1877                             "rim" (_const6)  // 0x00000000000000FFLL
1878
1879#if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880                           : "%mm0", "%mm1"               /* clobber list */
1881                           , "%mm2", "%mm3"
1882#endif
1883                        );
1884                     }
1885
1886                     sptr -= width_mmx*3;
1887                     dp -= width_mmx*6;
1888                     for (i = width; i; i--)
1889                     {
1890                        png_byte v[8];
1891                        int j;
1892
1893                        png_memcpy(v, sptr, 3);
1894                        for (j = 0; j < png_pass_inc[pass]; j++)
1895                        {
1896                           png_memcpy(dp, v, 3);
1897                           dp -= 3;
1898                        }
1899                        sptr -= 3;
1900                     }
1901                  }
1902               } /* end of pixel_bytes == 3 */
1903
1904               //--------------------------------------------------------------
1905               else if (pixel_bytes == 1)
1906               {
1907                  if (((pass == 0) || (pass == 1)) && width)
1908                  {
1909                     int width_mmx = ((width >> 2) << 2);
1910                     width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1911                     if (width_mmx)
1912                     {
1913                        int dummy_value_c;  /* fix 'forbidden register spilled' */
1914                        int dummy_value_S;
1915                        int dummy_value_D;
1916
1917                        __asm__ __volatile__ (
1918                           "subl $3, %%esi          \n\t"
1919                           "subl $31, %%edi         \n\t"
1920
1921                        ".loop1_pass0:              \n\t"
1922                           "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
1923                           "movq %%mm0, %%mm1       \n\t" /* x x x x 3 2 1 0 */
1924                           "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
1925                           "movq %%mm0, %%mm2       \n\t" /* 3 3 2 2 1 1 0 0 */
1926                           "punpcklwd %%mm0, %%mm0  \n\t" /* 1 1 1 1 0 0 0 0 */
1927                           "movq %%mm0, %%mm3       \n\t" /* 1 1 1 1 0 0 0 0 */
1928                           "punpckldq %%mm0, %%mm0  \n\t" /* 0 0 0 0 0 0 0 0 */
1929                           "punpckhdq %%mm3, %%mm3  \n\t" /* 1 1 1 1 1 1 1 1 */
1930                           "movq %%mm0, (%%edi)     \n\t"
1931                           "punpckhwd %%mm2, %%mm2  \n\t" /* 3 3 3 3 2 2 2 2 */
1932                           "movq %%mm3, 8(%%edi)    \n\t"
1933                           "movq %%mm2, %%mm4       \n\t" /* 3 3 3 3 2 2 2 2 */
1934                           "punpckldq %%mm2, %%mm2  \n\t" /* 2 2 2 2 2 2 2 2 */
1935                           "punpckhdq %%mm4, %%mm4  \n\t" /* 3 3 3 3 3 3 3 3 */
1936                           "movq %%mm2, 16(%%edi)   \n\t"
1937                           "subl $4, %%esi          \n\t"
1938                           "movq %%mm4, 24(%%edi)   \n\t"
1939                           "subl $32, %%edi         \n\t"
1940                           "subl $4, %%ecx          \n\t"
1941                           "jnz .loop1_pass0        \n\t"
1942                           "EMMS                    \n\t" /* DONE */
1943
1944                           : "=c" (dummy_value_c),        /* output regs (dummy) */
1945                             "=S" (dummy_value_S),
1946                             "=D" (dummy_value_D)
1947
1948                           : "1" (sptr),      /* esi      // input regs */
1949                             "2" (dp),        /* edi */
1950                             "0" (width_mmx)  /* ecx */
1951
1952#if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953                           : "%mm0", "%mm1", "%mm2"       /* clobber list */
1954                           , "%mm3", "%mm4"
1955#endif
1956                        );
1957                     }
1958
1959                     sptr -= width_mmx;
1960                     dp -= width_mmx*8;
1961                     for (i = width; i; i--)
1962                     {
1963                        int j;
1964
1965                       /* I simplified this part in version 1.0.4e
1966                        * here and in several other instances where
1967                        * pixel_bytes == 1  -- GR-P
1968                        *
1969                        * Original code:
1970                        *
1971                        * png_byte v[8];
1972                        * png_memcpy(v, sptr, pixel_bytes);
1973                        * for (j = 0; j < png_pass_inc[pass]; j++)
1974                        * {
1975                        *    png_memcpy(dp, v, pixel_bytes);
1976                        *    dp -= pixel_bytes;
1977                        * }
1978                        * sptr -= pixel_bytes;
1979                        *
1980                        * Replacement code is in the next three lines:
1981                        */
1982
1983                        for (j = 0; j < png_pass_inc[pass]; j++)
1984                        {
1985                           *dp-- = *sptr;
1986                        }
1987                        --sptr;
1988                     }
1989                  }
1990                  else if (((pass == 2) || (pass == 3)) && width)
1991                  {
1992                     int width_mmx = ((width >> 2) << 2);
1993                     width -= width_mmx;        /* 0-3 pixels => 0-3 bytes */
1994                     if (width_mmx)
1995                     {
1996                        int dummy_value_c;  /* fix 'forbidden register spilled' */
1997                        int dummy_value_S;
1998                        int dummy_value_D;
1999
2000                        __asm__ __volatile__ (
2001                           "subl $3, %%esi          \n\t"
2002                           "subl $15, %%edi         \n\t"
2003
2004                        ".loop1_pass2:              \n\t"
2005                           "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2006                           "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
2007                           "movq %%mm0, %%mm1       \n\t" /* 3 3 2 2 1 1 0 0 */
2008                           "punpcklwd %%mm0, %%mm0  \n\t" /* 1 1 1 1 0 0 0 0 */
2009                           "punpckhwd %%mm1, %%mm1  \n\t" /* 3 3 3 3 2 2 2 2 */
2010                           "movq %%mm0, (%%edi)     \n\t"
2011                           "subl $4, %%esi          \n\t"
2012                           "movq %%mm1, 8(%%edi)    \n\t"
2013                           "subl $16, %%edi         \n\t"
2014                           "subl $4, %%ecx          \n\t"
2015                           "jnz .loop1_pass2        \n\t"
2016                           "EMMS                    \n\t" /* DONE */
2017
2018                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2019                             "=S" (dummy_value_S),
2020                             "=D" (dummy_value_D)
2021
2022                           : "1" (sptr),      /* esi      // input regs */
2023                             "2" (dp),        /* edi */
2024                             "0" (width_mmx)  /* ecx */
2025
2026#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027                           : "%mm0", "%mm1"               /* clobber list */
2028#endif
2029                        );
2030                     }
2031
2032                     sptr -= width_mmx;
2033                     dp -= width_mmx*4;
2034                     for (i = width; i; i--)
2035                     {
2036                        int j;
2037
2038                        for (j = 0; j < png_pass_inc[pass]; j++)
2039                        {
2040                           *dp-- = *sptr;
2041                        }
2042                        --sptr;
2043                     }
2044                  }
2045                  else if (width)  /* && ((pass == 4) || (pass == 5)) */
2046                  {
2047                     int width_mmx = ((width >> 3) << 3);
2048                     width -= width_mmx;        /* 0-7 pixels => 0-7 bytes */
2049                     if (width_mmx)
2050                     {
2051                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2052                        int dummy_value_S;
2053                        int dummy_value_D;
2054
2055                        __asm__ __volatile__ (
2056                           "subl $7, %%esi          \n\t"
2057                           "subl $15, %%edi         \n\t"
2058
2059                        ".loop1_pass4:              \n\t"
2060                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2061                           "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2062                           "punpcklbw %%mm0, %%mm0  \n\t" /* 3 3 2 2 1 1 0 0 */
2063                           "punpckhbw %%mm1, %%mm1  \n\t" /* 7 7 6 6 5 5 4 4 */
2064                           "movq %%mm1, 8(%%edi)    \n\t"
2065                           "subl $8, %%esi          \n\t"
2066                           "movq %%mm0, (%%edi)     \n\t"
2067                           "subl $16, %%edi         \n\t"
2068                           "subl $8, %%ecx          \n\t"
2069                           "jnz .loop1_pass4        \n\t"
2070                           "EMMS                    \n\t" /* DONE */
2071
2072                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2073                             "=S" (dummy_value_S),
2074                             "=D" (dummy_value_D)
2075
2076                           : "1" (sptr),      /* esi      // input regs */
2077                             "2" (dp),        /* edi */
2078                             "0" (width_mmx)  /* ecx */
2079
2080#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081                           : "%mm0", "%mm1"               /* clobber list */
2082#endif
2083                        );
2084                     }
2085
2086                     sptr -= width_mmx;
2087                     dp -= width_mmx*2;
2088                     for (i = width; i; i--)
2089                     {
2090                        int j;
2091
2092                        for (j = 0; j < png_pass_inc[pass]; j++)
2093                        {
2094                           *dp-- = *sptr;
2095                        }
2096                        --sptr;
2097                     }
2098                  }
2099               } /* end of pixel_bytes == 1 */
2100
2101               //--------------------------------------------------------------
2102               else if (pixel_bytes == 2)
2103               {
2104                  if (((pass == 0) || (pass == 1)) && width)
2105                  {
2106                     int width_mmx = ((width >> 1) << 1);
2107                     width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2108                     if (width_mmx)
2109                     {
2110                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2111                        int dummy_value_S;
2112                        int dummy_value_D;
2113
2114                        __asm__ __volatile__ (
2115                           "subl $2, %%esi          \n\t"
2116                           "subl $30, %%edi         \n\t"
2117
2118                        ".loop2_pass0:              \n\t"
2119                           "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2120                           "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2121                           "movq %%mm0, %%mm1       \n\t" /* 3 2 3 2 1 0 1 0 */
2122                           "punpckldq %%mm0, %%mm0  \n\t" /* 1 0 1 0 1 0 1 0 */
2123                           "punpckhdq %%mm1, %%mm1  \n\t" /* 3 2 3 2 3 2 3 2 */
2124                           "movq %%mm0, (%%edi)     \n\t"
2125                           "movq %%mm0, 8(%%edi)    \n\t"
2126                           "movq %%mm1, 16(%%edi)   \n\t"
2127                           "subl $4, %%esi          \n\t"
2128                           "movq %%mm1, 24(%%edi)   \n\t"
2129                           "subl $32, %%edi         \n\t"
2130                           "subl $2, %%ecx          \n\t"
2131                           "jnz .loop2_pass0        \n\t"
2132                           "EMMS                    \n\t" /* DONE */
2133
2134                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2135                             "=S" (dummy_value_S),
2136                             "=D" (dummy_value_D)
2137
2138                           : "1" (sptr),      /* esi      // input regs */
2139                             "2" (dp),        /* edi */
2140                             "0" (width_mmx)  /* ecx */
2141
2142#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143                           : "%mm0", "%mm1"               /* clobber list */
2144#endif
2145                        );
2146                     }
2147
2148                     sptr -= (width_mmx*2 - 2); /* sign fixed */
2149                     dp -= (width_mmx*16 - 2);  /* sign fixed */
2150                     for (i = width; i; i--)
2151                     {
2152                        png_byte v[8];
2153                        int j;
2154                        sptr -= 2;
2155                        png_memcpy(v, sptr, 2);
2156                        for (j = 0; j < png_pass_inc[pass]; j++)
2157                        {
2158                           dp -= 2;
2159                           png_memcpy(dp, v, 2);
2160                        }
2161                     }
2162                  }
2163                  else if (((pass == 2) || (pass == 3)) && width)
2164                  {
2165                     int width_mmx = ((width >> 1) << 1) ;
2166                     width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2167                     if (width_mmx)
2168                     {
2169                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2170                        int dummy_value_S;
2171                        int dummy_value_D;
2172
2173                        __asm__ __volatile__ (
2174                           "subl $2, %%esi          \n\t"
2175                           "subl $14, %%edi         \n\t"
2176
2177                        ".loop2_pass2:              \n\t"
2178                           "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2179                           "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2180                           "movq %%mm0, %%mm1       \n\t" /* 3 2 3 2 1 0 1 0 */
2181                           "punpckldq %%mm0, %%mm0  \n\t" /* 1 0 1 0 1 0 1 0 */
2182                           "punpckhdq %%mm1, %%mm1  \n\t" /* 3 2 3 2 3 2 3 2 */
2183                           "movq %%mm0, (%%edi)     \n\t"
2184                           "subl $4, %%esi          \n\t"
2185                           "movq %%mm1, 8(%%edi)    \n\t"
2186                           "subl $16, %%edi         \n\t"
2187                           "subl $2, %%ecx          \n\t"
2188                           "jnz .loop2_pass2        \n\t"
2189                           "EMMS                    \n\t" /* DONE */
2190
2191                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2192                             "=S" (dummy_value_S),
2193                             "=D" (dummy_value_D)
2194
2195                           : "1" (sptr),      /* esi      // input regs */
2196                             "2" (dp),        /* edi */
2197                             "0" (width_mmx)  /* ecx */
2198
2199#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200                           : "%mm0", "%mm1"               /* clobber list */
2201#endif
2202                        );
2203                     }
2204
2205                     sptr -= (width_mmx*2 - 2); /* sign fixed */
2206                     dp -= (width_mmx*8 - 2);   /* sign fixed */
2207                     for (i = width; i; i--)
2208                     {
2209                        png_byte v[8];
2210                        int j;
2211                        sptr -= 2;
2212                        png_memcpy(v, sptr, 2);
2213                        for (j = 0; j < png_pass_inc[pass]; j++)
2214                        {
2215                           dp -= 2;
2216                           png_memcpy(dp, v, 2);
2217                        }
2218                     }
2219                  }
2220                  else if (width)  /* pass == 4 or 5 */
2221                  {
2222                     int width_mmx = ((width >> 1) << 1) ;
2223                     width -= width_mmx;        /* 0,1 pixels => 0,2 bytes */
2224                     if (width_mmx)
2225                     {
2226                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2227                        int dummy_value_S;
2228                        int dummy_value_D;
2229
2230                        __asm__ __volatile__ (
2231                           "subl $2, %%esi          \n\t"
2232                           "subl $6, %%edi          \n\t"
2233
2234                        ".loop2_pass4:              \n\t"
2235                           "movd (%%esi), %%mm0     \n\t" /* x x x x 3 2 1 0 */
2236                           "punpcklwd %%mm0, %%mm0  \n\t" /* 3 2 3 2 1 0 1 0 */
2237                           "subl $4, %%esi          \n\t"
2238                           "movq %%mm0, (%%edi)     \n\t"
2239                           "subl $8, %%edi          \n\t"
2240                           "subl $2, %%ecx          \n\t"
2241                           "jnz .loop2_pass4        \n\t"
2242                           "EMMS                    \n\t" /* DONE */
2243
2244                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2245                             "=S" (dummy_value_S),
2246                             "=D" (dummy_value_D)
2247
2248                           : "1" (sptr),      /* esi      // input regs */
2249                             "2" (dp),        /* edi */
2250                             "0" (width_mmx)  /* ecx */
2251
2252#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253                           : "%mm0"                       /* clobber list */
2254#endif
2255                        );
2256                     }
2257
2258                     sptr -= (width_mmx*2 - 2); /* sign fixed */
2259                     dp -= (width_mmx*4 - 2);   /* sign fixed */
2260                     for (i = width; i; i--)
2261                     {
2262                        png_byte v[8];
2263                        int j;
2264                        sptr -= 2;
2265                        png_memcpy(v, sptr, 2);
2266                        for (j = 0; j < png_pass_inc[pass]; j++)
2267                        {
2268                           dp -= 2;
2269                           png_memcpy(dp, v, 2);
2270                        }
2271                     }
2272                  }
2273               } /* end of pixel_bytes == 2 */
2274
2275               //--------------------------------------------------------------
2276               else if (pixel_bytes == 4)
2277               {
2278                  if (((pass == 0) || (pass == 1)) && width)
2279                  {
2280                     int width_mmx = ((width >> 1) << 1);
2281                     width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2282                     if (width_mmx)
2283                     {
2284                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2285                        int dummy_value_S;
2286                        int dummy_value_D;
2287
2288                        __asm__ __volatile__ (
2289                           "subl $4, %%esi          \n\t"
2290                           "subl $60, %%edi         \n\t"
2291
2292                        ".loop4_pass0:              \n\t"
2293                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2294                           "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2295                           "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2296                           "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2297                           "movq %%mm0, (%%edi)     \n\t"
2298                           "movq %%mm0, 8(%%edi)    \n\t"
2299                           "movq %%mm0, 16(%%edi)   \n\t"
2300                           "movq %%mm0, 24(%%edi)   \n\t"
2301                           "movq %%mm1, 32(%%edi)   \n\t"
2302                           "movq %%mm1, 40(%%edi)   \n\t"
2303                           "movq %%mm1, 48(%%edi)   \n\t"
2304                           "subl $8, %%esi          \n\t"
2305                           "movq %%mm1, 56(%%edi)   \n\t"
2306                           "subl $64, %%edi         \n\t"
2307                           "subl $2, %%ecx          \n\t"
2308                           "jnz .loop4_pass0        \n\t"
2309                           "EMMS                    \n\t" /* DONE */
2310
2311                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2312                             "=S" (dummy_value_S),
2313                             "=D" (dummy_value_D)
2314
2315                           : "1" (sptr),      /* esi      // input regs */
2316                             "2" (dp),        /* edi */
2317                             "0" (width_mmx)  /* ecx */
2318
2319#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320                           : "%mm0", "%mm1"               /* clobber list */
2321#endif
2322                        );
2323                     }
2324
2325                     sptr -= (width_mmx*4 - 4); /* sign fixed */
2326                     dp -= (width_mmx*32 - 4);  /* sign fixed */
2327                     for (i = width; i; i--)
2328                     {
2329                        png_byte v[8];
2330                        int j;
2331                        sptr -= 4;
2332                        png_memcpy(v, sptr, 4);
2333                        for (j = 0; j < png_pass_inc[pass]; j++)
2334                        {
2335                           dp -= 4;
2336                           png_memcpy(dp, v, 4);
2337                        }
2338                     }
2339                  }
2340                  else if (((pass == 2) || (pass == 3)) && width)
2341                  {
2342                     int width_mmx = ((width >> 1) << 1);
2343                     width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2344                     if (width_mmx)
2345                     {
2346                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2347                        int dummy_value_S;
2348                        int dummy_value_D;
2349
2350                        __asm__ __volatile__ (
2351                           "subl $4, %%esi          \n\t"
2352                           "subl $28, %%edi         \n\t"
2353
2354                        ".loop4_pass2:              \n\t"
2355                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2356                           "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2357                           "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2358                           "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2359                           "movq %%mm0, (%%edi)     \n\t"
2360                           "movq %%mm0, 8(%%edi)    \n\t"
2361                           "movq %%mm1, 16(%%edi)   \n\t"
2362                           "movq %%mm1, 24(%%edi)   \n\t"
2363                           "subl $8, %%esi          \n\t"
2364                           "subl $32, %%edi         \n\t"
2365                           "subl $2, %%ecx          \n\t"
2366                           "jnz .loop4_pass2        \n\t"
2367                           "EMMS                    \n\t" /* DONE */
2368
2369                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2370                             "=S" (dummy_value_S),
2371                             "=D" (dummy_value_D)
2372
2373                           : "1" (sptr),      /* esi      // input regs */
2374                             "2" (dp),        /* edi */
2375                             "0" (width_mmx)  /* ecx */
2376
2377#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378                           : "%mm0", "%mm1"               /* clobber list */
2379#endif
2380                        );
2381                     }
2382
2383                     sptr -= (width_mmx*4 - 4); /* sign fixed */
2384                     dp -= (width_mmx*16 - 4);  /* sign fixed */
2385                     for (i = width; i; i--)
2386                     {
2387                        png_byte v[8];
2388                        int j;
2389                        sptr -= 4;
2390                        png_memcpy(v, sptr, 4);
2391                        for (j = 0; j < png_pass_inc[pass]; j++)
2392                        {
2393                           dp -= 4;
2394                           png_memcpy(dp, v, 4);
2395                        }
2396                     }
2397                  }
2398                  else if (width)  /* pass == 4 or 5 */
2399                  {
2400                     int width_mmx = ((width >> 1) << 1) ;
2401                     width -= width_mmx;        /* 0,1 pixels => 0,4 bytes */
2402                     if (width_mmx)
2403                     {
2404                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2405                        int dummy_value_S;
2406                        int dummy_value_D;
2407
2408                        __asm__ __volatile__ (
2409                           "subl $4, %%esi          \n\t"
2410                           "subl $12, %%edi         \n\t"
2411
2412                        ".loop4_pass4:              \n\t"
2413                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2414                           "movq %%mm0, %%mm1       \n\t" /* 7 6 5 4 3 2 1 0 */
2415                           "punpckldq %%mm0, %%mm0  \n\t" /* 3 2 1 0 3 2 1 0 */
2416                           "punpckhdq %%mm1, %%mm1  \n\t" /* 7 6 5 4 7 6 5 4 */
2417                           "movq %%mm0, (%%edi)     \n\t"
2418                           "subl $8, %%esi          \n\t"
2419                           "movq %%mm1, 8(%%edi)    \n\t"
2420                           "subl $16, %%edi         \n\t"
2421                           "subl $2, %%ecx          \n\t"
2422                           "jnz .loop4_pass4        \n\t"
2423                           "EMMS                    \n\t" /* DONE */
2424
2425                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2426                             "=S" (dummy_value_S),
2427                             "=D" (dummy_value_D)
2428
2429                           : "1" (sptr),      /* esi      // input regs */
2430                             "2" (dp),        /* edi */
2431                             "0" (width_mmx)  /* ecx */
2432
2433#if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434                           : "%mm0", "%mm1"               /* clobber list */
2435#endif
2436                        );
2437                     }
2438
2439                     sptr -= (width_mmx*4 - 4); /* sign fixed */
2440                     dp -= (width_mmx*8 - 4);   /* sign fixed */
2441                     for (i = width; i; i--)
2442                     {
2443                        png_byte v[8];
2444                        int j;
2445                        sptr -= 4;
2446                        png_memcpy(v, sptr, 4);
2447                        for (j = 0; j < png_pass_inc[pass]; j++)
2448                        {
2449                           dp -= 4;
2450                           png_memcpy(dp, v, 4);
2451                        }
2452                     }
2453                  }
2454               } /* end of pixel_bytes == 4 */
2455
2456               //--------------------------------------------------------------
2457               else if (pixel_bytes == 8)
2458               {
2459/* GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?) */
2460                  /* GRR NOTE:  no need to combine passes here! */
2461                  if (((pass == 0) || (pass == 1)) && width)
2462                  {
2463                     int dummy_value_c;  /* fix 'forbidden register spilled' */
2464                     int dummy_value_S;
2465                     int dummy_value_D;
2466
2467                     /* source is 8-byte RRGGBBAA */
2468                     /* dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ... */
2469                     __asm__ __volatile__ (
2470                        "subl $56, %%edi         \n\t" /* start of last block */
2471
2472                     ".loop8_pass0:              \n\t"
2473                        "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2474                        "movq %%mm0, (%%edi)     \n\t"
2475                        "movq %%mm0, 8(%%edi)    \n\t"
2476                        "movq %%mm0, 16(%%edi)   \n\t"
2477                        "movq %%mm0, 24(%%edi)   \n\t"
2478                        "movq %%mm0, 32(%%edi)   \n\t"
2479                        "movq %%mm0, 40(%%edi)   \n\t"
2480                        "movq %%mm0, 48(%%edi)   \n\t"
2481                        "subl $8, %%esi          \n\t"
2482                        "movq %%mm0, 56(%%edi)   \n\t"
2483                        "subl $64, %%edi         \n\t"
2484                        "decl %%ecx              \n\t"
2485                        "jnz .loop8_pass0        \n\t"
2486                        "EMMS                    \n\t" /* DONE */
2487
2488                        : "=c" (dummy_value_c),        /* output regs (dummy) */
2489                          "=S" (dummy_value_S),
2490                          "=D" (dummy_value_D)
2491
2492                        : "1" (sptr),      /* esi      // input regs */
2493                          "2" (dp),        /* edi */
2494                          "0" (width)      /* ecx */
2495
2496#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497                        : "%mm0"                       /* clobber list */
2498#endif
2499                     );
2500                  }
2501                  else if (((pass == 2) || (pass == 3)) && width)
2502                  {
2503                     /* source is 8-byte RRGGBBAA */
2504                     /* dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA */
2505                     /* (recall that expansion is _in place_:  sptr and dp */
2506                     /*  both point at locations within same row buffer) */
2507                     {
2508                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2509                        int dummy_value_S;
2510                        int dummy_value_D;
2511
2512                        __asm__ __volatile__ (
2513                           "subl $24, %%edi         \n\t" /* start of last block */
2514
2515                        ".loop8_pass2:              \n\t"
2516                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2517                           "movq %%mm0, (%%edi)     \n\t"
2518                           "movq %%mm0, 8(%%edi)    \n\t"
2519                           "movq %%mm0, 16(%%edi)   \n\t"
2520                           "subl $8, %%esi          \n\t"
2521                           "movq %%mm0, 24(%%edi)   \n\t"
2522                           "subl $32, %%edi         \n\t"
2523                           "decl %%ecx              \n\t"
2524                           "jnz .loop8_pass2        \n\t"
2525                           "EMMS                    \n\t" /* DONE */
2526
2527                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2528                             "=S" (dummy_value_S),
2529                             "=D" (dummy_value_D)
2530
2531                           : "1" (sptr),      /* esi      // input regs */
2532                             "2" (dp),        /* edi */
2533                             "0" (width)      /* ecx */
2534
2535#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536                           : "%mm0"                       /* clobber list */
2537#endif
2538                        );
2539                     }
2540                  }
2541                  else if (width)  /* pass == 4 or 5 */
2542                  {
2543                     /* source is 8-byte RRGGBBAA */
2544                     /* dest is 16-byte RRGGBBAA RRGGBBAA */
2545                     {
2546                        int dummy_value_c;  /* fix 'forbidden register spilled' */
2547                        int dummy_value_S;
2548                        int dummy_value_D;
2549
2550                        __asm__ __volatile__ (
2551                           "subl $8, %%edi          \n\t" /* start of last block */
2552
2553                        ".loop8_pass4:              \n\t"
2554                           "movq (%%esi), %%mm0     \n\t" /* 7 6 5 4 3 2 1 0 */
2555                           "movq %%mm0, (%%edi)     \n\t"
2556                           "subl $8, %%esi          \n\t"
2557                           "movq %%mm0, 8(%%edi)    \n\t"
2558                           "subl $16, %%edi         \n\t"
2559                           "decl %%ecx              \n\t"
2560                           "jnz .loop8_pass4        \n\t"
2561                           "EMMS                    \n\t" /* DONE */
2562
2563                           : "=c" (dummy_value_c),        /* output regs (dummy) */
2564                             "=S" (dummy_value_S),
2565                             "=D" (dummy_value_D)
2566
2567                           : "1" (sptr),      /* esi      // input regs */
2568                             "2" (dp),        /* edi */
2569                             "0" (width)      /* ecx */
2570
2571#if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572                           : "%mm0"                       /* clobber list */
2573#endif
2574                        );
2575                     }
2576                  }
2577
2578               } /* end of pixel_bytes == 8 */
2579
2580               //--------------------------------------------------------------
2581               else if (pixel_bytes == 6)
2582               {
2583                  for (i = width; i; i--)
2584                  {
2585                     png_byte v[8];
2586                     int j;
2587                     png_memcpy(v, sptr, 6);
2588                     for (j = 0; j < png_pass_inc[pass]; j++)
2589                     {
2590                        png_memcpy(dp, v, 6);
2591                        dp -= 6;
2592                     }
2593                     sptr -= 6;
2594                  }
2595               } /* end of pixel_bytes == 6 */
2596
2597               //--------------------------------------------------------------
2598               else
2599               {
2600                  for (i = width; i; i--)
2601                  {
2602                     png_byte v[8];
2603                     int j;
2604                     png_memcpy(v, sptr, pixel_bytes);
2605                     for (j = 0; j < png_pass_inc[pass]; j++)
2606                     {
2607                        png_memcpy(dp, v, pixel_bytes);
2608                        dp -= pixel_bytes;
2609                     }
2610                     sptr-= pixel_bytes;
2611                  }
2612               }
2613            } /* end of _mmx_supported ======================================== */
2614
2615            else /* MMX not supported:  use modified C code - takes advantage
2616                  *   of inlining of png_memcpy for a constant */
2617                 /* GRR 19991007:  does it?  or should pixel_bytes in each
2618                  *   block be replaced with immediate value (e.g., 1)? */
2619                 /* GRR 19991017:  replaced with constants in each case */
2620#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2621            {
2622               if (pixel_bytes == 1)
2623               {
2624                  for (i = width; i; i--)
2625                  {
2626                     int j;
2627                     for (j = 0; j < png_pass_inc[pass]; j++)
2628                     {
2629                        *dp-- = *sptr;
2630                     }
2631                     --sptr;
2632                  }
2633               }
2634               else if (pixel_bytes == 3)
2635               {
2636                  for (i = width; i; i--)
2637                  {
2638                     png_byte v[8];
2639                     int j;
2640                     png_memcpy(v, sptr, 3);
2641                     for (j = 0; j < png_pass_inc[pass]; j++)
2642                     {
2643                        png_memcpy(dp, v, 3);
2644                        dp -= 3;
2645                     }
2646                     sptr -= 3;
2647                  }
2648               }
2649               else if (pixel_bytes == 2)
2650               {
2651                  for (i = width; i; i--)
2652                  {
2653                     png_byte v[8];
2654                     int j;
2655                     png_memcpy(v, sptr, 2);
2656                     for (j = 0; j < png_pass_inc[pass]; j++)
2657                     {
2658                        png_memcpy(dp, v, 2);
2659                        dp -= 2;
2660                     }
2661                     sptr -= 2;
2662                  }
2663               }
2664               else if (pixel_bytes == 4)
2665               {
2666                  for (i = width; i; i--)
2667                  {
2668                     png_byte v[8];
2669                     int j;
2670                     png_memcpy(v, sptr, 4);
2671                     for (j = 0; j < png_pass_inc[pass]; j++)
2672                     {
2673#ifdef PNG_DEBUG
2674                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2675                        {
2676                           printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2677                             row, dp, row+png_ptr->row_buf_size);
2678                           printf("row_buf=%d\n",png_ptr->row_buf_size);
2679                        }
2680#endif
2681                        png_memcpy(dp, v, 4);
2682                        dp -= 4;
2683                     }
2684                     sptr -= 4;
2685                  }
2686               }
2687               else if (pixel_bytes == 6)
2688               {
2689                  for (i = width; i; i--)
2690                  {
2691                     png_byte v[8];
2692                     int j;
2693                     png_memcpy(v, sptr, 6);
2694                     for (j = 0; j < png_pass_inc[pass]; j++)
2695                     {
2696                        png_memcpy(dp, v, 6);
2697                        dp -= 6;
2698                     }
2699                     sptr -= 6;
2700                  }
2701               }
2702               else if (pixel_bytes == 8)
2703               {
2704                  for (i = width; i; i--)
2705                  {
2706                     png_byte v[8];
2707                     int j;
2708                     png_memcpy(v, sptr, 8);
2709                     for (j = 0; j < png_pass_inc[pass]; j++)
2710                     {
2711                        png_memcpy(dp, v, 8);
2712                        dp -= 8;
2713                     }
2714                     sptr -= 8;
2715                  }
2716               }
2717               else     /* GRR:  should never be reached */
2718               {
2719                  for (i = width; i; i--)
2720                  {
2721                     png_byte v[8];
2722                     int j;
2723                     png_memcpy(v, sptr, pixel_bytes);
2724                     for (j = 0; j < png_pass_inc[pass]; j++)
2725                     {
2726                        png_memcpy(dp, v, pixel_bytes);
2727                        dp -= pixel_bytes;
2728                     }
2729                     sptr -= pixel_bytes;
2730                  }
2731               }
2732
2733            } /* end if (MMX not supported) */
2734            break;
2735         }
2736      } /* end switch (row_info->pixel_depth) */
2737
2738      row_info->width = final_width;
2739
2740      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2741   }
2742
2743} /* end png_do_read_interlace() */
2744
2745#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746#endif /* PNG_READ_INTERLACING_SUPPORTED */
2747
2748
2749
2750#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2752
2753/* These variables are utilized in the functions below.  They are declared */
2754/* globally here to ensure alignment on 8-byte boundaries. */
2755
/*
 * union uAll -- storage cell for the file-scope mask constants and work
 * variables defined in the declarator list that follows the closing brace.
 *
 * A brace initializer on a union sets its first member, so the two
 * initialized objects receive their bit patterns through `use`:
 *
 *    _LBCarryMask   0x01 in every byte (low bit of each byte set)
 *    _HBClearMask   0x7f in every byte (high bit of each byte clear)
 *
 * The remaining five objects (_ActiveMask, _ActiveMask2, _ActiveMaskEnd,
 * _ShiftBpp, _ShiftRem) have no initializer; as file-scope objects they
 * start out zero.
 * NOTE(review): presumably the filter routines store into them at run time
 * before use -- confirm against those routines.
 *
 * None of the objects is declared static, so all seven names have external
 * linkage.
 * NOTE(review): no explicit alignment attribute is given here; 8-byte
 * alignment depends on how the compiler places a union that contains
 * long long and double members -- confirm for the toolchains in use.
 */
2756union uAll {
2757   long long use;       /* integer view; the member the brace initializers set */
2758   double  align;       /* NOTE(review): presumably present only to influence
                               alignment -- confirm */
2759} _LBCarryMask = {0x0101010101010101LL},   /* 0x01 replicated into each byte */
2760  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},   /* 0x7f replicated into each byte */
2761  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;   /* zero at startup */
2762
2763#ifdef PNG_THREAD_UNSAFE_OK
2764/*===========================================================================*/
2765/*                                                                           */
2766/*           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           */
2767/*                                                                           */
2768/*===========================================================================*/
2769
2770/* Optimized code for PNG Average filter decoder */
2771
2772static void /* PRIVATE */
2773png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2774                            png_bytep prev_row)
2775{
2776   int bpp;
2777   int dummy_value_c;   /* fix 'forbidden register 2 (cx) was spilled' error */
2778   int dummy_value_S;
2779   int dummy_value_D;
2780
2781   bpp = (row_info->pixel_depth + 7) >> 3;  /* get # bytes per pixel */
2782   _FullLength  = row_info->rowbytes;       /* # of bytes to filter */
2783
2784   __asm__ __volatile__ (
2785      /* initialize address pointers and offset */
2786#ifdef __PIC__
2787      "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
2788#endif
2789/*pre "movl row, %%edi             \n\t" */ /* edi:  Avg(x) */
2790      "xorl %%ebx, %%ebx           \n\t" /* ebx:  x */
2791      "movl %%edi, %%edx           \n\t"
2792/*pre "movl prev_row, %%esi        \n\t" */ /* esi:  Prior(x) */
2793/*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
2794      "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
2795
2796      "xorl %%eax,%%eax            \n\t"
2797
2798      /* Compute the Raw value for the first bpp bytes */
2799      /*    Raw(x) = Avg(x) + (Prior(x)/2) */
2800   "avg_rlp:                       \n\t"
2801      "movb (%%esi,%%ebx,),%%al    \n\t" /* load al with Prior(x) */
2802      "incl %%ebx                  \n\t"
2803      "shrb %%al                   \n\t" /* divide by 2 */
2804      "addb -1(%%edi,%%ebx,),%%al  \n\t" /* add Avg(x); -1 to offset inc ebx */
2805/* pre "cmpl bpp, %%ebx             \n\t" */ /* (bpp is preloaded into ecx) */
2806      "cmpl %%ecx, %%ebx           \n\t"
2807      "movb %%al,-1(%%edi,%%ebx,)  \n\t" /* write Raw(x); -1 to offset inc ebx */
2808      "jb avg_rlp                  \n\t" /* mov does not affect flags */
2809
2810      /* get # of bytes to alignment */
2811      "movl %%edi, _dif            \n\t" /* take start of row */
2812      "addl %%ebx, _dif            \n\t" /* add bpp */
2813      "addl $0xf, _dif             \n\t" /* add 7+8 to incr past alignment bdry */
2814      "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
2815      "subl %%edi, _dif            \n\t" /* subtract from start => value ebx at */
2816      "jz avg_go                   \n\t" /*  alignment */
2817
2818      /* fix alignment */
2819      /* Compute the Raw value for the bytes up to the alignment boundary */
2820      /*    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2821      "xorl %%ecx, %%ecx           \n\t"
2822
2823   "avg_lp1:                       \n\t"
2824      "xorl %%eax, %%eax           \n\t"
2825      "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
2826      "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
2827      "addw %%cx, %%ax             \n\t"
2828      "incl %%ebx                  \n\t"
2829      "shrw %%ax                   \n\t" /* divide by 2 */
2830      "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
2831      "cmpl _dif, %%ebx            \n\t" /* check if at alignment boundary */
2832      "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write Raw(x); -1 to offset inc ebx */
2833      "jb avg_lp1                  \n\t" /* repeat until at alignment boundary */
2834
2835   "avg_go:                        \n\t"
2836      "movl _FullLength, %%eax     \n\t"
2837      "movl %%eax, %%ecx           \n\t"
2838      "subl %%ebx, %%eax           \n\t" /* subtract alignment fix */
2839      "andl $0x00000007, %%eax     \n\t" /* calc bytes over mult of 8 */
2840      "subl %%eax, %%ecx           \n\t" /* drop over bytes from original length */
2841      "movl %%ecx, _MMXLength      \n\t"
2842#ifdef __PIC__
2843      "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
2844#endif
2845
2846      : "=c" (dummy_value_c),            /* output regs (dummy) */
2847        "=S" (dummy_value_S),
2848        "=D" (dummy_value_D)
2849
2850      : "0" (bpp),       /* ecx          // input regs */
2851        "1" (prev_row),  /* esi */
2852        "2" (row)        /* edi */
2853
2854      : "%eax", "%edx"                   /* clobber list */
2855#ifndef __PIC__
2856      , "%ebx"
2857#endif
2858      /* GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) */
2859      /* (seems to work fine without...) */
2860   );
2861
2862   /* now do the math for the rest of the row */
2863   switch (bpp)
2864   {
2865      case 3:
2866      {
2867         _ActiveMask.use  = 0x0000000000ffffffLL;
2868         _ShiftBpp.use = 24;    /* == 3 * 8 */
2869         _ShiftRem.use = 40;    /* == 64 - 24 */
2870
2871         __asm__ __volatile__ (
2872            /* re-init address pointers and offset */
2873            "movq _ActiveMask, %%mm7      \n\t"
2874            "movl _dif, %%ecx             \n\t" /* ecx:  x = offset to */
2875            "movq _LBCarryMask, %%mm5     \n\t" /*  alignment boundary */
2876/* preload  "movl row, %%edi              \n\t" // edi:  Avg(x) */
2877            "movq _HBClearMask, %%mm4     \n\t"
2878/* preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x) */
2879
2880            /* prime the pump:  load the first Raw(x-bpp) data set */
2881            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
2882                                                /* (correct pos. in loop below) */
2883         "avg_3lp:                        \n\t"
2884            "movq (%%edi,%%ecx,), %%mm0   \n\t" /* load mm0 with Avg(x) */
2885            "movq %%mm5, %%mm3            \n\t"
2886            "psrlq _ShiftRem, %%mm2       \n\t" /* correct position Raw(x-bpp) */
2887                                                /* data */
2888            "movq (%%esi,%%ecx,), %%mm1   \n\t" /* load mm1 with Prior(x) */
2889            "movq %%mm7, %%mm6            \n\t"
2890            "pand %%mm1, %%mm3            \n\t" /* get lsb for each prev_row byte */
2891            "psrlq $1, %%mm1              \n\t" /* divide prev_row bytes by 2 */
2892            "pand  %%mm4, %%mm1           \n\t" /* clear invalid bit 7 of each */
2893                                                /* byte */
2894            "paddb %%mm1, %%mm0           \n\t" /* add (Prev_row/2) to Avg for */
2895                                                /* each byte */
2896            /* add 1st active group (Raw(x-bpp)/2) to average with LBCarry */
2897            "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2898                                                /* LBCarrys */
2899            "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2900                                                /* where both */
2901                               /* lsb's were == 1 (only valid for active group) */
2902            "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2903            "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2904                                                /* byte */
2905            "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2906                                                /* for each byte */
2907            "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 1 */
2908                                                /* bytes to add to Avg */
2909            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2910                                                /* Avg for each Active */
2911                               /*  byte */
2912            /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
2913            "psllq _ShiftBpp, %%mm6       \n\t" /* shift the mm6 mask to cover */
2914                                                /* bytes 3-5 */
2915            "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
2916            "psllq _ShiftBpp, %%mm2       \n\t" /* shift data to pos. correctly */
2917            "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2918                                                /* LBCarrys */
2919            "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2920                                                /* where both */
2921                               /* lsb's were == 1 (only valid for active group) */
2922            "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2923            "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2924                                                /* byte */
2925            "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2926                                                /* for each byte */
2927            "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 2 */
2928                                                /* bytes to add to Avg */
2929            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2930                                                /* Avg for each Active */
2931                               /*  byte */
2932
2933            /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
2934            "psllq _ShiftBpp, %%mm6       \n\t" /* shift mm6 mask to cover last */
2935                                                /* two */
2936                                 /* bytes */
2937            "movq %%mm0, %%mm2            \n\t" /* mov updated Raws to mm2 */
2938            "psllq _ShiftBpp, %%mm2       \n\t" /* shift data to pos. correctly */
2939                              /* Data only needs to be shifted once here to */
2940                              /* get the correct x-bpp offset. */
2941            "movq %%mm3, %%mm1            \n\t" /* now use mm1 for getting */
2942                                                /* LBCarrys */
2943            "pand %%mm2, %%mm1            \n\t" /* get LBCarrys for each byte */
2944                                                /* where both */
2945                              /* lsb's were == 1 (only valid for active group) */
2946            "psrlq $1, %%mm2              \n\t" /* divide raw bytes by 2 */
2947            "pand  %%mm4, %%mm2           \n\t" /* clear invalid bit 7 of each */
2948                                                /* byte */
2949            "paddb %%mm1, %%mm2           \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
2950                                                /* for each byte */
2951            "pand %%mm6, %%mm2            \n\t" /* leave only Active Group 2 */
2952                                                /* bytes to add to Avg */
2953            "addl $8, %%ecx               \n\t"
2954            "paddb %%mm2, %%mm0           \n\t" /* add (Raw/2) + LBCarrys to */
2955                                                /* Avg for each Active */
2956                                                /* byte */
2957            /* now ready to write back to memory */
2958            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959            /* move updated Raw(x) to use as Raw(x-bpp) for next loop */
2960            "cmpl _MMXLength, %%ecx       \n\t"
2961            "movq %%mm0, %%mm2            \n\t" /* mov updated Raw(x) to mm2 */
2962            "jb avg_3lp                   \n\t"
2963
2964            : "=S" (dummy_value_S),             /* output regs (dummy) */
2965              "=D" (dummy_value_D)
2966
2967            : "0" (prev_row),  /* esi           // input regs */
2968              "1" (row)        /* edi */
2969
2970            : "%ecx"                            /* clobber list */
2971#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972            , "%mm0", "%mm1", "%mm2", "%mm3"
2973            , "%mm4", "%mm5", "%mm6", "%mm7"
2974#endif
2975         );
2976      }
2977      break;  /* end 3 bpp */
2978
2979      case 6:
2980      case 4:
2981      //case 7:   /* who wrote this?  PNG doesn't support 5 or 7 bytes/pixel */
2982      //case 5:   /* GRR BOGUS */
2983      {
2984         _ActiveMask.use  = 0xffffffffffffffffLL; /* use shift below to clear */
2985                                                  /* appropriate inactive bytes */
2986         _ShiftBpp.use = bpp << 3;
2987         _ShiftRem.use = 64 - _ShiftBpp.use;
2988
2989         __asm__ __volatile__ (
2990            "movq _HBClearMask, %%mm4    \n\t"
2991
2992            /* re-init address pointers and offset */
2993            "movl _dif, %%ecx            \n\t" /* ecx:  x = offset to */
2994                                               /* alignment boundary */
2995
2996            /* load _ActiveMask and clear all bytes except for 1st active group */
2997            "movq _ActiveMask, %%mm7     \n\t"
2998/* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
2999            "psrlq _ShiftRem, %%mm7      \n\t"
3000/* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3001            "movq %%mm7, %%mm6           \n\t"
3002            "movq _LBCarryMask, %%mm5    \n\t"
3003            "psllq _ShiftBpp, %%mm6      \n\t" /* create mask for 2nd active */
3004                                               /* group */
3005
3006            /* prime the pump:  load the first Raw(x-bpp) data set */
3007            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3008                                          /* (we correct pos. in loop below) */
3009         "avg_4lp:                       \n\t"
3010            "movq (%%edi,%%ecx,), %%mm0  \n\t"
3011            "psrlq _ShiftRem, %%mm2      \n\t" /* shift data to pos. correctly */
3012            "movq (%%esi,%%ecx,), %%mm1  \n\t"
3013            /* add (Prev_row/2) to average */
3014            "movq %%mm5, %%mm3           \n\t"
3015            "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3016            "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3017            "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3018                                               /* byte */
3019            "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3020                                               /* each byte */
3021            /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3022            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3023                                               /* LBCarrys */
3024            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3025                                               /* where both */
3026                              /* lsb's were == 1 (only valid for active group) */
3027            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3028            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3029                                               /* byte */
3030            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3031                                               /* for each byte */
3032            "pand %%mm7, %%mm2           \n\t" /* leave only Active Group 1 */
3033                                               /* bytes to add to Avg */
3034            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to Avg */
3035                                               /* for each Active */
3036                              /* byte */
3037            /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3038            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3039            "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3040            "addl $8, %%ecx              \n\t"
3041            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3042                                               /* LBCarrys */
3043            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3044                                               /* where both */
3045                              /* lsb's were == 1 (only valid for active group) */
3046            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3047            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3048                                               /* byte */
3049            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3050                                               /* for each byte */
3051            "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3052                                               /* bytes to add to Avg */
3053            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3054                                               /* Avg for each Active */
3055                              /* byte */
3056            "cmpl _MMXLength, %%ecx      \n\t"
3057            /* now ready to write back to memory */
3058            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059            /* prep Raw(x-bpp) for next loop */
3060            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3061            "jb avg_4lp                  \n\t"
3062
3063            : "=S" (dummy_value_S),            /* output regs (dummy) */
3064              "=D" (dummy_value_D)
3065
3066            : "0" (prev_row),  /* esi          // input regs */
3067              "1" (row)        /* edi */
3068
3069            : "%ecx"                           /* clobber list */
3070#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071            , "%mm0", "%mm1", "%mm2", "%mm3"
3072            , "%mm4", "%mm5", "%mm6", "%mm7"
3073#endif
3074         );
3075      }
3076      break;  /* end 4,6 bpp */
3077
3078      case 2:
3079      {
3080         _ActiveMask.use  = 0x000000000000ffffLL;
3081         _ShiftBpp.use = 16;   /* == 2 * 8 */
3082         _ShiftRem.use = 48;   /* == 64 - 16 */
3083
3084         __asm__ __volatile__ (
3085            /* load _ActiveMask */
3086            "movq _ActiveMask, %%mm7     \n\t"
3087            /* re-init address pointers and offset */
3088            "movl _dif, %%ecx            \n\t" /* ecx:  x = offset to alignment */
3089                                               /* boundary */
3090            "movq _LBCarryMask, %%mm5    \n\t"
3091/* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3092            "movq _HBClearMask, %%mm4    \n\t"
3093/* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3094
3095            /* prime the pump:  load the first Raw(x-bpp) data set */
3096            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3097                              /* (we correct pos. in loop below) */
3098         "avg_2lp:                       \n\t"
3099            "movq (%%edi,%%ecx,), %%mm0  \n\t"
3100            "psrlq _ShiftRem, %%mm2      \n\t" /* shift data to pos. correctly */
3101            "movq (%%esi,%%ecx,), %%mm1  \n\t" /*  (GRR BUGFIX:  was psllq) */
3102            /* add (Prev_row/2) to average */
3103            "movq %%mm5, %%mm3           \n\t"
3104            "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3105            "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3106            "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3107                                               /* byte */
3108            "movq %%mm7, %%mm6           \n\t"
3109            "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3110                                               /* each byte */
3111
3112            /* add 1st active group (Raw(x-bpp)/2) to average with _LBCarry */
3113            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3114                                               /* LBCarrys */
3115            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3116                                               /* where both */
3117                                               /* lsb's were == 1 (only valid */
3118                                               /* for active group) */
3119            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3120            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3121                                               /* byte */
3122            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3123                                               /* for each byte */
3124            "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 1 */
3125                                               /* bytes to add to Avg */
3126            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to Avg */
3127                                               /* for each Active byte */
3128
3129            /* add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry */
3130            "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3131                                               /* bytes 2 & 3 */
3132            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3133            "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3134            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3135                                               /* LBCarrys */
3136            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3137                                               /* where both */
3138                                               /* lsb's were == 1 (only valid */
3139                                               /* for active group) */
3140            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3141            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3142                                               /* byte */
3143            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3144                                               /* for each byte */
3145            "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3146                                               /* bytes to add to Avg */
3147            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3148                                               /* Avg for each Active byte */
3149
3150            /* add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry */
3151            "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3152                                               /* bytes 4 & 5 */
3153            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3154            "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3155            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3156                                               /* LBCarrys */
3157            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3158                                               /* where both lsb's were == 1 */
3159                                               /* (only valid for active group) */
3160            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3161            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3162                                               /* byte */
3163            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3164                                               /* for each byte */
3165            "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3166                                               /* bytes to add to Avg */
3167            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3168                                               /* Avg for each Active byte */
3169
3170            /* add 4th active group (Raw(x-bpp)/2) to average with _LBCarry */
3171            "psllq _ShiftBpp, %%mm6      \n\t" /* shift the mm6 mask to cover */
3172                                               /* bytes 6 & 7 */
3173            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3174            "psllq _ShiftBpp, %%mm2      \n\t" /* shift data to pos. correctly */
3175            "addl $8, %%ecx              \n\t"
3176            "movq %%mm3, %%mm1           \n\t" /* now use mm1 for getting */
3177                                               /* LBCarrys */
3178            "pand %%mm2, %%mm1           \n\t" /* get LBCarrys for each byte */
3179                                               /* where both */
3180                                               /* lsb's were == 1 (only valid */
3181                                               /* for active group) */
3182            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3183            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3184                                               /* byte */
3185            "paddb %%mm1, %%mm2          \n\t" /* add LBCarrys to (Raw(x-bpp)/2) */
3186                                               /* for each byte */
3187            "pand %%mm6, %%mm2           \n\t" /* leave only Active Group 2 */
3188                                               /* bytes to add to Avg */
3189            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) + LBCarrys to */
3190                                               /* Avg for each Active byte */
3191
3192            "cmpl _MMXLength, %%ecx      \n\t"
3193            /* now ready to write back to memory */
3194            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3195            /* prep Raw(x-bpp) for next loop */
3196            "movq %%mm0, %%mm2           \n\t" /* mov updated Raws to mm2 */
3197            "jb avg_2lp                  \n\t"
3198
3199            : "=S" (dummy_value_S),            /* output regs (dummy) */
3200              "=D" (dummy_value_D)
3201
3202            : "0" (prev_row),  /* esi          // input regs */
3203              "1" (row)        /* edi */
3204
3205            : "%ecx"                           /* clobber list */
3206#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3207            , "%mm0", "%mm1", "%mm2", "%mm3"
3208            , "%mm4", "%mm5", "%mm6", "%mm7"
3209#endif
3210         );
3211      }
3212      break;  /* end 2 bpp */
3213
3214      case 1:
3215      {
3216         __asm__ __volatile__ (
3217            /* re-init address pointers and offset */
3218#ifdef __PIC__
3219            "pushl %%ebx                 \n\t" /* save Global Offset Table index */
3220#endif
3221            "movl _dif, %%ebx            \n\t" /* ebx:  x = offset to alignment */
3222                                               /* boundary */
3223/* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3224            "cmpl _FullLength, %%ebx     \n\t" /* test if offset at end of array */
3225            "jnb avg_1end                \n\t"
3226            /* do Avg decode for remaining bytes */
3227/* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3228            "movl %%edi, %%edx           \n\t"
3229/* preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx) */
3230            "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
3231            "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx */
3232                                               /*  in loop below */
3233         "avg_1lp:                       \n\t"
3234            /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3235            "xorl %%eax, %%eax           \n\t"
3236            "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
3237            "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
3238            "addw %%cx, %%ax             \n\t"
3239            "incl %%ebx                  \n\t"
3240            "shrw %%ax                   \n\t" /* divide by 2 */
3241            "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset */
3242                                               /* inc ebx */
3243            "cmpl _FullLength, %%ebx     \n\t" /* check if at end of array */
3244            "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x); */
3245                         /* mov does not affect flags; -1 to offset inc ebx */
3246            "jb avg_1lp                  \n\t"
3247
3248         "avg_1end:                      \n\t"
3249#ifdef __PIC__
3250            "popl %%ebx                  \n\t" /* Global Offset Table index */
3251#endif
3252
3253            : "=c" (dummy_value_c),            /* output regs (dummy) */
3254              "=S" (dummy_value_S),
3255              "=D" (dummy_value_D)
3256
3257            : "0" (bpp),       /* ecx          // input regs */
3258              "1" (prev_row),  /* esi */
3259              "2" (row)        /* edi */
3260
3261            : "%eax", "%edx"                   /* clobber list */
3262#ifndef __PIC__
3263            , "%ebx"
3264#endif
3265         );
3266      }
3267      return;  /* end 1 bpp */
3268
3269      case 8:
3270      {
3271         __asm__ __volatile__ (
3272            /* re-init address pointers and offset */
3273            "movl _dif, %%ecx            \n\t" /* ecx:  x == offset to alignment */
3274            "movq _LBCarryMask, %%mm5    \n\t" /*            boundary */
3275/* preload  "movl row, %%edi             \n\t" // edi:  Avg(x) */
3276            "movq _HBClearMask, %%mm4    \n\t"
3277/* preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x) */
3278
3279            /* prime the pump:  load the first Raw(x-bpp) data set */
3280            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" /* load previous aligned 8 bytes */
3281                                      /* (NO NEED to correct pos. in loop below) */
3282
3283         "avg_8lp:                       \n\t"
3284            "movq (%%edi,%%ecx,), %%mm0  \n\t"
3285            "movq %%mm5, %%mm3           \n\t"
3286            "movq (%%esi,%%ecx,), %%mm1  \n\t"
3287            "addl $8, %%ecx              \n\t"
3288            "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3289            "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3290            "pand %%mm2, %%mm3           \n\t" /* get LBCarrys for each byte */
3291                                               /*  where both lsb's were == 1 */
3292            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3293            "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7, each byte */
3294            "paddb %%mm3, %%mm0          \n\t" /* add LBCarrys to Avg, each byte */
3295            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7, each byte */
3296            "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg, each */
3297            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) to Avg for each */
3298            "cmpl _MMXLength, %%ecx      \n\t"
3299            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3300            "movq %%mm0, %%mm2           \n\t" /* reuse as Raw(x-bpp) */
3301            "jb avg_8lp                  \n\t"
3302
3303            : "=S" (dummy_value_S),            /* output regs (dummy) */
3304              "=D" (dummy_value_D)
3305
3306            : "0" (prev_row),  /* esi          // input regs */
3307              "1" (row)        /* edi */
3308
3309            : "%ecx"                           /* clobber list */
3310#if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3311            , "%mm0", "%mm1", "%mm2"
3312            , "%mm3", "%mm4", "%mm5"
3313#endif
3314         );
3315      }
3316      break;  /* end 8 bpp */
3317
3318      default:                  /* bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8) */
3319      {
3320
3321#ifdef PNG_DEBUG
3322         /* GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED */
3323        png_debug(1,
3324        "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3325#endif
3326
3327#if 0
3328        __asm__ __volatile__ (
3329            "movq _LBCarryMask, %%mm5    \n\t"
3330            /* re-init address pointers and offset */
3331            "movl _dif, %%ebx            \n\t" /* ebx:  x = offset to */
3332                                               /* alignment boundary */
3333            "movl row, %%edi             \n\t" /* edi:  Avg(x) */
3334            "movq _HBClearMask, %%mm4    \n\t"
3335            "movl %%edi, %%edx           \n\t"
3336            "movl prev_row, %%esi        \n\t" /* esi:  Prior(x) */
3337            "subl bpp, %%edx             \n\t" /* edx:  Raw(x-bpp) */
3338         "avg_Alp:                       \n\t"
3339            "movq (%%edi,%%ebx,), %%mm0  \n\t"
3340            "movq %%mm5, %%mm3           \n\t"
3341            "movq (%%esi,%%ebx,), %%mm1  \n\t"
3342            "pand %%mm1, %%mm3           \n\t" /* get lsb for each prev_row byte */
3343            "movq (%%edx,%%ebx,), %%mm2  \n\t"
3344            "psrlq $1, %%mm1             \n\t" /* divide prev_row bytes by 2 */
3345            "pand %%mm2, %%mm3           \n\t" /* get LBCarrys for each byte */
3346                                               /* where both lsb's were == 1 */
3347            "psrlq $1, %%mm2             \n\t" /* divide raw bytes by 2 */
3348            "pand  %%mm4, %%mm1          \n\t" /* clear invalid bit 7 of each */
3349                                               /* byte */
3350            "paddb %%mm3, %%mm0          \n\t" /* add LBCarrys to Avg for each */
3351                                               /* byte */
3352            "pand  %%mm4, %%mm2          \n\t" /* clear invalid bit 7 of each */
3353                                               /* byte */
3354            "paddb %%mm1, %%mm0          \n\t" /* add (Prev_row/2) to Avg for */
3355                                               /* each byte */
3356            "addl $8, %%ebx              \n\t"
3357            "paddb %%mm2, %%mm0          \n\t" /* add (Raw/2) to Avg for each */
3358                                               /* byte */
3359            "cmpl _MMXLength, %%ebx      \n\t"
3360            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3361            "jb avg_Alp                  \n\t"
3362
3363            : /* FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var) */
3364
3365            : /* FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest) */
3366
3367            : "%ebx", "%edx", "%edi", "%esi" /* CHECKASM: clobber list */
3368         );
3369#endif /* 0 - NEVER REACHED */
3370      }
3371      break;
3372
3373   } /* end switch (bpp) */
3374
3375   __asm__ __volatile__ (
3376      /* MMX acceleration complete; now do clean-up */
3377      /* check if any remaining bytes left to decode */
3378#ifdef __PIC__
3379      "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
3380#endif
3381      "movl _MMXLength, %%ebx      \n\t" /* ebx:  x == offset bytes after MMX */
3382/* pre "movl row, %%edi             \n\t" */ /* edi:  Avg(x) */
3383      "cmpl _FullLength, %%ebx     \n\t" /* test if offset at end of array */
3384      "jnb avg_end                 \n\t"
3385
3386      /* do Avg decode for remaining bytes */
3387/*pre "movl prev_row, %%esi        \n\t" */ /* esi:  Prior(x) */
3388      "movl %%edi, %%edx           \n\t"
3389/*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
3390      "subl %%ecx, %%edx           \n\t" /* edx:  Raw(x-bpp) */
3391      "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx below */
3392
3393   "avg_lp2:                       \n\t"
3394      /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
3395      "xorl %%eax, %%eax           \n\t"
3396      "movb (%%esi,%%ebx,), %%cl   \n\t" /* load cl with Prior(x) */
3397      "movb (%%edx,%%ebx,), %%al   \n\t" /* load al with Raw(x-bpp) */
3398      "addw %%cx, %%ax             \n\t"
3399      "incl %%ebx                  \n\t"
3400      "shrw %%ax                   \n\t" /* divide by 2 */
3401      "addb -1(%%edi,%%ebx,), %%al \n\t" /* add Avg(x); -1 to offset inc ebx */
3402      "cmpl _FullLength, %%ebx     \n\t" /* check if at end of array */
3403      "movb %%al, -1(%%edi,%%ebx,) \n\t" /* write back Raw(x) [mov does not */
3404      "jb avg_lp2                  \n\t" /*  affect flags; -1 to offset inc ebx] */
3405
3406   "avg_end:                       \n\t"
3407      "EMMS                        \n\t" /* end MMX; prep for poss. FP instrs. */
3408#ifdef __PIC__
3409      "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
3410#endif
3411
3412      : "=c" (dummy_value_c),            /* output regs (dummy) */
3413        "=S" (dummy_value_S),
3414        "=D" (dummy_value_D)
3415
3416      : "0" (bpp),       /* ecx          // input regs */
3417        "1" (prev_row),  /* esi */
3418        "2" (row)        /* edi */
3419
3420      : "%eax", "%edx"                   /* clobber list */
3421#ifndef __PIC__
3422      , "%ebx"
3423#endif
3424   );
3425
3426} /* end png_read_filter_row_mmx_avg() */
3427#endif
3428
3429
3430
3431#ifdef PNG_THREAD_UNSAFE_OK
3432/*===========================================================================*/
3433/*                                                                           */
3434/*         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         */
3435/*                                                                           */
3436/*===========================================================================*/
3437
3438/* Optimized code for PNG Paeth filter decoder */
3439
3440static void /* PRIVATE */
3441png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3442                              png_bytep prev_row)
3443{
3444   int bpp;
3445   int dummy_value_c;   /* fix 'forbidden register 2 (cx) was spilled' error */
3446   int dummy_value_S;
3447   int dummy_value_D;
3448
3449   bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3450   _FullLength  = row_info->rowbytes; /* # of bytes to filter */
3451
3452   __asm__ __volatile__ (
3453#ifdef __PIC__
3454      "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
3455#endif
3456      "xorl %%ebx, %%ebx           \n\t" /* ebx:  x offset */
3457/*pre "movl row, %%edi             \n\t" */
3458      "xorl %%edx, %%edx           \n\t" /* edx:  x-bpp offset */
3459/*pre "movl prev_row, %%esi        \n\t" */
3460      "xorl %%eax, %%eax           \n\t"
3461
3462      /* Compute the Raw value for the first bpp bytes */
3463      /* Note: for x < bpp the Paeth predictor reduces to Prior(x), so */
3464      /*   Raw(x) = Paeth(x) + Prior(x)      where x < bpp */
3465   "paeth_rlp:                     \n\t"
3466      "movb (%%edi,%%ebx,), %%al   \n\t"
3467      "addb (%%esi,%%ebx,), %%al   \n\t"
3468      "incl %%ebx                  \n\t"
3469/*pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx) */
3470      "cmpl %%ecx, %%ebx           \n\t"
3471      "movb %%al, -1(%%edi,%%ebx,) \n\t"
3472      "jb paeth_rlp                \n\t"
3473      /* get # of bytes to alignment */
3474      "movl %%edi, _dif            \n\t" /* take start of row */
3475      "addl %%ebx, _dif            \n\t" /* add bpp */
3476      "xorl %%ecx, %%ecx           \n\t"
3477      "addl $0xf, _dif             \n\t" /* add 7 + 8 to incr past alignment */
3478                                         /* boundary */
3479      "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
3480      "subl %%edi, _dif            \n\t" /* subtract from start ==> value ebx */
3481                                         /* at alignment */
3482      "jz paeth_go                 \n\t"
3483      /* fix alignment */
3484
3485   "paeth_lp1:                     \n\t"
3486      "xorl %%eax, %%eax           \n\t"
3487      /* pav = p - a = (a + b - c) - a = b - c */
3488      "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
3489      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3490      "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
3491      "movl %%eax, _patemp         \n\t" /* Save pav for later use */
3492      "xorl %%eax, %%eax           \n\t"
3493      /* pbv = p - b = (a + b - c) - b = a - c */
3494      "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
3495      "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
3496      "movl %%eax, %%ecx           \n\t"
3497      /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3498      "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
3499      /* pc = abs(pcv) */
3500      "testl $0x80000000, %%eax    \n\t"
3501      "jz paeth_pca                \n\t"
3502      "negl %%eax                  \n\t" /* reverse sign of neg values */
3503
3504   "paeth_pca:                     \n\t"
3505      "movl %%eax, _pctemp         \n\t" /* save pc for later use */
3506      /* pb = abs(pbv) */
3507      "testl $0x80000000, %%ecx    \n\t"
3508      "jz paeth_pba                \n\t"
3509      "negl %%ecx                  \n\t" /* reverse sign of neg values */
3510
3511   "paeth_pba:                     \n\t"
3512      "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
3513      /* pa = abs(pav) */
3514      "movl _patemp, %%eax         \n\t"
3515      "testl $0x80000000, %%eax    \n\t"
3516      "jz paeth_paa                \n\t"
3517      "negl %%eax                  \n\t" /* reverse sign of neg values */
3518
3519   "paeth_paa:                     \n\t"
3520      "movl %%eax, _patemp         \n\t" /* save pa for later use */
3521      /* test if pa <= pb */
3522      "cmpl %%ecx, %%eax           \n\t"
3523      "jna paeth_abb               \n\t"
3524      /* pa > pb; now test if pb <= pc */
3525      "cmpl _pctemp, %%ecx         \n\t"
3526      "jna paeth_bbc               \n\t"
3527      /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3528      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3529      "jmp paeth_paeth             \n\t"
3530
3531   "paeth_bbc:                     \n\t"
3532      /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3533      "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
3534      "jmp paeth_paeth             \n\t"
3535
3536   "paeth_abb:                     \n\t"
3537      /* pa <= pb; now test if pa <= pc */
3538      "cmpl _pctemp, %%eax         \n\t"
3539      "jna paeth_abc               \n\t"
3540      /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3541      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
3542      "jmp paeth_paeth             \n\t"
3543
3544   "paeth_abc:                     \n\t"
3545      /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3546      "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
3547
3548   "paeth_paeth:                   \n\t"
3549      "incl %%ebx                  \n\t"
3550      "incl %%edx                  \n\t"
3551      /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3552      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3553      "cmpl _dif, %%ebx            \n\t"
3554      "jb paeth_lp1                \n\t"
3555
3556   "paeth_go:                      \n\t"
3557      "movl _FullLength, %%ecx     \n\t"
3558      "movl %%ecx, %%eax           \n\t"
3559      "subl %%ebx, %%eax           \n\t" /* subtract alignment fix */
3560      "andl $0x00000007, %%eax     \n\t" /* calc bytes over mult of 8 */
3561      "subl %%eax, %%ecx           \n\t" /* drop over bytes from original length */
3562      "movl %%ecx, _MMXLength      \n\t"
3563#ifdef __PIC__
3564      "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
3565#endif
3566
3567      : "=c" (dummy_value_c),            /* output regs (dummy) */
3568        "=S" (dummy_value_S),
3569        "=D" (dummy_value_D)
3570
3571      : "0" (bpp),       /* ecx          // input regs */
3572        "1" (prev_row),  /* esi */
3573        "2" (row)        /* edi */
3574
3575      : "%eax", "%edx"                   /* clobber list */
3576#ifndef __PIC__
3577      , "%ebx"
3578#endif
3579   );
3580
3581   /* now do the math for the rest of the row */
3582   switch (bpp)
3583   {
3584      case 3:
3585      {
3586         _ActiveMask.use = 0x0000000000ffffffLL;
3587         _ActiveMaskEnd.use = 0xffff000000000000LL;
3588         _ShiftBpp.use = 24;    /* == bpp(3) * 8 */
3589         _ShiftRem.use = 40;    /* == 64 - 24 */
3590
3591         __asm__ __volatile__ (
3592            "movl _dif, %%ecx            \n\t"
3593/* preload  "movl row, %%edi             \n\t" */
3594/* preload  "movl prev_row, %%esi        \n\t" */
3595            "pxor %%mm0, %%mm0           \n\t"
3596            /* prime the pump:  load the first Raw(x-bpp) data set */
3597            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3598         "paeth_3lp:                     \n\t"
3599            "psrlq _ShiftRem, %%mm1      \n\t" /* shift last 3 bytes to 1st */
3600                                               /* 3 bytes */
3601            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3602            "punpcklbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3603            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* prep c=Prior(x-bpp) bytes */
3604            "punpcklbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3605            "psrlq _ShiftRem, %%mm3      \n\t" /* shift last 3 bytes to 1st */
3606                                               /* 3 bytes */
3607            /* pav = p - a = (a + b - c) - a = b - c */
3608            "movq %%mm2, %%mm4           \n\t"
3609            "punpcklbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3610            /* pbv = p - b = (a + b - c) - b = a - c */
3611            "movq %%mm1, %%mm5           \n\t"
3612            "psubw %%mm3, %%mm4          \n\t"
3613            "pxor %%mm7, %%mm7           \n\t"
3614            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3615            "movq %%mm4, %%mm6           \n\t"
3616            "psubw %%mm3, %%mm5          \n\t"
3617
3618            /* pa = abs(p-a) = abs(pav) */
3619            /* pb = abs(p-b) = abs(pbv) */
3620            /* pc = abs(p-c) = abs(pcv) */
3621            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3622            "paddw %%mm5, %%mm6          \n\t"
3623            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3624            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3625            "psubw %%mm0, %%mm4          \n\t"
3626            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3627            "psubw %%mm0, %%mm4          \n\t"
3628            "psubw %%mm7, %%mm5          \n\t"
3629            "pxor %%mm0, %%mm0           \n\t"
3630            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3631            "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3632            "psubw %%mm7, %%mm5          \n\t"
3633            "psubw %%mm0, %%mm6          \n\t"
3634            /*  test pa <= pb */
3635            "movq %%mm4, %%mm7           \n\t"
3636            "psubw %%mm0, %%mm6          \n\t"
3637            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3638            "movq %%mm7, %%mm0           \n\t"
3639            /* use mm7 mask to merge pa & pb */
3640            "pand %%mm7, %%mm5           \n\t"
3641            /* use mm0 mask copy to merge a & b */
3642            "pand %%mm0, %%mm2           \n\t"
3643            "pandn %%mm4, %%mm7          \n\t"
3644            "pandn %%mm1, %%mm0          \n\t"
3645            "paddw %%mm5, %%mm7          \n\t"
3646            "paddw %%mm2, %%mm0          \n\t"
3647            /*  test  ((pa <= pb)? pa:pb) <= pc */
3648            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3649            "pxor %%mm1, %%mm1           \n\t"
3650            "pand %%mm7, %%mm3           \n\t"
3651            "pandn %%mm0, %%mm7          \n\t"
3652            "paddw %%mm3, %%mm7          \n\t"
3653            "pxor %%mm0, %%mm0           \n\t"
3654            "packuswb %%mm1, %%mm7       \n\t"
3655            "movq (%%esi,%%ecx,), %%mm3  \n\t" /* load c=Prior(x-bpp) */
3656            "pand _ActiveMask, %%mm7     \n\t"
3657            "movq %%mm3, %%mm2           \n\t" /* load b=Prior(x) step 1 */
3658            "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3659            "punpcklbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3660            "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3661            "movq %%mm7, %%mm1           \n\t" /* now mm1 will be used as */
3662                                               /* Raw(x-bpp) */
3663            /* now do Paeth for 2nd set of bytes (3-5) */
3664            "psrlq _ShiftBpp, %%mm2      \n\t" /* load b=Prior(x) step 2 */
3665            "punpcklbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3666            "pxor %%mm7, %%mm7           \n\t"
3667            "punpcklbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3668            /* pbv = p - b = (a + b - c) - b = a - c */
3669            "movq %%mm1, %%mm5           \n\t"
3670            /* pav = p - a = (a + b - c) - a = b - c */
3671            "movq %%mm2, %%mm4           \n\t"
3672            "psubw %%mm3, %%mm5          \n\t"
3673            "psubw %%mm3, %%mm4          \n\t"
3674            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
3675            /*       pav + pbv = pbv + pav */
3676            "movq %%mm5, %%mm6           \n\t"
3677            "paddw %%mm4, %%mm6          \n\t"
3678
3679            /* pa = abs(p-a) = abs(pav) */
3680            /* pb = abs(p-b) = abs(pbv) */
3681            /* pc = abs(p-c) = abs(pcv) */
3682            "pcmpgtw %%mm5, %%mm0        \n\t" /* create mask pbv bytes < 0 */
3683            "pcmpgtw %%mm4, %%mm7        \n\t" /* create mask pav bytes < 0 */
3684            "pand %%mm5, %%mm0           \n\t" /* only pbv bytes < 0 in mm0 */
3685            "pand %%mm4, %%mm7           \n\t" /* only pav bytes < 0 in mm7 */
3686            "psubw %%mm0, %%mm5          \n\t"
3687            "psubw %%mm7, %%mm4          \n\t"
3688            "psubw %%mm0, %%mm5          \n\t"
3689            "psubw %%mm7, %%mm4          \n\t"
3690            "pxor %%mm0, %%mm0           \n\t"
3691            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3692            "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3693            "psubw %%mm0, %%mm6          \n\t"
3694            /*  test pa <= pb */
3695            "movq %%mm4, %%mm7           \n\t"
3696            "psubw %%mm0, %%mm6          \n\t"
3697            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3698            "movq %%mm7, %%mm0           \n\t"
3699            /* use mm7 mask to merge pa & pb */
3700            "pand %%mm7, %%mm5           \n\t"
3701            /* use mm0 mask copy to merge a & b */
3702            "pand %%mm0, %%mm2           \n\t"
3703            "pandn %%mm4, %%mm7          \n\t"
3704            "pandn %%mm1, %%mm0          \n\t"
3705            "paddw %%mm5, %%mm7          \n\t"
3706            "paddw %%mm2, %%mm0          \n\t"
3707            /*  test  ((pa <= pb)? pa:pb) <= pc */
3708            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3709            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3710            "pand %%mm7, %%mm3           \n\t"
3711            "pandn %%mm0, %%mm7          \n\t"
3712            "pxor %%mm1, %%mm1           \n\t"
3713            "paddw %%mm3, %%mm7          \n\t"
3714            "pxor %%mm0, %%mm0           \n\t"
3715            "packuswb %%mm1, %%mm7       \n\t"
3716            "movq %%mm2, %%mm3           \n\t" /* load c=Prior(x-bpp) step 1 */
3717            "pand _ActiveMask, %%mm7     \n\t"
3718            "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3719            "psllq _ShiftBpp, %%mm7      \n\t" /* shift bytes to 2nd group of */
3720                                               /* 3 bytes */
3721             /* pav = p - a = (a + b - c) - a = b - c */
3722            "movq %%mm2, %%mm4           \n\t"
3723            "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
3724            "psllq _ShiftBpp, %%mm3      \n\t" /* load c=Prior(x-bpp) step 2 */
3725            "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3726            "movq %%mm7, %%mm1           \n\t"
3727            "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3728            "psllq _ShiftBpp, %%mm1      \n\t" /* shift bytes */
3729                                    /* now mm1 will be used as Raw(x-bpp) */
3730            /* now do Paeth for 3rd, and final, set of bytes (6-7) */
3731            "pxor %%mm7, %%mm7           \n\t"
3732            "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3733            "psubw %%mm3, %%mm4          \n\t"
3734            /* pbv = p - b = (a + b - c) - b = a - c */
3735            "movq %%mm1, %%mm5           \n\t"
3736            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3737            "movq %%mm4, %%mm6           \n\t"
3738            "psubw %%mm3, %%mm5          \n\t"
3739            "pxor %%mm0, %%mm0           \n\t"
3740            "paddw %%mm5, %%mm6          \n\t"
3741
3742            /* pa = abs(p-a) = abs(pav) */
3743            /* pb = abs(p-b) = abs(pbv) */
3744            /* pc = abs(p-c) = abs(pcv) */
3745            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3746            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3747            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3748            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3749            "psubw %%mm0, %%mm4          \n\t"
3750            "psubw %%mm7, %%mm5          \n\t"
3751            "psubw %%mm0, %%mm4          \n\t"
3752            "psubw %%mm7, %%mm5          \n\t"
3753            "pxor %%mm0, %%mm0           \n\t"
3754            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3755            "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3756            "psubw %%mm0, %%mm6          \n\t"
3757            /*  test pa <= pb */
3758            "movq %%mm4, %%mm7           \n\t"
3759            "psubw %%mm0, %%mm6          \n\t"
3760            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3761            "movq %%mm7, %%mm0           \n\t"
3762            /* use mm0 mask copy to merge a & b */
3763            "pand %%mm0, %%mm2           \n\t"
3764            /* use mm7 mask to merge pa & pb */
3765            "pand %%mm7, %%mm5           \n\t"
3766            "pandn %%mm1, %%mm0          \n\t"
3767            "pandn %%mm4, %%mm7          \n\t"
3768            "paddw %%mm2, %%mm0          \n\t"
3769            "paddw %%mm5, %%mm7          \n\t"
3770            /*  test  ((pa <= pb)? pa:pb) <= pc */
3771            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3772            "pand %%mm7, %%mm3           \n\t"
3773            "pandn %%mm0, %%mm7          \n\t"
3774            "paddw %%mm3, %%mm7          \n\t"
3775            "pxor %%mm1, %%mm1           \n\t"
3776            "packuswb %%mm7, %%mm1       \n\t"
3777            /* step ecx to next set of 8 bytes and repeat loop til done */
3778            "addl $8, %%ecx              \n\t"
3779            "pand _ActiveMaskEnd, %%mm1  \n\t"
3780            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with */
3781                                                 /* Raw(x) */
3782
3783            "cmpl _MMXLength, %%ecx      \n\t"
3784            "pxor %%mm0, %%mm0           \n\t" /* pxor does not affect flags */
3785            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3786                                 /* mm1 will be used as Raw(x-bpp) next loop */
3787                           /* mm3 ready to be used as Prior(x-bpp) next loop */
3788            "jb paeth_3lp                \n\t"
3789
3790            : "=S" (dummy_value_S),             /* output regs (dummy) */
3791              "=D" (dummy_value_D)
3792
3793            : "0" (prev_row),  /* esi           // input regs */
3794              "1" (row)        /* edi */
3795
3796            : "%ecx"                            /* clobber list */
3797#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3798            , "%mm0", "%mm1", "%mm2", "%mm3"
3799            , "%mm4", "%mm5", "%mm6", "%mm7"
3800#endif
3801         );
3802      }
3803      break;  /* end 3 bpp */
3804
3805      case 6:
3806      //case 7:   /* GRR BOGUS */
3807      //case 5:   /* GRR BOGUS */
3808      {
3809         _ActiveMask.use  = 0x00000000ffffffffLL;
3810         _ActiveMask2.use = 0xffffffff00000000LL;
3811         _ShiftBpp.use = bpp << 3;    /* == bpp * 8 */
3812         _ShiftRem.use = 64 - _ShiftBpp.use;
3813
3814         __asm__ __volatile__ (
3815            "movl _dif, %%ecx            \n\t"
3816/* preload  "movl row, %%edi             \n\t" */
3817/* preload  "movl prev_row, %%esi        \n\t" */
3818            /* prime the pump:  load the first Raw(x-bpp) data set */
3819            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3820            "pxor %%mm0, %%mm0           \n\t"
3821
3822         "paeth_6lp:                     \n\t"
3823            /* must shift to position Raw(x-bpp) data */
3824            "psrlq _ShiftRem, %%mm1      \n\t"
3825            /* do first set of 4 bytes */
3826            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3827            "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
3828            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3829            "punpcklbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
3830            /* must shift to position Prior(x-bpp) data */
3831            "psrlq _ShiftRem, %%mm3      \n\t"
3832            /* pav = p - a = (a + b - c) - a = b - c */
3833            "movq %%mm2, %%mm4           \n\t"
3834            "punpcklbw %%mm0, %%mm3      \n\t" /* unpack Low bytes of c */
3835            /* pbv = p - b = (a + b - c) - b = a - c */
3836            "movq %%mm1, %%mm5           \n\t"
3837            "psubw %%mm3, %%mm4          \n\t"
3838            "pxor %%mm7, %%mm7           \n\t"
3839            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3840            "movq %%mm4, %%mm6           \n\t"
3841            "psubw %%mm3, %%mm5          \n\t"
3842            /* pa = abs(p-a) = abs(pav) */
3843            /* pb = abs(p-b) = abs(pbv) */
3844            /* pc = abs(p-c) = abs(pcv) */
3845            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3846            "paddw %%mm5, %%mm6          \n\t"
3847            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3848            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3849            "psubw %%mm0, %%mm4          \n\t"
3850            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3851            "psubw %%mm0, %%mm4          \n\t"
3852            "psubw %%mm7, %%mm5          \n\t"
3853            "pxor %%mm0, %%mm0           \n\t"
3854            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3855            "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3856            "psubw %%mm7, %%mm5          \n\t"
3857            "psubw %%mm0, %%mm6          \n\t"
3858            /*  test pa <= pb */
3859            "movq %%mm4, %%mm7           \n\t"
3860            "psubw %%mm0, %%mm6          \n\t"
3861            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3862            "movq %%mm7, %%mm0           \n\t"
3863            /* use mm7 mask to merge pa & pb */
3864            "pand %%mm7, %%mm5           \n\t"
3865            /* use mm0 mask copy to merge a & b */
3866            "pand %%mm0, %%mm2           \n\t"
3867            "pandn %%mm4, %%mm7          \n\t"
3868            "pandn %%mm1, %%mm0          \n\t"
3869            "paddw %%mm5, %%mm7          \n\t"
3870            "paddw %%mm2, %%mm0          \n\t"
3871            /*  test  ((pa <= pb)? pa:pb) <= pc */
3872            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3873            "pxor %%mm1, %%mm1           \n\t"
3874            "pand %%mm7, %%mm3           \n\t"
3875            "pandn %%mm0, %%mm7          \n\t"
3876            "paddw %%mm3, %%mm7          \n\t"
3877            "pxor %%mm0, %%mm0           \n\t"
3878            "packuswb %%mm1, %%mm7       \n\t"
3879            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* load c=Prior(x-bpp) */
3880            "pand _ActiveMask, %%mm7     \n\t"
3881            "psrlq _ShiftRem, %%mm3      \n\t"
3882            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) step 1 */
3883            "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor and Raw(x) */
3884            "movq %%mm2, %%mm6           \n\t"
3885            "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
3886            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3887            "psllq _ShiftBpp, %%mm6      \n\t"
3888            "movq %%mm7, %%mm5           \n\t"
3889            "psrlq _ShiftRem, %%mm1      \n\t"
3890            "por %%mm6, %%mm3            \n\t"
3891            "psllq _ShiftBpp, %%mm5      \n\t"
3892            "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3893            "por %%mm5, %%mm1            \n\t"
3894            /* do second set of 4 bytes */
3895            "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
3896            "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3897            /* pav = p - a = (a + b - c) - a = b - c */
3898            "movq %%mm2, %%mm4           \n\t"
3899            /* pbv = p - b = (a + b - c) - b = a - c */
3900            "movq %%mm1, %%mm5           \n\t"
3901            "psubw %%mm3, %%mm4          \n\t"
3902            "pxor %%mm7, %%mm7           \n\t"
3903            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3904            "movq %%mm4, %%mm6           \n\t"
3905            "psubw %%mm3, %%mm5          \n\t"
3906            /* pa = abs(p-a) = abs(pav) */
3907            /* pb = abs(p-b) = abs(pbv) */
3908            /* pc = abs(p-c) = abs(pcv) */
3909            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3910            "paddw %%mm5, %%mm6          \n\t"
3911            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3912            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
3913            "psubw %%mm0, %%mm4          \n\t"
3914            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm0 */
3915            "psubw %%mm0, %%mm4          \n\t"
3916            "psubw %%mm7, %%mm5          \n\t"
3917            "pxor %%mm0, %%mm0           \n\t"
3918            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
3919            "pand %%mm6, %%mm0           \n\t" /* only pav bytes < 0 in mm7 */
3920            "psubw %%mm7, %%mm5          \n\t"
3921            "psubw %%mm0, %%mm6          \n\t"
3922            /*  test pa <= pb */
3923            "movq %%mm4, %%mm7           \n\t"
3924            "psubw %%mm0, %%mm6          \n\t"
3925            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
3926            "movq %%mm7, %%mm0           \n\t"
3927            /* use mm7 mask to merge pa & pb */
3928            "pand %%mm7, %%mm5           \n\t"
3929            /* use mm0 mask copy to merge a & b */
3930            "pand %%mm0, %%mm2           \n\t"
3931            "pandn %%mm4, %%mm7          \n\t"
3932            "pandn %%mm1, %%mm0          \n\t"
3933            "paddw %%mm5, %%mm7          \n\t"
3934            "paddw %%mm2, %%mm0          \n\t"
3935            /*  test  ((pa <= pb)? pa:pb) <= pc */
3936            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
3937            "pxor %%mm1, %%mm1           \n\t"
3938            "pand %%mm7, %%mm3           \n\t"
3939            "pandn %%mm0, %%mm7          \n\t"
3940            "pxor %%mm1, %%mm1           \n\t"
3941            "paddw %%mm3, %%mm7          \n\t"
3942            "pxor %%mm0, %%mm0           \n\t"
3943            /* step ecx to next set of 8 bytes and repeat loop til done */
3944            "addl $8, %%ecx              \n\t"
3945            "packuswb %%mm7, %%mm1       \n\t"
3946            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
3947            "cmpl _MMXLength, %%ecx      \n\t"
3948            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
3949                                /* mm1 will be used as Raw(x-bpp) next loop */
3950            "jb paeth_6lp                \n\t"
3951
3952            : "=S" (dummy_value_S),             /* output regs (dummy) */
3953              "=D" (dummy_value_D)
3954
3955            : "0" (prev_row),  /* esi           // input regs */
3956              "1" (row)        /* edi */
3957
3958            : "%ecx"                            /* clobber list */
3959#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3960            , "%mm0", "%mm1", "%mm2", "%mm3"
3961            , "%mm4", "%mm5", "%mm6", "%mm7"
3962#endif
3963         );
3964      }
3965      break;  /* end 6 bpp */
3966
3967      case 4:
3968      {
3969         _ActiveMask.use  = 0x00000000ffffffffLL;
3970
3971         __asm__ __volatile__ (
3972            "movl _dif, %%ecx            \n\t"
3973/* preload  "movl row, %%edi             \n\t" */
3974/* preload  "movl prev_row, %%esi        \n\t" */
3975            "pxor %%mm0, %%mm0           \n\t"
3976            /* prime the pump:  load the first Raw(x-bpp) data set */
3977            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
3978                                     /*  a=Raw(x-bpp) bytes */
3979         "paeth_4lp:                     \n\t"
3980            /* do first set of 4 bytes */
3981            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
3982            "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
3983            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
3984            "punpcklbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
3985            /* pav = p - a = (a + b - c) - a = b - c */
3986            "movq %%mm2, %%mm4           \n\t"
3987            "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
3988            /* pbv = p - b = (a + b - c) - b = a - c */
3989            "movq %%mm1, %%mm5           \n\t"
3990            "psubw %%mm3, %%mm4          \n\t"
3991            "pxor %%mm7, %%mm7           \n\t"
3992            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3993            "movq %%mm4, %%mm6           \n\t"
3994            "psubw %%mm3, %%mm5          \n\t"
3995            /* pa = abs(p-a) = abs(pav) */
3996            /* pb = abs(p-b) = abs(pbv) */
3997            /* pc = abs(p-c) = abs(pcv) */
3998            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
3999            "paddw %%mm5, %%mm6          \n\t"
4000            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm0 */
4001            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4002            "psubw %%mm0, %%mm4          \n\t"
4003            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm7 */
4004            "psubw %%mm0, %%mm4          \n\t"
4005            "psubw %%mm7, %%mm5          \n\t"
4006            "pxor %%mm0, %%mm0           \n\t"
4007            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4008            "pand %%mm6, %%mm0           \n\t" /* only pcv bytes < 0 in mm0 */
4009            "psubw %%mm7, %%mm5          \n\t"
4010            "psubw %%mm0, %%mm6          \n\t"
4011            /*  test pa <= pb */
4012            "movq %%mm4, %%mm7           \n\t"
4013            "psubw %%mm0, %%mm6          \n\t"
4014            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4015            "movq %%mm7, %%mm0           \n\t"
4016            /* use mm7 mask to merge pa & pb */
4017            "pand %%mm7, %%mm5           \n\t"
4018            /* use mm0 mask copy to merge a & b */
4019            "pand %%mm0, %%mm2           \n\t"
4020            "pandn %%mm4, %%mm7          \n\t"
4021            "pandn %%mm1, %%mm0          \n\t"
4022            "paddw %%mm5, %%mm7          \n\t"
4023            "paddw %%mm2, %%mm0          \n\t"
4024            /*  test  ((pa <= pb)? pa:pb) <= pc */
4025            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4026            "pxor %%mm1, %%mm1           \n\t"
4027            "pand %%mm7, %%mm3           \n\t"
4028            "pandn %%mm0, %%mm7          \n\t"
4029            "paddw %%mm3, %%mm7          \n\t"
4030            "pxor %%mm0, %%mm0           \n\t"
4031            "packuswb %%mm1, %%mm7       \n\t"
4032            "movq (%%esi,%%ecx,), %%mm3  \n\t" /* load c=Prior(x-bpp) */
4033            "pand _ActiveMask, %%mm7     \n\t"
4034            "movq %%mm3, %%mm2           \n\t" /* load b=Prior(x) step 1 */
4035            "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4036            "punpcklbw %%mm0, %%mm3      \n\t" /* unpack Low bytes of c */
4037            "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
4038            "movq %%mm7, %%mm1           \n\t" /* now mm1 will be used as Raw(x-bpp) */
4039            /* do second set of 4 bytes */
4040            "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
4041            "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
4042            /* pav = p - a = (a + b - c) - a = b - c */
4043            "movq %%mm2, %%mm4           \n\t"
4044            /* pbv = p - b = (a + b - c) - b = a - c */
4045            "movq %%mm1, %%mm5           \n\t"
4046            "psubw %%mm3, %%mm4          \n\t"
4047            "pxor %%mm7, %%mm7           \n\t"
4048            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4049            "movq %%mm4, %%mm6           \n\t"
4050            "psubw %%mm3, %%mm5          \n\t"
4051            /* pa = abs(p-a) = abs(pav) */
4052            /* pb = abs(p-b) = abs(pbv) */
4053            /* pc = abs(p-c) = abs(pcv) */
4054            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4055            "paddw %%mm5, %%mm6          \n\t"
4056            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm0 */
4057            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4058            "psubw %%mm0, %%mm4          \n\t"
4059            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm7 */
4060            "psubw %%mm0, %%mm4          \n\t"
4061            "psubw %%mm7, %%mm5          \n\t"
4062            "pxor %%mm0, %%mm0           \n\t"
4063            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4064            "pand %%mm6, %%mm0           \n\t" /* only pcv bytes < 0 in mm0 */
4065            "psubw %%mm7, %%mm5          \n\t"
4066            "psubw %%mm0, %%mm6          \n\t"
4067            /*  test pa <= pb */
4068            "movq %%mm4, %%mm7           \n\t"
4069            "psubw %%mm0, %%mm6          \n\t"
4070            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4071            "movq %%mm7, %%mm0           \n\t"
4072            /* use mm7 mask to merge pa & pb */
4073            "pand %%mm7, %%mm5           \n\t"
4074            /* use mm0 mask copy to merge a & b */
4075            "pand %%mm0, %%mm2           \n\t"
4076            "pandn %%mm4, %%mm7          \n\t"
4077            "pandn %%mm1, %%mm0          \n\t"
4078            "paddw %%mm5, %%mm7          \n\t"
4079            "paddw %%mm2, %%mm0          \n\t"
4080            /*  test  ((pa <= pb)? pa:pb) <= pc */
4081            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4082            "pxor %%mm1, %%mm1           \n\t"
4083            "pand %%mm7, %%mm3           \n\t"
4084            "pandn %%mm0, %%mm7          \n\t"
4085            "pxor %%mm1, %%mm1           \n\t"
4086            "paddw %%mm3, %%mm7          \n\t"
4087            "pxor %%mm0, %%mm0           \n\t"
4088            /* step ecx to next set of 8 bytes and repeat loop til done */
4089            "addl $8, %%ecx              \n\t"
4090            "packuswb %%mm7, %%mm1       \n\t"
4091            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add predictor with Raw(x) */
4092            "cmpl _MMXLength, %%ecx      \n\t"
4093            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4094                                /* mm1 will be used as Raw(x-bpp) next loop */
4095            "jb paeth_4lp                \n\t"
4096
4097            : "=S" (dummy_value_S),             /* output regs (dummy) */
4098              "=D" (dummy_value_D)
4099
4100            : "0" (prev_row),  /* esi           // input regs */
4101              "1" (row)        /* edi */
4102
4103            : "%ecx"                            /* clobber list */
4104#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4105            , "%mm0", "%mm1", "%mm2", "%mm3"
4106            , "%mm4", "%mm5", "%mm6", "%mm7"
4107#endif
4108         );
4109      }
4110      break;  /* end 4 bpp */
4111
4112      case 8:                          /* bpp == 8 */
4113      {
4114         _ActiveMask.use  = 0x00000000ffffffffLL;
4115
4116         __asm__ __volatile__ (
4117            "movl _dif, %%ecx            \n\t"
4118/* preload  "movl row, %%edi             \n\t" */
4119/* preload  "movl prev_row, %%esi        \n\t" */
4120            "pxor %%mm0, %%mm0           \n\t"
4121            /* prime the pump:  load the first Raw(x-bpp) data set */
4122            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* only time should need to read */
4123                                       /*  a=Raw(x-bpp) bytes */
4124         "paeth_8lp:                     \n\t"
4125            /* do first set of 4 bytes */
4126            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4127            "punpcklbw %%mm0, %%mm1      \n\t" /* unpack Low bytes of a */
4128            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
4129            "punpcklbw %%mm0, %%mm2      \n\t" /* unpack Low bytes of b */
4130            /* pav = p - a = (a + b - c) - a = b - c */
4131            "movq %%mm2, %%mm4           \n\t"
4132            "punpcklbw %%mm0, %%mm3      \n\t" /* unpack Low bytes of c */
4133            /* pbv = p - b = (a + b - c) - b = a - c */
4134            "movq %%mm1, %%mm5           \n\t"
4135            "psubw %%mm3, %%mm4          \n\t"
4136            "pxor %%mm7, %%mm7           \n\t"
4137            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4138            "movq %%mm4, %%mm6           \n\t"
4139            "psubw %%mm3, %%mm5          \n\t"
4140            /* pa = abs(p-a) = abs(pav) */
4141            /* pb = abs(p-b) = abs(pbv) */
4142            /* pc = abs(p-c) = abs(pcv) */
4143            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4144            "paddw %%mm5, %%mm6          \n\t"
4145            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm0 */
4146            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4147            "psubw %%mm0, %%mm4          \n\t"
4148            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm7 */
4149            "psubw %%mm0, %%mm4          \n\t"
4150            "psubw %%mm7, %%mm5          \n\t"
4151            "pxor %%mm0, %%mm0           \n\t"
4152            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4153            "pand %%mm6, %%mm0           \n\t" /* only pcv bytes < 0 in mm0 */
4154            "psubw %%mm7, %%mm5          \n\t"
4155            "psubw %%mm0, %%mm6          \n\t"
4156            /*  test pa <= pb */
4157            "movq %%mm4, %%mm7           \n\t"
4158            "psubw %%mm0, %%mm6          \n\t"
4159            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4160            "movq %%mm7, %%mm0           \n\t"
4161            /* use mm7 mask to merge pa & pb */
4162            "pand %%mm7, %%mm5           \n\t"
4163            /* use mm0 mask copy to merge a & b */
4164            "pand %%mm0, %%mm2           \n\t"
4165            "pandn %%mm4, %%mm7          \n\t"
4166            "pandn %%mm1, %%mm0          \n\t"
4167            "paddw %%mm5, %%mm7          \n\t"
4168            "paddw %%mm2, %%mm0          \n\t"
4169            /*  test  ((pa <= pb)? pa:pb) <= pc */
4170            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4171            "pxor %%mm1, %%mm1           \n\t"
4172            "pand %%mm7, %%mm3           \n\t"
4173            "pandn %%mm0, %%mm7          \n\t"
4174            "paddw %%mm3, %%mm7          \n\t"
4175            "pxor %%mm0, %%mm0           \n\t"
4176            "packuswb %%mm1, %%mm7       \n\t"
4177            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" /* read c=Prior(x-bpp) bytes */
4178            "pand _ActiveMask, %%mm7     \n\t"
4179            "movq (%%esi,%%ecx,), %%mm2  \n\t" /* load b=Prior(x) */
4180            "paddb (%%edi,%%ecx,), %%mm7 \n\t" /* add Paeth predictor with Raw(x) */
4181            "punpckhbw %%mm0, %%mm3      \n\t" /* unpack High bytes of c */
4182            "movq %%mm7, (%%edi,%%ecx,)  \n\t" /* write back updated value */
4183            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" /* read a=Raw(x-bpp) bytes */
4184
4185            /* do second set of 4 bytes */
4186            "punpckhbw %%mm0, %%mm2      \n\t" /* unpack High bytes of b */
4187            "punpckhbw %%mm0, %%mm1      \n\t" /* unpack High bytes of a */
4188            /* pav = p - a = (a + b - c) - a = b - c */
4189            "movq %%mm2, %%mm4           \n\t"
4190            /* pbv = p - b = (a + b - c) - b = a - c */
4191            "movq %%mm1, %%mm5           \n\t"
4192            "psubw %%mm3, %%mm4          \n\t"
4193            "pxor %%mm7, %%mm7           \n\t"
4194            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4195            "movq %%mm4, %%mm6           \n\t"
4196            "psubw %%mm3, %%mm5          \n\t"
4197            /* pa = abs(p-a) = abs(pav) */
4198            /* pb = abs(p-b) = abs(pbv) */
4199            /* pc = abs(p-c) = abs(pcv) */
4200            "pcmpgtw %%mm4, %%mm0        \n\t" /* create mask pav bytes < 0 */
4201            "paddw %%mm5, %%mm6          \n\t"
4202            "pand %%mm4, %%mm0           \n\t" /* only pav bytes < 0 in mm0 */
4203            "pcmpgtw %%mm5, %%mm7        \n\t" /* create mask pbv bytes < 0 */
4204            "psubw %%mm0, %%mm4          \n\t"
4205            "pand %%mm5, %%mm7           \n\t" /* only pbv bytes < 0 in mm7 */
4206            "psubw %%mm0, %%mm4          \n\t"
4207            "psubw %%mm7, %%mm5          \n\t"
4208            "pxor %%mm0, %%mm0           \n\t"
4209            "pcmpgtw %%mm6, %%mm0        \n\t" /* create mask pcv bytes < 0 */
4210            "pand %%mm6, %%mm0           \n\t" /* only pcv bytes < 0 in mm0 */
4211            "psubw %%mm7, %%mm5          \n\t"
4212            "psubw %%mm0, %%mm6          \n\t"
4213            /*  test pa <= pb */
4214            "movq %%mm4, %%mm7           \n\t"
4215            "psubw %%mm0, %%mm6          \n\t"
4216            "pcmpgtw %%mm5, %%mm7        \n\t" /* pa > pb? */
4217            "movq %%mm7, %%mm0           \n\t"
4218            /* use mm7 mask to merge pa & pb */
4219            "pand %%mm7, %%mm5           \n\t"
4220            /* use mm0 mask copy to merge a & b */
4221            "pand %%mm0, %%mm2           \n\t"
4222            "pandn %%mm4, %%mm7          \n\t"
4223            "pandn %%mm1, %%mm0          \n\t"
4224            "paddw %%mm5, %%mm7          \n\t"
4225            "paddw %%mm2, %%mm0          \n\t"
4226            /*  test  ((pa <= pb)? pa:pb) <= pc */
4227            "pcmpgtw %%mm6, %%mm7        \n\t" /* pab > pc? */
4228            "pxor %%mm1, %%mm1           \n\t"
4229            "pand %%mm7, %%mm3           \n\t"
4230            "pandn %%mm0, %%mm7          \n\t"
4231            "pxor %%mm1, %%mm1           \n\t"
4232            "paddw %%mm3, %%mm7          \n\t"
4233            "pxor %%mm0, %%mm0           \n\t"
4234            /* step ecx to next set of 8 bytes and repeat loop til done */
4235            "addl $8, %%ecx              \n\t"
4236            "packuswb %%mm7, %%mm1       \n\t"
4237            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" /* add Paeth predictor with Raw(x) */
4238            "cmpl _MMXLength, %%ecx      \n\t"
4239            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" /* write back updated value */
4240                            /* mm1 will be used as Raw(x-bpp) next loop */
4241            "jb paeth_8lp                \n\t"
4242
4243            : "=S" (dummy_value_S),             /* output regs (dummy) */
4244              "=D" (dummy_value_D)
4245
4246            : "0" (prev_row),  /* esi           // input regs */
4247              "1" (row)        /* edi */
4248
4249            : "%ecx"                            /* clobber list */
4250#if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4251            , "%mm0", "%mm1", "%mm2", "%mm3"
4252            , "%mm4", "%mm5", "%mm6", "%mm7"
4253#endif
4254         );
4255      }
4256      break;  /* end 8 bpp */
4257
4258      case 1:                /* bpp = 1 */
4259      case 2:                /* bpp = 2 */
4260      default:               /* bpp > 8 */
4261      {
4262         __asm__ __volatile__ (
4263#ifdef __PIC__
4264            "pushl %%ebx                 \n\t" /* save Global Offset Table index */
4265#endif
4266            "movl _dif, %%ebx            \n\t"
4267            "cmpl _FullLength, %%ebx     \n\t"
4268            "jnb paeth_dend              \n\t"
4269
4270/* preload  "movl row, %%edi             \n\t" */
4271/* preload  "movl prev_row, %%esi        \n\t" */
4272            /* do Paeth decode for remaining bytes */
4273            "movl %%ebx, %%edx           \n\t"
4274/* preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx) */
4275            "subl %%ecx, %%edx           \n\t" /* edx = ebx - bpp */
4276            "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx */
4277
4278         "paeth_dlp:                     \n\t"
4279            "xorl %%eax, %%eax           \n\t"
4280            /* pav = p - a = (a + b - c) - a = b - c */
4281            "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
4282            "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4283            "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4284            "movl %%eax, _patemp         \n\t" /* Save pav for later use */
4285            "xorl %%eax, %%eax           \n\t"
4286            /* pbv = p - b = (a + b - c) - b = a - c */
4287            "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
4288            "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4289            "movl %%eax, %%ecx           \n\t"
4290            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4291            "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
4292            /* pc = abs(pcv) */
4293            "testl $0x80000000, %%eax    \n\t"
4294            "jz paeth_dpca               \n\t"
4295            "negl %%eax                  \n\t" /* reverse sign of neg values */
4296
4297         "paeth_dpca:                    \n\t"
4298            "movl %%eax, _pctemp         \n\t" /* save pc for later use */
4299            /* pb = abs(pbv) */
4300            "testl $0x80000000, %%ecx    \n\t"
4301            "jz paeth_dpba               \n\t"
4302            "negl %%ecx                  \n\t" /* reverse sign of neg values */
4303
4304         "paeth_dpba:                    \n\t"
4305            "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
4306            /* pa = abs(pav) */
4307            "movl _patemp, %%eax         \n\t"
4308            "testl $0x80000000, %%eax    \n\t"
4309            "jz paeth_dpaa               \n\t"
4310            "negl %%eax                  \n\t" /* reverse sign of neg values */
4311
4312         "paeth_dpaa:                    \n\t"
4313            "movl %%eax, _patemp         \n\t" /* save pa for later use */
4314            /* test if pa <= pb */
4315            "cmpl %%ecx, %%eax           \n\t"
4316            "jna paeth_dabb              \n\t"
4317            /* pa > pb; now test if pb <= pc */
4318            "cmpl _pctemp, %%ecx         \n\t"
4319            "jna paeth_dbbc              \n\t"
4320            /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4321            "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4322            "jmp paeth_dpaeth            \n\t"
4323
4324         "paeth_dbbc:                    \n\t"
4325            /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4326            "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
4327            "jmp paeth_dpaeth            \n\t"
4328
4329         "paeth_dabb:                    \n\t"
4330            /* pa <= pb; now test if pa <= pc */
4331            "cmpl _pctemp, %%eax         \n\t"
4332            "jna paeth_dabc              \n\t"
4333            /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4334            "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4335            "jmp paeth_dpaeth            \n\t"
4336
4337         "paeth_dabc:                    \n\t"
4338            /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4339            "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
4340
4341         "paeth_dpaeth:                  \n\t"
4342            "incl %%ebx                  \n\t"
4343            "incl %%edx                  \n\t"
4344            /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4345            "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4346            "cmpl _FullLength, %%ebx     \n\t"
4347            "jb paeth_dlp                \n\t"
4348
4349         "paeth_dend:                    \n\t"
4350#ifdef __PIC__
4351            "popl %%ebx                  \n\t" /* index to Global Offset Table */
4352#endif
4353
4354            : "=c" (dummy_value_c),            /* output regs (dummy) */
4355              "=S" (dummy_value_S),
4356              "=D" (dummy_value_D)
4357
4358            : "0" (bpp),       /* ecx          // input regs */
4359              "1" (prev_row),  /* esi */
4360              "2" (row)        /* edi */
4361
4362            : "%eax", "%edx"                   /* clobber list */
4363#ifndef __PIC__
4364            , "%ebx"
4365#endif
4366         );
4367      }
4368      return;                   /* No need to go further with this one */
4369
4370   } /* end switch (bpp) */
4371
4372   __asm__ __volatile__ (
4373      /* MMX acceleration complete; now do clean-up */
4374      /* check if any remaining bytes left to decode */
4375#ifdef __PIC__
4376      "pushl %%ebx                 \n\t" /* save index to Global Offset Table */
4377#endif
4378      "movl _MMXLength, %%ebx      \n\t"
4379      "cmpl _FullLength, %%ebx     \n\t"
4380      "jnb paeth_end               \n\t"
4381/*pre "movl row, %%edi             \n\t" */
4382/*pre "movl prev_row, %%esi        \n\t" */
4383      /* do Paeth decode for remaining bytes */
4384      "movl %%ebx, %%edx           \n\t"
4385/*pre "subl bpp, %%edx             \n\t" */ /* (bpp is preloaded into ecx) */
4386      "subl %%ecx, %%edx           \n\t" /* edx = ebx - bpp */
4387      "xorl %%ecx, %%ecx           \n\t" /* zero ecx before using cl & cx below */
4388
4389   "paeth_lp2:                     \n\t"
4390      "xorl %%eax, %%eax           \n\t"
4391      /* pav = p - a = (a + b - c) - a = b - c */
4392      "movb (%%esi,%%ebx,), %%al   \n\t" /* load Prior(x) into al */
4393      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4394      "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4395      "movl %%eax, _patemp         \n\t" /* Save pav for later use */
4396      "xorl %%eax, %%eax           \n\t"
4397      /* pbv = p - b = (a + b - c) - b = a - c */
4398      "movb (%%edi,%%edx,), %%al   \n\t" /* load Raw(x-bpp) into al */
4399      "subl %%ecx, %%eax           \n\t" /* subtract Prior(x-bpp) */
4400      "movl %%eax, %%ecx           \n\t"
4401      /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
4402      "addl _patemp, %%eax         \n\t" /* pcv = pav + pbv */
4403      /* pc = abs(pcv) */
4404      "testl $0x80000000, %%eax    \n\t"
4405      "jz paeth_pca2               \n\t"
4406      "negl %%eax                  \n\t" /* reverse sign of neg values */
4407
4408   "paeth_pca2:                    \n\t"
4409      "movl %%eax, _pctemp         \n\t" /* save pc for later use */
4410      /* pb = abs(pbv) */
4411      "testl $0x80000000, %%ecx    \n\t"
4412      "jz paeth_pba2               \n\t"
4413      "negl %%ecx                  \n\t" /* reverse sign of neg values */
4414
4415   "paeth_pba2:                    \n\t"
4416      "movl %%ecx, _pbtemp         \n\t" /* save pb for later use */
4417      /* pa = abs(pav) */
4418      "movl _patemp, %%eax         \n\t"
4419      "testl $0x80000000, %%eax    \n\t"
4420      "jz paeth_paa2               \n\t"
4421      "negl %%eax                  \n\t" /* reverse sign of neg values */
4422
4423   "paeth_paa2:                    \n\t"
4424      "movl %%eax, _patemp         \n\t" /* save pa for later use */
4425      /* test if pa <= pb */
4426      "cmpl %%ecx, %%eax           \n\t"
4427      "jna paeth_abb2              \n\t"
4428      /* pa > pb; now test if pb <= pc */
4429      "cmpl _pctemp, %%ecx         \n\t"
4430      "jna paeth_bbc2              \n\t"
4431      /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4432      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4433      "jmp paeth_paeth2            \n\t"
4434
4435   "paeth_bbc2:                    \n\t"
4436      /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
4437      "movb (%%esi,%%ebx,), %%cl   \n\t" /* load Prior(x) into cl */
4438      "jmp paeth_paeth2            \n\t"
4439
4440   "paeth_abb2:                    \n\t"
4441      /* pa <= pb; now test if pa <= pc */
4442      "cmpl _pctemp, %%eax         \n\t"
4443      "jna paeth_abc2              \n\t"
4444      /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
4445      "movb (%%esi,%%edx,), %%cl   \n\t" /* load Prior(x-bpp) into cl */
4446      "jmp paeth_paeth2            \n\t"
4447
4448   "paeth_abc2:                    \n\t"
4449      /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
4450      "movb (%%edi,%%edx,), %%cl   \n\t" /* load Raw(x-bpp) into cl */
4451
4452   "paeth_paeth2:                  \n\t"
4453      "incl %%ebx                  \n\t"
4454      "incl %%edx                  \n\t"
4455      /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
4456      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4457      "cmpl _FullLength, %%ebx     \n\t"
4458      "jb paeth_lp2                \n\t"
4459
4460   "paeth_end:                     \n\t"
4461      "EMMS                        \n\t" /* end MMX; prep for poss. FP instrs. */
4462#ifdef __PIC__
4463      "popl %%ebx                  \n\t" /* restore index to Global Offset Table */
4464#endif
4465
4466      : "=c" (dummy_value_c),            /* output regs (dummy) */
4467        "=S" (dummy_value_S),
4468        "=D" (dummy_value_D)
4469
4470      : "0" (bpp),       /* ecx          // input regs */
4471        "1" (prev_row),  /* esi */
4472        "2" (row)        /* edi */
4473
4474      : "%eax", "%edx"                   /* clobber list (no input regs!) */
4475#ifndef __PIC__
4476      , "%ebx"
4477#endif
4478   );
4479
4480} /* end png_read_filter_row_mmx_paeth() */
4481#endif
4482
4483
4484
4485
4486#ifdef PNG_THREAD_UNSAFE_OK
4487/*===========================================================================*/
4488/*                                                                           */
4489/*           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           */
4490/*                                                                           */
4491/*===========================================================================*/
4492
4493/* Optimized code for PNG Sub filter decoder */
4494
4495static void /* PRIVATE */
4496png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4497{
4498   int bpp;
4499   int dummy_value_a;
4500   int dummy_value_D;
4501
4502   bpp = (row_info->pixel_depth + 7) >> 3;   /* calc number of bytes per pixel */
4503   _FullLength = row_info->rowbytes - bpp;   /* number of bytes to filter */
4504
4505   __asm__ __volatile__ (
4506/*pre "movl row, %%edi             \n\t" */
4507      "movl %%edi, %%esi           \n\t" /* lp = row */
4508/*pre "movl bpp, %%eax             \n\t" */
4509      "addl %%eax, %%edi           \n\t" /* rp = row + bpp */
4510/*irr "xorl %%eax, %%eax           \n\t" */
4511      /* get # of bytes to alignment */
4512      "movl %%edi, _dif            \n\t" /* take start of row */
4513      "addl $0xf, _dif             \n\t" /* add 7 + 8 to incr past */
4514                                         /*  alignment boundary */
4515      "xorl %%ecx, %%ecx           \n\t"
4516      "andl $0xfffffff8, _dif      \n\t" /* mask to alignment boundary */
4517      "subl %%edi, _dif            \n\t" /* subtract from start ==> value */
4518      "jz sub_go                   \n\t" /*  ecx at alignment */
4519
4520   "sub_lp1:                       \n\t" /* fix alignment */
4521      "movb (%%esi,%%ecx,), %%al   \n\t"
4522      "addb %%al, (%%edi,%%ecx,)   \n\t"
4523      "incl %%ecx                  \n\t"
4524      "cmpl _dif, %%ecx            \n\t"
4525      "jb sub_lp1                  \n\t"
4526
4527   "sub_go:                        \n\t"
4528      "movl _FullLength, %%eax     \n\t"
4529      "movl %%eax, %%edx           \n\t"
4530      "subl %%ecx, %%edx           \n\t" /* subtract alignment fix */
4531      "andl $0x00000007, %%edx     \n\t" /* calc bytes over mult of 8 */
4532      "subl %%edx, %%eax           \n\t" /* drop over bytes from length */
4533      "movl %%eax, _MMXLength      \n\t"
4534
4535      : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4536        "=D" (dummy_value_D)    /* 1 */
4537
4538      : "0" (bpp),              /* eax    // input regs */
4539        "1" (row)               /* edi */
4540
4541      : "%esi", "%ecx", "%edx"            // clobber list
4542
4543#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4544      , "%mm0", "%mm1", "%mm2", "%mm3"
4545      , "%mm4", "%mm5", "%mm6", "%mm7"
4546#endif
4547   );
4548
4549   /* now do the math for the rest of the row */
4550   switch (bpp)
4551   {
4552      case 3:
4553      {
4554         _ActiveMask.use  = 0x0000ffffff000000LL;
4555         _ShiftBpp.use = 24;       /* == 3 * 8 */
4556         _ShiftRem.use  = 40;      /* == 64 - 24 */
4557
4558         __asm__ __volatile__ (
4559/* preload  "movl row, %%edi              \n\t" */
4560            "movq _ActiveMask, %%mm7       \n\t" /* load _ActiveMask for 2nd */
4561                                                /*  active byte group */
4562            "movl %%edi, %%esi            \n\t" /* lp = row */
4563/* preload  "movl bpp, %%eax              \n\t" */
4564            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4565            "movq %%mm7, %%mm6            \n\t"
4566            "movl _dif, %%edx             \n\t"
4567            "psllq _ShiftBpp, %%mm6       \n\t" /* move mask in mm6 to cover */
4568                                                /*  3rd active byte group */
4569            /* prime the pump:  load the first Raw(x-bpp) data set */
4570            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4571
4572         "sub_3lp:                        \n\t" /* shift data for adding first */
4573            "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4574                                                /*  shift clears inactive bytes) */
4575            /* add 1st active group */
4576            "movq (%%edi,%%edx,), %%mm0   \n\t"
4577            "paddb %%mm1, %%mm0           \n\t"
4578
4579            /* add 2nd active group */
4580            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4581            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4582            "pand %%mm7, %%mm1            \n\t" /* mask to use 2nd active group */
4583            "paddb %%mm1, %%mm0           \n\t"
4584
4585            /* add 3rd active group */
4586            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4587            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4588            "pand %%mm6, %%mm1            \n\t" /* mask to use 3rd active group */
4589            "addl $8, %%edx               \n\t"
4590            "paddb %%mm1, %%mm0           \n\t"
4591
4592            "cmpl _MMXLength, %%edx       \n\t"
4593            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4594            "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4595            "jb sub_3lp                   \n\t"
4596
4597            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4598              "=D" (dummy_value_D)    /* 1 */
4599
4600            : "0" (bpp),              /* eax    // input regs */
4601              "1" (row)               /* edi */
4602
4603            : "%edx", "%esi"                    /* clobber list */
4604#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4605            , "%mm0", "%mm1", "%mm6", "%mm7"
4606#endif
4607         );
4608      }
4609      break;
4610
4611      case 1:
4612      {
4613         __asm__ __volatile__ (
4614            "movl _dif, %%edx            \n\t"
4615/* preload  "movl row, %%edi             \n\t" */
4616            "cmpl _FullLength, %%edx     \n\t"
4617            "jnb sub_1end                \n\t"
4618            "movl %%edi, %%esi           \n\t" /* lp = row */
4619            "xorl %%eax, %%eax           \n\t"
4620/* preload  "movl bpp, %%eax             \n\t" */
4621            "addl %%eax, %%edi           \n\t" /* rp = row + bpp */
4622
4623         "sub_1lp:                       \n\t"
4624            "movb (%%esi,%%edx,), %%al   \n\t"
4625            "addb %%al, (%%edi,%%edx,)   \n\t"
4626            "incl %%edx                  \n\t"
4627            "cmpl _FullLength, %%edx     \n\t"
4628            "jb sub_1lp                  \n\t"
4629
4630         "sub_1end:                      \n\t"
4631
4632            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4633              "=D" (dummy_value_D)    /* 1 */
4634
4635            : "0" (bpp),              /* eax    // input regs */
4636              "1" (row)               /* edi */
4637
4638            : "%edx", "%esi"                    /* clobber list */
4639         );
4640      }
4641      return;
4642
4643      case 6:
4644      case 4:
4645      //case 7:   /* GRR BOGUS */
4646      //case 5:   /* GRR BOGUS */
4647      {
4648         _ShiftBpp.use = bpp << 3;
4649         _ShiftRem.use = 64 - _ShiftBpp.use;
4650
4651         __asm__ __volatile__ (
4652/* preload  "movl row, %%edi              \n\t" */
4653            "movl _dif, %%edx             \n\t"
4654            "movl %%edi, %%esi            \n\t" /* lp = row */
4655/* preload  "movl bpp, %%eax              \n\t" */
4656            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4657
4658            /* prime the pump:  load the first Raw(x-bpp) data set */
4659            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4660
4661         "sub_4lp:                        \n\t" /* shift data for adding first */
4662            "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4663                                                /*  shift clears inactive bytes) */
4664            "movq (%%edi,%%edx,), %%mm0   \n\t"
4665            "paddb %%mm1, %%mm0           \n\t"
4666
4667            /* add 2nd active group */
4668            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4669            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4670            "addl $8, %%edx               \n\t"
4671            "paddb %%mm1, %%mm0           \n\t"
4672
4673            "cmpl _MMXLength, %%edx       \n\t"
4674            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4675            "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4676            "jb sub_4lp                   \n\t"
4677
4678            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4679              "=D" (dummy_value_D)    /* 1 */
4680
4681            : "0" (bpp),              /* eax    // input regs */
4682              "1" (row)               /* edi */
4683
4684            : "%edx", "%esi"                    /* clobber list */
4685#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4686            , "%mm0", "%mm1"
4687#endif
4688         );
4689      }
4690      break;
4691
4692      case 2:
4693      {
4694         _ActiveMask.use = 0x00000000ffff0000LL;
4695         _ShiftBpp.use = 16;       /* == 2 * 8 */
4696         _ShiftRem.use = 48;       /* == 64 - 16 */
4697
4698         __asm__ __volatile__ (
4699            "movq _ActiveMask, %%mm7      \n\t" /* load _ActiveMask for 2nd */
4700                                                /*  active byte group */
4701            "movl _dif, %%edx             \n\t"
4702            "movq %%mm7, %%mm6            \n\t"
4703/* preload  "movl row, %%edi              \n\t" */
4704            "psllq _ShiftBpp, %%mm6       \n\t" /* move mask in mm6 to cover */
4705                                                /*  3rd active byte group */
4706            "movl %%edi, %%esi            \n\t" /* lp = row */
4707            "movq %%mm6, %%mm5            \n\t"
4708/* preload  "movl bpp, %%eax              \n\t" */
4709            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4710            "psllq _ShiftBpp, %%mm5       \n\t" /* move mask in mm5 to cover */
4711                                                /*  4th active byte group */
4712            /* prime the pump:  load the first Raw(x-bpp) data set */
4713            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4714
4715         "sub_2lp:                        \n\t" /* shift data for adding first */
4716            "psrlq _ShiftRem, %%mm1       \n\t" /*  bpp bytes (no need for mask; */
4717                                                /*  shift clears inactive bytes) */
4718            /* add 1st active group */
4719            "movq (%%edi,%%edx,), %%mm0   \n\t"
4720            "paddb %%mm1, %%mm0           \n\t"
4721
4722            /* add 2nd active group */
4723            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4724            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4725            "pand %%mm7, %%mm1            \n\t" /* mask to use 2nd active group */
4726            "paddb %%mm1, %%mm0           \n\t"
4727
4728            /* add 3rd active group */
4729            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4730            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4731            "pand %%mm6, %%mm1            \n\t" /* mask to use 3rd active group */
4732            "paddb %%mm1, %%mm0           \n\t"
4733
4734            /* add 4th active group */
4735            "movq %%mm0, %%mm1            \n\t" /* mov updated Raws to mm1 */
4736            "psllq _ShiftBpp, %%mm1       \n\t" /* shift data to pos. correctly */
4737            "pand %%mm5, %%mm1            \n\t" /* mask to use 4th active group */
4738            "addl $8, %%edx               \n\t"
4739            "paddb %%mm1, %%mm0           \n\t"
4740            "cmpl _MMXLength, %%edx       \n\t"
4741            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* write updated Raws to array */
4742            "movq %%mm0, %%mm1            \n\t" /* prep 1st add at top of loop */
4743            "jb sub_2lp                   \n\t"
4744
4745            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4746              "=D" (dummy_value_D)    /* 1 */
4747
4748            : "0" (bpp),              /* eax    // input regs */
4749              "1" (row)               /* edi */
4750
4751            : "%edx", "%esi"                    /* clobber list */
4752#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4753            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4754#endif
4755         );
4756      }
4757      break;
4758
4759      case 8:
4760      {
4761         __asm__ __volatile__ (
4762/* preload  "movl row, %%edi              \n\t" */
4763            "movl _dif, %%edx             \n\t"
4764            "movl %%edi, %%esi            \n\t" /* lp = row */
4765/* preload  "movl bpp, %%eax              \n\t" */
4766            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4767            "movl _MMXLength, %%ecx       \n\t"
4768
4769            /* prime the pump:  load the first Raw(x-bpp) data set */
4770            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4771            "andl $0x0000003f, %%ecx      \n\t" /* calc bytes over mult of 64 */
4772
4773         "sub_8lp:                        \n\t"
4774            "movq (%%edi,%%edx,), %%mm0   \n\t" /* load Sub(x) for 1st 8 bytes */
4775            "paddb %%mm7, %%mm0           \n\t"
4776            "movq 8(%%edi,%%edx,), %%mm1  \n\t" /* load Sub(x) for 2nd 8 bytes */
4777            "movq %%mm0, (%%edi,%%edx,)   \n\t" /* write Raw(x) for 1st 8 bytes */
4778
4779            /* Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes. */
4780            /* This will be repeated for each group of 8 bytes with the 8th */
4781            /* group being used as the Raw(x-bpp) for the 1st group of the */
4782            /* next loop. */
4783
4784            "paddb %%mm0, %%mm1           \n\t"
4785            "movq 16(%%edi,%%edx,), %%mm2 \n\t" /* load Sub(x) for 3rd 8 bytes */
4786            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" /* write Raw(x) for 2nd 8 bytes */
4787            "paddb %%mm1, %%mm2           \n\t"
4788            "movq 24(%%edi,%%edx,), %%mm3 \n\t" /* load Sub(x) for 4th 8 bytes */
4789            "movq %%mm2, 16(%%edi,%%edx,) \n\t" /* write Raw(x) for 3rd 8 bytes */
4790            "paddb %%mm2, %%mm3           \n\t"
4791            "movq 32(%%edi,%%edx,), %%mm4 \n\t" /* load Sub(x) for 5th 8 bytes */
4792            "movq %%mm3, 24(%%edi,%%edx,) \n\t" /* write Raw(x) for 4th 8 bytes */
4793            "paddb %%mm3, %%mm4           \n\t"
4794            "movq 40(%%edi,%%edx,), %%mm5 \n\t" /* load Sub(x) for 6th 8 bytes */
4795            "movq %%mm4, 32(%%edi,%%edx,) \n\t" /* write Raw(x) for 5th 8 bytes */
4796            "paddb %%mm4, %%mm5           \n\t"
4797            "movq 48(%%edi,%%edx,), %%mm6 \n\t" /* load Sub(x) for 7th 8 bytes */
4798            "movq %%mm5, 40(%%edi,%%edx,) \n\t" /* write Raw(x) for 6th 8 bytes */
4799            "paddb %%mm5, %%mm6           \n\t"
4800            "movq 56(%%edi,%%edx,), %%mm7 \n\t" /* load Sub(x) for 8th 8 bytes */
4801            "movq %%mm6, 48(%%edi,%%edx,) \n\t" /* write Raw(x) for 7th 8 bytes */
4802            "addl $64, %%edx              \n\t"
4803            "paddb %%mm6, %%mm7           \n\t"
4804            "cmpl %%ecx, %%edx            \n\t"
4805            "movq %%mm7, -8(%%edi,%%edx,) \n\t" /* write Raw(x) for 8th 8 bytes */
4806            "jb sub_8lp                   \n\t"
4807
4808            "cmpl _MMXLength, %%edx       \n\t"
4809            "jnb sub_8lt8                 \n\t"
4810
4811         "sub_8lpA:                       \n\t"
4812            "movq (%%edi,%%edx,), %%mm0   \n\t"
4813            "addl $8, %%edx               \n\t"
4814            "paddb %%mm7, %%mm0           \n\t"
4815            "cmpl _MMXLength, %%edx       \n\t"
4816            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* -8 to offset early addl edx */
4817            "movq %%mm0, %%mm7            \n\t" /* move calculated Raw(x) data */
4818                                                /*  to mm1 to be new Raw(x-bpp) */
4819                                                /*  for next loop */
4820            "jb sub_8lpA                  \n\t"
4821
4822         "sub_8lt8:                       \n\t"
4823
4824            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4825              "=D" (dummy_value_D)    /* 1 */
4826
4827            : "0" (bpp),              /* eax    // input regs */
4828              "1" (row)               /* edi */
4829
4830            : "%ecx", "%edx", "%esi"            /* clobber list */
4831#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4833#endif
4834         );
4835      }
4836      break;
4837
4838      default:                /* bpp greater than 8 bytes   GRR BOGUS */
4839      {
4840         __asm__ __volatile__ (
4841            "movl _dif, %%edx             \n\t"
4842/* preload  "movl row, %%edi              \n\t" */
4843            "movl %%edi, %%esi            \n\t" /* lp = row */
4844/* preload  "movl bpp, %%eax              \n\t" */
4845            "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4846
4847         "sub_Alp:                        \n\t"
4848            "movq (%%edi,%%edx,), %%mm0   \n\t"
4849            "movq (%%esi,%%edx,), %%mm1   \n\t"
4850            "addl $8, %%edx               \n\t"
4851            "paddb %%mm1, %%mm0           \n\t"
4852            "cmpl _MMXLength, %%edx       \n\t"
4853            "movq %%mm0, -8(%%edi,%%edx,) \n\t" /* mov does not affect flags; */
4854                                                /*  -8 to offset addl edx */
4855            "jb sub_Alp                   \n\t"
4856
4857            : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4858              "=D" (dummy_value_D)    /* 1 */
4859
4860            : "0" (bpp),              /* eax    // input regs */
4861              "1" (row)               /* edi */
4862
4863            : "%edx", "%esi"                    /* clobber list */
4864#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4865            , "%mm0", "%mm1"
4866#endif
4867         );
4868      }
4869      break;
4870
4871   } /* end switch (bpp) */
4872
4873   __asm__ __volatile__ (
4874      "movl _MMXLength, %%edx       \n\t"
4875/* pre "movl row, %%edi              \n\t" */
4876      "cmpl _FullLength, %%edx      \n\t"
4877      "jnb sub_end                  \n\t"
4878
4879      "movl %%edi, %%esi            \n\t" /* lp = row */
4880/* pre "movl bpp, %%eax              \n\t" */
4881      "addl %%eax, %%edi            \n\t" /* rp = row + bpp */
4882      "xorl %%eax, %%eax            \n\t"
4883
4884   "sub_lp2:                        \n\t"
4885      "movb (%%esi,%%edx,), %%al    \n\t"
4886      "addb %%al, (%%edi,%%edx,)    \n\t"
4887      "incl %%edx                   \n\t"
4888      "cmpl _FullLength, %%edx      \n\t"
4889      "jb sub_lp2                   \n\t"
4890
4891   "sub_end:                        \n\t"
4892      "EMMS                         \n\t" /* end MMX instructions */
4893
4894      : "=a" (dummy_value_a),   /* 0      // output regs (dummy) */
4895        "=D" (dummy_value_D)    /* 1 */
4896
4897      : "0" (bpp),              /* eax    // input regs */
4898        "1" (row)               /* edi */
4899
4900      : "%edx", "%esi"                    /* clobber list */
4901   );
4902
4903} /* end of png_read_filter_row_mmx_sub() */
4904#endif
4905
4906
4907
4908
4909/*===========================================================================*/
4910/*                                                                           */
4911/*            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            */
4912/*                                                                           */
4913/*===========================================================================*/
4914
4915/* Optimized code for PNG Up filter decoder */
4916
4917static void /* PRIVATE */
4918png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4919                           png_bytep prev_row)
4920{
4921   png_uint_32 len;
4922   int dummy_value_d;   /* fix 'forbidden register 3 (dx) was spilled' error */
4923   int dummy_value_S;
4924   int dummy_value_D;
4925
4926   len = row_info->rowbytes;              /* number of bytes to filter */
4927
4928   __asm__ __volatile__ (
4929/* pre "movl row, %%edi              \n\t" */
4930      /* get # of bytes to alignment */
4931#ifdef __PIC__
4932      "pushl %%ebx                  \n\t"
4933#endif
4934      "movl %%edi, %%ecx            \n\t"
4935      "xorl %%ebx, %%ebx            \n\t"
4936      "addl $0x7, %%ecx             \n\t"
4937      "xorl %%eax, %%eax            \n\t"
4938      "andl $0xfffffff8, %%ecx      \n\t"
4939/* pre "movl prev_row, %%esi         \n\t" */
4940      "subl %%edi, %%ecx            \n\t"
4941      "jz up_go                     \n\t"
4942
4943   "up_lp1:                         \n\t" /* fix alignment */
4944      "movb (%%edi,%%ebx,), %%al    \n\t"
4945      "addb (%%esi,%%ebx,), %%al    \n\t"
4946      "incl %%ebx                   \n\t"
4947      "cmpl %%ecx, %%ebx            \n\t"
4948      "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
4949      "jb up_lp1                    \n\t" /*  offset incl ebx */
4950
4951   "up_go:                          \n\t"
4952/* pre "movl len, %%edx              \n\t" */
4953      "movl %%edx, %%ecx            \n\t"
4954      "subl %%ebx, %%edx            \n\t" /* subtract alignment fix */
4955      "andl $0x0000003f, %%edx      \n\t" /* calc bytes over mult of 64 */
4956      "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */
4957
4958      /* unrolled loop - use all MMX registers and interleave to reduce */
4959      /* number of branch instructions (loops) and reduce partial stalls */
4960   "up_loop:                        \n\t"
4961      "movq (%%esi,%%ebx,), %%mm1   \n\t"
4962      "movq (%%edi,%%ebx,), %%mm0   \n\t"
4963      "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4964      "paddb %%mm1, %%mm0           \n\t"
4965      "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4966      "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4967      "paddb %%mm3, %%mm2           \n\t"
4968      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4969      "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4970      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4971      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4972      "paddb %%mm5, %%mm4           \n\t"
4973      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4974      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4975      "paddb %%mm7, %%mm6           \n\t"
4976      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4977      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4978      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4979      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4980      "paddb %%mm1, %%mm0           \n\t"
4981      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4982      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4983      "paddb %%mm3, %%mm2           \n\t"
4984      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4985      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4986      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4987      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4988      "paddb %%mm5, %%mm4           \n\t"
4989      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4990      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4991      "addl $64, %%ebx              \n\t"
4992      "paddb %%mm7, %%mm6           \n\t"
4993      "cmpl %%ecx, %%ebx            \n\t"
4994      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" /* (+56)movq does not affect flags; */
4995      "jb up_loop                   \n\t" /*  -8 to offset addl ebx */
4996
4997      "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 64 */
4998      "jz up_end                    \n\t"
4999
5000      "cmpl $8, %%edx               \n\t" /* test for less than 8 bytes */
5001      "jb up_lt8                    \n\t" /*  [added by lcreeve@netins.net] */
5002
5003      "addl %%edx, %%ecx            \n\t"
5004      "andl $0x00000007, %%edx      \n\t" /* calc bytes over mult of 8 */
5005      "subl %%edx, %%ecx            \n\t" /* drop over bytes from length */
5006      "jz up_lt8                    \n\t"
5007
5008   "up_lpA:                         \n\t" /* use MMX regs to update 8 bytes sim. */
5009      "movq (%%esi,%%ebx,), %%mm1   \n\t"
5010      "movq (%%edi,%%ebx,), %%mm0   \n\t"
5011      "addl $8, %%ebx               \n\t"
5012      "paddb %%mm1, %%mm0           \n\t"
5013      "cmpl %%ecx, %%ebx            \n\t"
5014      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" /* movq does not affect flags; -8 to */
5015      "jb up_lpA                    \n\t" /*  offset add ebx */
5016      "cmpl $0, %%edx               \n\t" /* test for bytes over mult of 8 */
5017      "jz up_end                    \n\t"
5018
5019   "up_lt8:                         \n\t"
5020      "xorl %%eax, %%eax            \n\t"
5021      "addl %%edx, %%ecx            \n\t" /* move over byte count into counter */
5022
5023   "up_lp2:                         \n\t" /* use x86 regs for remaining bytes */
5024      "movb (%%edi,%%ebx,), %%al    \n\t"
5025      "addb (%%esi,%%ebx,), %%al    \n\t"
5026      "incl %%ebx                   \n\t"
5027      "cmpl %%ecx, %%ebx            \n\t"
5028      "movb %%al, -1(%%edi,%%ebx,)  \n\t" /* mov does not affect flags; -1 to */
5029      "jb up_lp2                    \n\t" /*  offset inc ebx */
5030
5031   "up_end:                         \n\t"
5032      "EMMS                         \n\t" /* conversion of filtered row complete */
5033#ifdef __PIC__
5034      "popl %%ebx                   \n\t"
5035#endif
5036
5037      : "=d" (dummy_value_d),   /* 0      // output regs (dummy) */
5038        "=S" (dummy_value_S),   /* 1 */
5039        "=D" (dummy_value_D)    /* 2 */
5040
5041      : "0" (len),              /* edx    // input regs */
5042        "1" (prev_row),         /* esi */
5043        "2" (row)               /* edi */
5044
5045      : "%eax", "%ecx"            // clobber list (no input regs!)
5046#ifndef __PIC__
5047      , "%ebx"
5048#endif
5049
5050#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5051      , "%mm0", "%mm1", "%mm2", "%mm3"
5052      , "%mm4", "%mm5", "%mm6", "%mm7"
5053#endif
5054   );
5055
5056} /* end of png_read_filter_row_mmx_up() */
5057
5058#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5059
5060
5061
5062
5063/*===========================================================================*/
5064/*                                                                           */
5065/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5066/*                                                                           */
5067/*===========================================================================*/
5068
5069
5070/* Optimized png_read_filter_row routines */
5071
5072void /* PRIVATE */
5073png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5074   row, png_bytep prev_row, int filter)
5075{
5076#ifdef PNG_DEBUG
5077   char filnm[10];
5078#endif
5079
5080#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5081/* GRR:  these are superseded by png_ptr->asm_flags: */
5082#define UseMMX_sub    1   /* GRR:  converted 20000730 */
5083#define UseMMX_up     1   /* GRR:  converted 20000729 */
5084#define UseMMX_avg    1   /* GRR:  converted 20000828 (+ 16-bit bugfix 20000916) */
5085#define UseMMX_paeth  1   /* GRR:  converted 20000828 */
5086
5087   if (_mmx_supported == 2) {
5088       /* this should have happened in png_init_mmx_flags() already */
5089#if !defined(PNG_1_0_X)
5090       png_warning(png_ptr, "asm_flags may not have been initialized");
5091#endif
5092       png_mmx_support();
5093   }
5094#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5095
5096#ifdef PNG_DEBUG
5097   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5098   switch (filter)
5099   {
5100      case 0: sprintf(filnm, "none");
5101         break;
5102      case 1: sprintf(filnm, "sub-%s",
5103#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5104#if !defined(PNG_1_0_X)
5105        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5106#endif
5107#endif
5108"x86");
5109         break;
5110      case 2: sprintf(filnm, "up-%s",
5111#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5112#if !defined(PNG_1_0_X)
5113        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5114#endif
5115#endif
5116 "x86");
5117         break;
5118      case 3: sprintf(filnm, "avg-%s",
5119#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5120#if !defined(PNG_1_0_X)
5121        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5122#endif
5123#endif
5124 "x86");
5125         break;
5126      case 4: sprintf(filnm, "Paeth-%s",
5127#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5128#if !defined(PNG_1_0_X)
5129        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5130#endif
5131#endif
5132"x86");
5133         break;
5134      default: sprintf(filnm, "unknw");
5135         break;
5136   }
5137   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5138   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5139   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5140      (int)((row_info->pixel_depth + 7) >> 3));
5141   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5142#endif /* PNG_DEBUG */
5143
5144   switch (filter)
5145   {
5146      case PNG_FILTER_VALUE_NONE:
5147         break;
5148
5149      case PNG_FILTER_VALUE_SUB:
5150#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5151#if !defined(PNG_1_0_X)
5152         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5153             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5154             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5155#else
5156         if (_mmx_supported)
5157#endif
5158         {
5159            png_read_filter_row_mmx_sub(row_info, row);
5160         }
5161         else
5162#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5163         {
5164            png_uint_32 i;
5165            png_uint_32 istop = row_info->rowbytes;
5166            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5167            png_bytep rp = row + bpp;
5168            png_bytep lp = row;
5169
5170            for (i = bpp; i < istop; i++)
5171            {
5172               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5173               rp++;
5174            }
5175         }  /* end !UseMMX_sub */
5176         break;
5177
5178      case PNG_FILTER_VALUE_UP:
5179#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5180#if !defined(PNG_1_0_X)
5181         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5182             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5183             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5184#else
5185         if (_mmx_supported)
5186#endif
5187         {
5188            png_read_filter_row_mmx_up(row_info, row, prev_row);
5189         }
5190          else
5191#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5192         {
5193            png_uint_32 i;
5194            png_uint_32 istop = row_info->rowbytes;
5195            png_bytep rp = row;
5196            png_bytep pp = prev_row;
5197
5198            for (i = 0; i < istop; ++i)
5199            {
5200               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5201               rp++;
5202            }
5203         }  /* end !UseMMX_up */
5204         break;
5205
5206      case PNG_FILTER_VALUE_AVG:
5207#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5208#if !defined(PNG_1_0_X)
5209         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5210             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5211             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5212#else
5213         if (_mmx_supported)
5214#endif
5215         {
5216            png_read_filter_row_mmx_avg(row_info, row, prev_row);
5217         }
5218         else
5219#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5220         {
5221            png_uint_32 i;
5222            png_bytep rp = row;
5223            png_bytep pp = prev_row;
5224            png_bytep lp = row;
5225            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5226            png_uint_32 istop = row_info->rowbytes - bpp;
5227
5228            for (i = 0; i < bpp; i++)
5229            {
5230               *rp = (png_byte)(((int)(*rp) +
5231                  ((int)(*pp++) >> 1)) & 0xff);
5232               rp++;
5233            }
5234
5235            for (i = 0; i < istop; i++)
5236            {
5237               *rp = (png_byte)(((int)(*rp) +
5238                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5239               rp++;
5240            }
5241         }  /* end !UseMMX_avg */
5242         break;
5243
5244      case PNG_FILTER_VALUE_PAETH:
5245#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5246#if !defined(PNG_1_0_X)
5247         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5248             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5249             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5250#else
5251         if (_mmx_supported)
5252#endif
5253         {
5254            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5255         }
5256         else
5257#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5258         {
5259            png_uint_32 i;
5260            png_bytep rp = row;
5261            png_bytep pp = prev_row;
5262            png_bytep lp = row;
5263            png_bytep cp = prev_row;
5264            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5265            png_uint_32 istop = row_info->rowbytes - bpp;
5266
5267            for (i = 0; i < bpp; i++)
5268            {
5269               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5270               rp++;
5271            }
5272
5273            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5274            {
5275               int a, b, c, pa, pb, pc, p;
5276
5277               a = *lp++;
5278               b = *pp++;
5279               c = *cp++;
5280
5281               p = b - c;
5282               pc = a - c;
5283
5284#ifdef PNG_USE_ABS
5285               pa = abs(p);
5286               pb = abs(pc);
5287               pc = abs(p + pc);
5288#else
5289               pa = p < 0 ? -p : p;
5290               pb = pc < 0 ? -pc : pc;
5291               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5292#endif
5293
5294               /*
5295                  if (pa <= pb && pa <= pc)
5296                     p = a;
5297                  else if (pb <= pc)
5298                     p = b;
5299                  else
5300                     p = c;
5301                */
5302
5303               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5304
5305               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5306               rp++;
5307            }
5308         }  /* end !UseMMX_paeth */
5309         break;
5310
5311      default:
5312         png_warning(png_ptr, "Ignoring bad row-filter type");
5313         *row=0;
5314         break;
5315   }
5316}
5317
5318#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5319
5320
5321/*===========================================================================*/
5322/*                                                                           */
5323/*                      P N G _ M M X _ S U P P O R T                        */
5324/*                                                                           */
5325/*===========================================================================*/
5326
5327/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
5328 *             (2) all instructions compile with gcc 2.7.2.3 and later
5329 *             (3) the function is moved down here to prevent gcc from
5330 *                  inlining it in multiple places and then barfing be-
5331 *                  cause the ".NOT_SUPPORTED" label is multiply defined
5332 *             [is there a way to signal that a *single* function should
5333 *              not be inlined?  is there a way to modify the label for
5334 *              each inlined instance, e.g., by appending _1, _2, etc.?
5335 *              maybe if don't use leading "." in label name? (nope...sigh)]
5336 */
5337
5338int PNGAPI
5339png_mmx_support(void)
5340{
5341#if defined(PNG_MMX_CODE_SUPPORTED)
5342    __asm__ __volatile__ (
5343        "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
5344        "pushl %%ecx          \n\t"  // so does ecx...
5345        "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
5346//      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
5347//      "pushf                \n\t"  // 16-bit pushf
5348        "pushfl               \n\t"  // save Eflag to stack
5349        "popl %%eax           \n\t"  // get Eflag from stack into eax
5350        "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
5351        "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5352        "pushl %%eax          \n\t"  // save modified Eflag back to stack
5353//      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
5354//      "popf                 \n\t"  // 16-bit popf
5355        "popfl                \n\t"  // restore modified value to Eflag reg
5356        "pushfl               \n\t"  // save Eflag to stack
5357        "popl %%eax           \n\t"  // get Eflag from stack
5358        "pushl %%ecx          \n\t"  // save original Eflag to stack
5359        "popfl                \n\t"  // restore original Eflag
5360        "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
5361        "jz 0f                \n\t"  // if same, CPUID instr. is not supported
5362
5363        "xorl %%eax, %%eax    \n\t"  // set eax to zero
5364//      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
5365        "cpuid                \n\t"  // get the CPU identification info
5366        "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
5367        "jl 0f                \n\t"  // if eax is zero, MMX is not supported
5368
5369        "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5370        "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5371                                     // faster than the instruction "mov eax, 1"
5372        "cpuid                \n\t"  // get the CPU identification info again
5373        "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5374        "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5375        "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
5376
5377        "movl $1, %%eax       \n\t"  // set return value to 1
5378        "jmp  1f              \n\t"  // DONE:  have MMX support
5379
5380    "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
5381        "movl $0, %%eax       \n\t"  // set return value to 0
5382    "1:                       \n\t"  // .RETURN: target label for jump instructions
5383        "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5384        "popl %%edx           \n\t"  // restore edx
5385        "popl %%ecx           \n\t"  // restore ecx
5386        "popl %%ebx           \n\t"  // restore ebx
5387
5388//      "ret                  \n\t"  // DONE:  no MMX support
5389                                     // (fall through to standard C "ret")
5390
5391        :                            // output list (none)
5392
5393        :                            // any variables used on input (none)
5394
5395        : "%eax"                     // clobber list
5396//      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5397//      , "memory"   // if write to a variable gcc thought was in a reg
5398//      , "cc"       // "condition codes" (flag bits)
5399    );
5400#else
5401    _mmx_supported = 0;
5402#endif /* PNG_MMX_CODE_SUPPORTED */
5403
5404    return _mmx_supported;
5405}
5406
5407
5408#endif /* PNG_USE_PNGGCCRD */
5409