1/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
5 * libpng version 1.2.7 - September 12, 2004
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
19 *
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
22 * [runtime MMX configuration, GRR 20010102]
23 *
24 */
25
26#define PNG_INTERNAL
27#include "png.h"
28
29#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30
31static int mmx_supported=2;
32
33
34int PNGAPI
35png_mmx_support(void)
36{
37  int mmx_supported_local = 0;
38  _asm {
39    push ebx          /*CPUID will trash these */
40    push ecx
41    push edx
42
43    pushfd            /*Save Eflag to stack */
44    pop eax           /*Get Eflag from stack into eax */
45    mov ecx, eax      /*Make another copy of Eflag in ecx */
46    xor eax, 0x200000 /*Toggle ID bit in Eflag [i.e. bit(21)] */
47    push eax          /*Save modified Eflag back to stack */
48
49    popfd             /*Restored modified value back to Eflag reg */
50    pushfd            /*Save Eflag to stack */
51    pop eax           /*Get Eflag from stack */
52    push ecx          /* save original Eflag to stack */
53    popfd             /* restore original Eflag */
54    xor eax, ecx      /*Compare the new Eflag with the original Eflag */
55    jz NOT_SUPPORTED  /*If the same, CPUID instruction is not supported, */
56                      /*skip following instructions and jump to */
57                      /*NOT_SUPPORTED label */
58
59    xor eax, eax      /*Set eax to zero */
60
61    _asm _emit 0x0f   /*CPUID instruction  (two bytes opcode) */
62    _asm _emit 0xa2
63
64    cmp eax, 1        /*make sure eax return non-zero value */
65    jl NOT_SUPPORTED  /*If eax is zero, mmx not supported */
66
67    xor eax, eax      /*set eax to zero */
68    inc eax           /*Now increment eax to 1.  This instruction is */
69                      /*faster than the instruction "mov eax, 1" */
70
71    _asm _emit 0x0f   /*CPUID instruction */
72    _asm _emit 0xa2
73
74    and edx, 0x00800000  /*mask out all bits but mmx bit(24) */
75    cmp edx, 0        /* 0 = mmx not supported */
76    jz  NOT_SUPPORTED /* non-zero = Yes, mmx IS supported */
77
78    mov  mmx_supported_local, 1  /*set return value to 1 */
79
80NOT_SUPPORTED:
81    mov  eax, mmx_supported_local  /*move return value to eax */
82    pop edx          /*CPUID trashed these */
83    pop ecx
84    pop ebx
85  }
86
87  /*mmx_supported_local=0; // test code for force don't support MMX */
88  /*printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); */
89
90  mmx_supported = mmx_supported_local;
91  return mmx_supported_local;
92}
93
94/* Combines the row recently read in with the previous row.
95   This routine takes care of alpha and transparency if requested.
96   This routine also handles the two methods of progressive display
97   of interlaced images, depending on the mask value.
98   The mask value describes which pixels are to be combined with
99   the row.  The pattern always repeats every 8 pixels, so just 8
100   bits are needed.  A one indicates the pixel is to be combined; a
101   zero indicates the pixel is to be skipped.  This is in addition
102   to any alpha or transparency value associated with the pixel.  If
103   you want all pixels to be combined, pass 0xff (255) in mask.  */
104
105/* Use this routine for x86 platform - uses faster MMX routine if machine
106   supports MMX */
107
108void /* PRIVATE */
109png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110{
111#ifdef PNG_USE_LOCAL_ARRAYS
112   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113#endif
114
115   png_debug(1,"in png_combine_row_asm\n");
116
117   if (mmx_supported == 2) {
118#if !defined(PNG_1_0_X)
119       /* this should have happened in png_init_mmx_flags() already */
120       png_warning(png_ptr, "asm_flags may not have been initialized");
121#endif
122       png_mmx_support();
123   }
124
125   if (mask == 0xff)
126   {
127      png_memcpy(row, png_ptr->row_buf + 1,
128       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
129       png_ptr->width));
130   }
131   /* GRR:  add "else if (mask == 0)" case?
132    *       or does png_combine_row() not even get called in that case? */
133   else
134   {
135      switch (png_ptr->row_info.pixel_depth)
136      {
137         case 1:
138         {
139            png_bytep sp;
140            png_bytep dp;
141            int s_inc, s_start, s_end;
142            int m;
143            int shift;
144            png_uint_32 i;
145
146            sp = png_ptr->row_buf + 1;
147            dp = row;
148            m = 0x80;
149#if defined(PNG_READ_PACKSWAP_SUPPORTED)
150            if (png_ptr->transformations & PNG_PACKSWAP)
151            {
152                s_start = 0;
153                s_end = 7;
154                s_inc = 1;
155            }
156            else
157#endif
158            {
159                s_start = 7;
160                s_end = 0;
161                s_inc = -1;
162            }
163
164            shift = s_start;
165
166            for (i = 0; i < png_ptr->width; i++)
167            {
168               if (m & mask)
169               {
170                  int value;
171
172                  value = (*sp >> shift) & 0x1;
173                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
174                  *dp |= (png_byte)(value << shift);
175               }
176
177               if (shift == s_end)
178               {
179                  shift = s_start;
180                  sp++;
181                  dp++;
182               }
183               else
184                  shift += s_inc;
185
186               if (m == 1)
187                  m = 0x80;
188               else
189                  m >>= 1;
190            }
191            break;
192         }
193
194         case 2:
195         {
196            png_bytep sp;
197            png_bytep dp;
198            int s_start, s_end, s_inc;
199            int m;
200            int shift;
201            png_uint_32 i;
202            int value;
203
204            sp = png_ptr->row_buf + 1;
205            dp = row;
206            m = 0x80;
207#if defined(PNG_READ_PACKSWAP_SUPPORTED)
208            if (png_ptr->transformations & PNG_PACKSWAP)
209            {
210               s_start = 0;
211               s_end = 6;
212               s_inc = 2;
213            }
214            else
215#endif
216            {
217               s_start = 6;
218               s_end = 0;
219               s_inc = -2;
220            }
221
222            shift = s_start;
223
224            for (i = 0; i < png_ptr->width; i++)
225            {
226               if (m & mask)
227               {
228                  value = (*sp >> shift) & 0x3;
229                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
230                  *dp |= (png_byte)(value << shift);
231               }
232
233               if (shift == s_end)
234               {
235                  shift = s_start;
236                  sp++;
237                  dp++;
238               }
239               else
240                  shift += s_inc;
241               if (m == 1)
242                  m = 0x80;
243               else
244                  m >>= 1;
245            }
246            break;
247         }
248
249         case 4:
250         {
251            png_bytep sp;
252            png_bytep dp;
253            int s_start, s_end, s_inc;
254            int m;
255            int shift;
256            png_uint_32 i;
257            int value;
258
259            sp = png_ptr->row_buf + 1;
260            dp = row;
261            m = 0x80;
262#if defined(PNG_READ_PACKSWAP_SUPPORTED)
263            if (png_ptr->transformations & PNG_PACKSWAP)
264            {
265               s_start = 0;
266               s_end = 4;
267               s_inc = 4;
268            }
269            else
270#endif
271            {
272               s_start = 4;
273               s_end = 0;
274               s_inc = -4;
275            }
276            shift = s_start;
277
278            for (i = 0; i < png_ptr->width; i++)
279            {
280               if (m & mask)
281               {
282                  value = (*sp >> shift) & 0xf;
283                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
284                  *dp |= (png_byte)(value << shift);
285               }
286
287               if (shift == s_end)
288               {
289                  shift = s_start;
290                  sp++;
291                  dp++;
292               }
293               else
294                  shift += s_inc;
295               if (m == 1)
296                  m = 0x80;
297               else
298                  m >>= 1;
299            }
300            break;
301         }
302
303         case 8:
304         {
305            png_bytep srcptr;
306            png_bytep dstptr;
307            png_uint_32 len;
308            int m;
309            int diff, unmask;
310
311            __int64 mask0=0x0102040810204080;
312
313#if !defined(PNG_1_0_X)
314            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
315                /* && mmx_supported */ )
316#else
317            if (mmx_supported)
318#endif
319            {
320               srcptr = png_ptr->row_buf + 1;
321               dstptr = row;
322               m = 0x80;
323               unmask = ~mask;
324               len  = png_ptr->width &~7;  /*reduce to multiple of 8 */
325               diff = png_ptr->width & 7;  /*amount lost */
326
327               _asm
328               {
329                  movd       mm7, unmask   /*load bit pattern */
330                  psubb      mm6,mm6       /*zero mm6 */
331                  punpcklbw  mm7,mm7
332                  punpcklwd  mm7,mm7
333                  punpckldq  mm7,mm7       /*fill register with 8 masks */
334
335                  movq       mm0,mask0
336
337                  pand       mm0,mm7       /*nonzero if keep byte */
338                  pcmpeqb    mm0,mm6       /*zeros->1s, v versa */
339
340                  mov        ecx,len       /*load length of line (pixels) */
341                  mov        esi,srcptr    /*load source */
342                  mov        ebx,dstptr    /*load dest */
343                  cmp        ecx,0         /*lcr */
344                  je         mainloop8end
345
346mainloop8:
347                  movq       mm4,[esi]
348                  pand       mm4,mm0
349                  movq       mm6,mm0
350                  pandn      mm6,[ebx]
351                  por        mm4,mm6
352                  movq       [ebx],mm4
353
354                  add        esi,8         /*inc by 8 bytes processed */
355                  add        ebx,8
356                  sub        ecx,8         /*dec by 8 pixels processed */
357
358                  ja         mainloop8
359mainloop8end:
360
361                  mov        ecx,diff
362                  cmp        ecx,0
363                  jz         end8
364
365                  mov        edx,mask
366                  sal        edx,24        /*make low byte the high byte */
367
368secondloop8:
369                  sal        edx,1         /*move high bit to CF */
370                  jnc        skip8         /*if CF = 0 */
371                  mov        al,[esi]
372                  mov        [ebx],al
373skip8:
374                  inc        esi
375                  inc        ebx
376
377                  dec        ecx
378                  jnz        secondloop8
379end8:
380                  emms
381               }
382            }
383            else /* mmx not supported - use modified C routine */
384            {
385               register unsigned int incr1, initial_val, final_val;
386               png_size_t pixel_bytes;
387               png_uint_32 i;
388               register int disp = png_pass_inc[png_ptr->pass];
389               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
390
391               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
392               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
393                  pixel_bytes;
394               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
395               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
396               final_val = png_ptr->width*pixel_bytes;
397               incr1 = (disp)*pixel_bytes;
398               for (i = initial_val; i < final_val; i += incr1)
399               {
400                  png_memcpy(dstptr, srcptr, pixel_bytes);
401                  srcptr += incr1;
402                  dstptr += incr1;
403               }
404            } /* end of else */
405
406            break;
407         }       /* end 8 bpp */
408
409         case 16:
410         {
411            png_bytep srcptr;
412            png_bytep dstptr;
413            png_uint_32 len;
414            int unmask, diff;
415            __int64 mask1=0x0101020204040808,
416                    mask0=0x1010202040408080;
417
418#if !defined(PNG_1_0_X)
419            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
420                /* && mmx_supported */ )
421#else
422            if (mmx_supported)
423#endif
424            {
425               srcptr = png_ptr->row_buf + 1;
426               dstptr = row;
427
428               unmask = ~mask;
429               len     = (png_ptr->width)&~7;
430               diff = (png_ptr->width)&7;
431               _asm
432               {
433                  movd       mm7, unmask       /*load bit pattern */
434                  psubb      mm6,mm6           /*zero mm6 */
435                  punpcklbw  mm7,mm7
436                  punpcklwd  mm7,mm7
437                  punpckldq  mm7,mm7           /*fill register with 8 masks */
438
439                  movq       mm0,mask0
440                  movq       mm1,mask1
441
442                  pand       mm0,mm7
443                  pand       mm1,mm7
444
445                  pcmpeqb    mm0,mm6
446                  pcmpeqb    mm1,mm6
447
448                  mov        ecx,len           /*load length of line */
449                  mov        esi,srcptr        /*load source */
450                  mov        ebx,dstptr        /*load dest */
451                  cmp        ecx,0             /*lcr */
452                  jz         mainloop16end
453
454mainloop16:
455                  movq       mm4,[esi]
456                  pand       mm4,mm0
457                  movq       mm6,mm0
458                  movq       mm7,[ebx]
459                  pandn      mm6,mm7
460                  por        mm4,mm6
461                  movq       [ebx],mm4
462
463                  movq       mm5,[esi+8]
464                  pand       mm5,mm1
465                  movq       mm7,mm1
466                  movq       mm6,[ebx+8]
467                  pandn      mm7,mm6
468                  por        mm5,mm7
469                  movq       [ebx+8],mm5
470
471                  add        esi,16            /*inc by 16 bytes processed */
472                  add        ebx,16
473                  sub        ecx,8             /*dec by 8 pixels processed */
474
475                  ja         mainloop16
476
477mainloop16end:
478                  mov        ecx,diff
479                  cmp        ecx,0
480                  jz         end16
481
482                  mov        edx,mask
483                  sal        edx,24            /*make low byte the high byte */
484secondloop16:
485                  sal        edx,1             /*move high bit to CF */
486                  jnc        skip16            /*if CF = 0 */
487                  mov        ax,[esi]
488                  mov        [ebx],ax
489skip16:
490                  add        esi,2
491                  add        ebx,2
492
493                  dec        ecx
494                  jnz        secondloop16
495end16:
496                  emms
497               }
498            }
499            else /* mmx not supported - use modified C routine */
500            {
501               register unsigned int incr1, initial_val, final_val;
502               png_size_t pixel_bytes;
503               png_uint_32 i;
504               register int disp = png_pass_inc[png_ptr->pass];
505               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
506
507               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
508               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
509                  pixel_bytes;
510               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
511               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
512               final_val = png_ptr->width*pixel_bytes;
513               incr1 = (disp)*pixel_bytes;
514               for (i = initial_val; i < final_val; i += incr1)
515               {
516                  png_memcpy(dstptr, srcptr, pixel_bytes);
517                  srcptr += incr1;
518                  dstptr += incr1;
519               }
520            } /* end of else */
521
522            break;
523         }       /* end 16 bpp */
524
525         case 24:
526         {
527            png_bytep srcptr;
528            png_bytep dstptr;
529            png_uint_32 len;
530            int unmask, diff;
531
532            __int64 mask2=0x0101010202020404,  /*24bpp */
533                    mask1=0x0408080810101020,
534                    mask0=0x2020404040808080;
535
536            srcptr = png_ptr->row_buf + 1;
537            dstptr = row;
538
539            unmask = ~mask;
540            len     = (png_ptr->width)&~7;
541            diff = (png_ptr->width)&7;
542
543#if !defined(PNG_1_0_X)
544            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
545                /* && mmx_supported */ )
546#else
547            if (mmx_supported)
548#endif
549            {
550               _asm
551               {
552                  movd       mm7, unmask       /*load bit pattern */
553                  psubb      mm6,mm6           /*zero mm6 */
554                  punpcklbw  mm7,mm7
555                  punpcklwd  mm7,mm7
556                  punpckldq  mm7,mm7           /*fill register with 8 masks */
557
558                  movq       mm0,mask0
559                  movq       mm1,mask1
560                  movq       mm2,mask2
561
562                  pand       mm0,mm7
563                  pand       mm1,mm7
564                  pand       mm2,mm7
565
566                  pcmpeqb    mm0,mm6
567                  pcmpeqb    mm1,mm6
568                  pcmpeqb    mm2,mm6
569
570                  mov        ecx,len           /*load length of line */
571                  mov        esi,srcptr        /*load source */
572                  mov        ebx,dstptr        /*load dest */
573                  cmp        ecx,0
574                  jz         mainloop24end
575
576mainloop24:
577                  movq       mm4,[esi]
578                  pand       mm4,mm0
579                  movq       mm6,mm0
580                  movq       mm7,[ebx]
581                  pandn      mm6,mm7
582                  por        mm4,mm6
583                  movq       [ebx],mm4
584
585
586                  movq       mm5,[esi+8]
587                  pand       mm5,mm1
588                  movq       mm7,mm1
589                  movq       mm6,[ebx+8]
590                  pandn      mm7,mm6
591                  por        mm5,mm7
592                  movq       [ebx+8],mm5
593
594                  movq       mm6,[esi+16]
595                  pand       mm6,mm2
596                  movq       mm4,mm2
597                  movq       mm7,[ebx+16]
598                  pandn      mm4,mm7
599                  por        mm6,mm4
600                  movq       [ebx+16],mm6
601
602                  add        esi,24            /*inc by 24 bytes processed */
603                  add        ebx,24
604                  sub        ecx,8             /*dec by 8 pixels processed */
605
606                  ja         mainloop24
607
608mainloop24end:
609                  mov        ecx,diff
610                  cmp        ecx,0
611                  jz         end24
612
613                  mov        edx,mask
614                  sal        edx,24            /*make low byte the high byte */
615secondloop24:
616                  sal        edx,1             /*move high bit to CF */
617                  jnc        skip24            /*if CF = 0 */
618                  mov        ax,[esi]
619                  mov        [ebx],ax
620                  xor        eax,eax
621                  mov        al,[esi+2]
622                  mov        [ebx+2],al
623skip24:
624                  add        esi,3
625                  add        ebx,3
626
627                  dec        ecx
628                  jnz        secondloop24
629
630end24:
631                  emms
632               }
633            }
634            else /* mmx not supported - use modified C routine */
635            {
636               register unsigned int incr1, initial_val, final_val;
637               png_size_t pixel_bytes;
638               png_uint_32 i;
639               register int disp = png_pass_inc[png_ptr->pass];
640               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
641
642               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
643               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
644                  pixel_bytes;
645               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
646               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
647               final_val = png_ptr->width*pixel_bytes;
648               incr1 = (disp)*pixel_bytes;
649               for (i = initial_val; i < final_val; i += incr1)
650               {
651                  png_memcpy(dstptr, srcptr, pixel_bytes);
652                  srcptr += incr1;
653                  dstptr += incr1;
654               }
655            } /* end of else */
656
657            break;
658         }       /* end 24 bpp */
659
660         case 32:
661         {
662            png_bytep srcptr;
663            png_bytep dstptr;
664            png_uint_32 len;
665            int unmask, diff;
666
667            __int64 mask3=0x0101010102020202,  /*32bpp */
668                    mask2=0x0404040408080808,
669                    mask1=0x1010101020202020,
670                    mask0=0x4040404080808080;
671
672            srcptr = png_ptr->row_buf + 1;
673            dstptr = row;
674
675            unmask = ~mask;
676            len     = (png_ptr->width)&~7;
677            diff = (png_ptr->width)&7;
678
679#if !defined(PNG_1_0_X)
680            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
681                /* && mmx_supported */ )
682#else
683            if (mmx_supported)
684#endif
685            {
686               _asm
687               {
688                  movd       mm7, unmask       /*load bit pattern */
689                  psubb      mm6,mm6           /*zero mm6 */
690                  punpcklbw  mm7,mm7
691                  punpcklwd  mm7,mm7
692                  punpckldq  mm7,mm7           /*fill register with 8 masks */
693
694                  movq       mm0,mask0
695                  movq       mm1,mask1
696                  movq       mm2,mask2
697                  movq       mm3,mask3
698
699                  pand       mm0,mm7
700                  pand       mm1,mm7
701                  pand       mm2,mm7
702                  pand       mm3,mm7
703
704                  pcmpeqb    mm0,mm6
705                  pcmpeqb    mm1,mm6
706                  pcmpeqb    mm2,mm6
707                  pcmpeqb    mm3,mm6
708
709                  mov        ecx,len           /*load length of line */
710                  mov        esi,srcptr        /*load source */
711                  mov        ebx,dstptr        /*load dest */
712
713                  cmp        ecx,0             /*lcr */
714                  jz         mainloop32end
715
716mainloop32:
717                  movq       mm4,[esi]
718                  pand       mm4,mm0
719                  movq       mm6,mm0
720                  movq       mm7,[ebx]
721                  pandn      mm6,mm7
722                  por        mm4,mm6
723                  movq       [ebx],mm4
724
725                  movq       mm5,[esi+8]
726                  pand       mm5,mm1
727                  movq       mm7,mm1
728                  movq       mm6,[ebx+8]
729                  pandn      mm7,mm6
730                  por        mm5,mm7
731                  movq       [ebx+8],mm5
732
733                  movq       mm6,[esi+16]
734                  pand       mm6,mm2
735                  movq       mm4,mm2
736                  movq       mm7,[ebx+16]
737                  pandn      mm4,mm7
738                  por        mm6,mm4
739                  movq       [ebx+16],mm6
740
741                  movq       mm7,[esi+24]
742                  pand       mm7,mm3
743                  movq       mm5,mm3
744                  movq       mm4,[ebx+24]
745                  pandn      mm5,mm4
746                  por        mm7,mm5
747                  movq       [ebx+24],mm7
748
749                  add        esi,32            /*inc by 32 bytes processed */
750                  add        ebx,32
751                  sub        ecx,8             /*dec by 8 pixels processed */
752
753                  ja         mainloop32
754
755mainloop32end:
756                  mov        ecx,diff
757                  cmp        ecx,0
758                  jz         end32
759
760                  mov        edx,mask
761                  sal        edx,24            /*make low byte the high byte */
762secondloop32:
763                  sal        edx,1             /*move high bit to CF */
764                  jnc        skip32            /*if CF = 0 */
765                  mov        eax,[esi]
766                  mov        [ebx],eax
767skip32:
768                  add        esi,4
769                  add        ebx,4
770
771                  dec        ecx
772                  jnz        secondloop32
773
774end32:
775                  emms
776               }
777            }
778            else /* mmx _not supported - Use modified C routine */
779            {
780               register unsigned int incr1, initial_val, final_val;
781               png_size_t pixel_bytes;
782               png_uint_32 i;
783               register int disp = png_pass_inc[png_ptr->pass];
784               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
785
786               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
787               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
788                  pixel_bytes;
789               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
790               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
791               final_val = png_ptr->width*pixel_bytes;
792               incr1 = (disp)*pixel_bytes;
793               for (i = initial_val; i < final_val; i += incr1)
794               {
795                  png_memcpy(dstptr, srcptr, pixel_bytes);
796                  srcptr += incr1;
797                  dstptr += incr1;
798               }
799            } /* end of else */
800
801            break;
802         }       /* end 32 bpp */
803
804         case 48:
805         {
806            png_bytep srcptr;
807            png_bytep dstptr;
808            png_uint_32 len;
809            int unmask, diff;
810
811            __int64 mask5=0x0101010101010202,
812                    mask4=0x0202020204040404,
813                    mask3=0x0404080808080808,
814                    mask2=0x1010101010102020,
815                    mask1=0x2020202040404040,
816                    mask0=0x4040808080808080;
817
818#if !defined(PNG_1_0_X)
819            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
820                /* && mmx_supported */ )
821#else
822            if (mmx_supported)
823#endif
824            {
825               srcptr = png_ptr->row_buf + 1;
826               dstptr = row;
827
828               unmask = ~mask;
829               len     = (png_ptr->width)&~7;
830               diff = (png_ptr->width)&7;
831               _asm
832               {
833                  movd       mm7, unmask       /*load bit pattern */
834                  psubb      mm6,mm6           /*zero mm6 */
835                  punpcklbw  mm7,mm7
836                  punpcklwd  mm7,mm7
837                  punpckldq  mm7,mm7           /*fill register with 8 masks */
838
839                  movq       mm0,mask0
840                  movq       mm1,mask1
841                  movq       mm2,mask2
842                  movq       mm3,mask3
843                  movq       mm4,mask4
844                  movq       mm5,mask5
845
846                  pand       mm0,mm7
847                  pand       mm1,mm7
848                  pand       mm2,mm7
849                  pand       mm3,mm7
850                  pand       mm4,mm7
851                  pand       mm5,mm7
852
853                  pcmpeqb    mm0,mm6
854                  pcmpeqb    mm1,mm6
855                  pcmpeqb    mm2,mm6
856                  pcmpeqb    mm3,mm6
857                  pcmpeqb    mm4,mm6
858                  pcmpeqb    mm5,mm6
859
860                  mov        ecx,len           /*load length of line */
861                  mov        esi,srcptr        /*load source */
862                  mov        ebx,dstptr        /*load dest */
863
864                  cmp        ecx,0
865                  jz         mainloop48end
866
867mainloop48:
868                  movq       mm7,[esi]
869                  pand       mm7,mm0
870                  movq       mm6,mm0
871                  pandn      mm6,[ebx]
872                  por        mm7,mm6
873                  movq       [ebx],mm7
874
875                  movq       mm6,[esi+8]
876                  pand       mm6,mm1
877                  movq       mm7,mm1
878                  pandn      mm7,[ebx+8]
879                  por        mm6,mm7
880                  movq       [ebx+8],mm6
881
882                  movq       mm6,[esi+16]
883                  pand       mm6,mm2
884                  movq       mm7,mm2
885                  pandn      mm7,[ebx+16]
886                  por        mm6,mm7
887                  movq       [ebx+16],mm6
888
889                  movq       mm7,[esi+24]
890                  pand       mm7,mm3
891                  movq       mm6,mm3
892                  pandn      mm6,[ebx+24]
893                  por        mm7,mm6
894                  movq       [ebx+24],mm7
895
896                  movq       mm6,[esi+32]
897                  pand       mm6,mm4
898                  movq       mm7,mm4
899                  pandn      mm7,[ebx+32]
900                  por        mm6,mm7
901                  movq       [ebx+32],mm6
902
903                  movq       mm7,[esi+40]
904                  pand       mm7,mm5
905                  movq       mm6,mm5
906                  pandn      mm6,[ebx+40]
907                  por        mm7,mm6
908                  movq       [ebx+40],mm7
909
910                  add        esi,48            /*inc by 32 bytes processed */
911                  add        ebx,48
912                  sub        ecx,8             /*dec by 8 pixels processed */
913
914                  ja         mainloop48
915mainloop48end:
916
917                  mov        ecx,diff
918                  cmp        ecx,0
919                  jz         end48
920
921                  mov        edx,mask
922                  sal        edx,24            /*make low byte the high byte */
923
924secondloop48:
925                  sal        edx,1             /*move high bit to CF */
926                  jnc        skip48            /*if CF = 0 */
927                  mov        eax,[esi]
928                  mov        [ebx],eax
929skip48:
930                  add        esi,4
931                  add        ebx,4
932
933                  dec        ecx
934                  jnz        secondloop48
935
936end48:
937                  emms
938               }
939            }
940            else /* mmx _not supported - Use modified C routine */
941            {
942               register unsigned int incr1, initial_val, final_val;
943               png_size_t pixel_bytes;
944               png_uint_32 i;
945               register int disp = png_pass_inc[png_ptr->pass];
946               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947
948               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950                  pixel_bytes;
951               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
952               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953               final_val = png_ptr->width*pixel_bytes;
954               incr1 = (disp)*pixel_bytes;
955               for (i = initial_val; i < final_val; i += incr1)
956               {
957                  png_memcpy(dstptr, srcptr, pixel_bytes);
958                  srcptr += incr1;
959                  dstptr += incr1;
960               }
961            } /* end of else */
962
963            break;
964         }       /* end 48 bpp */
965
966         default:
967         {
968            png_bytep sptr;
969            png_bytep dp;
970            png_size_t pixel_bytes;
971            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
972            unsigned int i;
973            register int disp = png_pass_inc[png_ptr->pass];  /* get the offset */
974            register unsigned int incr1, initial_val, final_val;
975
976            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
977            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
978               pixel_bytes;
979            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
980            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
981            final_val = png_ptr->width*pixel_bytes;
982            incr1 = (disp)*pixel_bytes;
983            for (i = initial_val; i < final_val; i += incr1)
984            {
985               png_memcpy(dp, sptr, pixel_bytes);
986               sptr += incr1;
987               dp += incr1;
988            }
989            break;
990         }
991      } /* end switch (png_ptr->row_info.pixel_depth) */
992   } /* end if (non-trivial mask) */
993
994} /* end png_combine_row() */
995
996
997#if defined(PNG_READ_INTERLACING_SUPPORTED)
998
999void /* PRIVATE */
1000png_do_read_interlace(png_structp png_ptr)
1001{
1002   png_row_infop row_info = &(png_ptr->row_info);
1003   png_bytep row = png_ptr->row_buf + 1;
1004   int pass = png_ptr->pass;
1005   png_uint_32 transformations = png_ptr->transformations;
1006#ifdef PNG_USE_LOCAL_ARRAYS
1007   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1008#endif
1009
1010   png_debug(1,"in png_do_read_interlace\n");
1011
1012   if (mmx_supported == 2) {
1013#if !defined(PNG_1_0_X)
1014       /* this should have happened in png_init_mmx_flags() already */
1015       png_warning(png_ptr, "asm_flags may not have been initialized");
1016#endif
1017       png_mmx_support();
1018   }
1019
1020   if (row != NULL && row_info != NULL)
1021   {
1022      png_uint_32 final_width;
1023
1024      final_width = row_info->width * png_pass_inc[pass];
1025
1026      switch (row_info->pixel_depth)
1027      {
1028         case 1:
1029         {
1030            png_bytep sp, dp;
1031            int sshift, dshift;
1032            int s_start, s_end, s_inc;
1033            png_byte v;
1034            png_uint_32 i;
1035            int j;
1036
1037            sp = row + (png_size_t)((row_info->width - 1) >> 3);
1038            dp = row + (png_size_t)((final_width - 1) >> 3);
1039#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040            if (transformations & PNG_PACKSWAP)
1041            {
1042               sshift = (int)((row_info->width + 7) & 7);
1043               dshift = (int)((final_width + 7) & 7);
1044               s_start = 7;
1045               s_end = 0;
1046               s_inc = -1;
1047            }
1048            else
1049#endif
1050            {
1051               sshift = 7 - (int)((row_info->width + 7) & 7);
1052               dshift = 7 - (int)((final_width + 7) & 7);
1053               s_start = 0;
1054               s_end = 7;
1055               s_inc = 1;
1056            }
1057
1058            for (i = row_info->width; i; i--)
1059            {
1060               v = (png_byte)((*sp >> sshift) & 0x1);
1061               for (j = 0; j < png_pass_inc[pass]; j++)
1062               {
1063                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1064                  *dp |= (png_byte)(v << dshift);
1065                  if (dshift == s_end)
1066                  {
1067                     dshift = s_start;
1068                     dp--;
1069                  }
1070                  else
1071                     dshift += s_inc;
1072               }
1073               if (sshift == s_end)
1074               {
1075                  sshift = s_start;
1076                  sp--;
1077               }
1078               else
1079                  sshift += s_inc;
1080            }
1081            break;
1082         }
1083
1084         case 2:
1085         {
1086            png_bytep sp, dp;
1087            int sshift, dshift;
1088            int s_start, s_end, s_inc;
1089            png_uint_32 i;
1090
1091            sp = row + (png_size_t)((row_info->width - 1) >> 2);
1092            dp = row + (png_size_t)((final_width - 1) >> 2);
1093#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094            if (transformations & PNG_PACKSWAP)
1095            {
1096               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1097               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1098               s_start = 6;
1099               s_end = 0;
1100               s_inc = -2;
1101            }
1102            else
1103#endif
1104            {
1105               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1106               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1107               s_start = 0;
1108               s_end = 6;
1109               s_inc = 2;
1110            }
1111
1112            for (i = row_info->width; i; i--)
1113            {
1114               png_byte v;
1115               int j;
1116
1117               v = (png_byte)((*sp >> sshift) & 0x3);
1118               for (j = 0; j < png_pass_inc[pass]; j++)
1119               {
1120                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1121                  *dp |= (png_byte)(v << dshift);
1122                  if (dshift == s_end)
1123                  {
1124                     dshift = s_start;
1125                     dp--;
1126                  }
1127                  else
1128                     dshift += s_inc;
1129               }
1130               if (sshift == s_end)
1131               {
1132                  sshift = s_start;
1133                  sp--;
1134               }
1135               else
1136                  sshift += s_inc;
1137            }
1138            break;
1139         }
1140
1141         case 4:
1142         {
1143            png_bytep sp, dp;
1144            int sshift, dshift;
1145            int s_start, s_end, s_inc;
1146            png_uint_32 i;
1147
1148            sp = row + (png_size_t)((row_info->width - 1) >> 1);
1149            dp = row + (png_size_t)((final_width - 1) >> 1);
1150#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151            if (transformations & PNG_PACKSWAP)
1152            {
1153               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1154               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1155               s_start = 4;
1156               s_end = 0;
1157               s_inc = -4;
1158            }
1159            else
1160#endif
1161            {
1162               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1163               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1164               s_start = 0;
1165               s_end = 4;
1166               s_inc = 4;
1167            }
1168
1169            for (i = row_info->width; i; i--)
1170            {
1171               png_byte v;
1172               int j;
1173
1174               v = (png_byte)((*sp >> sshift) & 0xf);
1175               for (j = 0; j < png_pass_inc[pass]; j++)
1176               {
1177                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1178                  *dp |= (png_byte)(v << dshift);
1179                  if (dshift == s_end)
1180                  {
1181                     dshift = s_start;
1182                     dp--;
1183                  }
1184                  else
1185                     dshift += s_inc;
1186               }
1187               if (sshift == s_end)
1188               {
1189                  sshift = s_start;
1190                  sp--;
1191               }
1192               else
1193                  sshift += s_inc;
1194            }
1195            break;
1196         }
1197
1198         default:         /* This is the place where the routine is modified */
1199         {
1200            __int64 const4 = 0x0000000000FFFFFF;
1201            /* __int64 const5 = 0x000000FFFFFF0000;  // unused... */
1202            __int64 const6 = 0x00000000000000FF;
1203            png_bytep sptr, dp;
1204            png_uint_32 i;
1205            png_size_t pixel_bytes;
1206            int width = row_info->width;
1207
1208            pixel_bytes = (row_info->pixel_depth >> 3);
1209
1210            sptr = row + (width - 1) * pixel_bytes;
1211            dp = row + (final_width - 1) * pixel_bytes;
1212            /* New code by Nirav Chhatrapati - Intel Corporation */
1213            /* sign fix by GRR */
1214            /* NOTE:  there is NO MMX code for 48-bit and 64-bit images */
1215
1216            // use MMX routine if machine supports it
1217#if !defined(PNG_1_0_X)
1218            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1219                /* && mmx_supported */ )
1220#else
1221            if (mmx_supported)
1222#endif
1223            {
1224               if (pixel_bytes == 3)
1225               {
1226                  if (((pass == 0) || (pass == 1)) && width)
1227                  {
1228                     _asm
1229                     {
1230                        mov esi, sptr
1231                        mov edi, dp
1232                        mov ecx, width
1233                        sub edi, 21   /* (png_pass_inc[pass] - 1)*pixel_bytes */
1234loop_pass0:
1235                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1236                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1237                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1238                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1239                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1240                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1241                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1242                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1243                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1244                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
1245                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
1246                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
1247                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
1248                        movq [edi+16] , mm4
1249                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
1250                        movq [edi+8] , mm3
1251                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
1252                        sub esi, 3
1253                        movq [edi], mm0
1254                        sub edi, 24
1255                        /*sub esi, 3 */
1256                        dec ecx
1257                        jnz loop_pass0
1258                        EMMS
1259                     }
1260                  }
1261                  else if (((pass == 2) || (pass == 3)) && width)
1262                  {
1263                     _asm
1264                     {
1265                        mov esi, sptr
1266                        mov edi, dp
1267                        mov ecx, width
1268                        sub edi, 9   /* (png_pass_inc[pass] - 1)*pixel_bytes */
1269loop_pass2:
1270                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1271                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1272                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1273                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1274                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1275                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1276                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1277                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1278                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1279                        movq [edi+4], mm0   ; move to memory
1280                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
1281                        movd [edi], mm0     ; move to memory
1282                        sub esi, 3
1283                        sub edi, 12
1284                        dec ecx
1285                        jnz loop_pass2
1286                        EMMS
1287                     }
1288                  }
1289                  else if (width) /* && ((pass == 4) || (pass == 5)) */
1290                  {
1291                     int width_mmx = ((width >> 1) << 1) - 8;
1292                     if (width_mmx < 0)
1293                         width_mmx = 0;
1294                     width -= width_mmx;        /* 8 or 9 pix, 24 or 27 bytes */
1295                     if (width_mmx)
1296                     {
1297                        _asm
1298                        {
1299                           mov esi, sptr
1300                           mov edi, dp
1301                           mov ecx, width_mmx
1302                           sub esi, 3
1303                           sub edi, 9
1304loop_pass4:
1305                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
1306                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
1307                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
1308                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
1309                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
1310                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
1311                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
1312                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
1313                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
1314                           movq [edi], mm0     ; move quad to memory
1315                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
1316                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
1317                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
1318                           movd [edi+8], mm6   ; move double to memory
1319                           sub esi, 6
1320                           sub edi, 12
1321                           sub ecx, 2
1322                           jnz loop_pass4
1323                           EMMS
1324                        }
1325                     }
1326
1327                     sptr -= width_mmx*3;
1328                     dp -= width_mmx*6;
1329                     for (i = width; i; i--)
1330                     {
1331                        png_byte v[8];
1332                        int j;
1333
1334                        png_memcpy(v, sptr, 3);
1335                        for (j = 0; j < png_pass_inc[pass]; j++)
1336                        {
1337                           png_memcpy(dp, v, 3);
1338                           dp -= 3;
1339                        }
1340                        sptr -= 3;
1341                     }
1342                  }
1343               } /* end of pixel_bytes == 3 */
1344
1345               else if (pixel_bytes == 1)
1346               {
1347                  if (((pass == 0) || (pass == 1)) && width)
1348                  {
1349                     int width_mmx = ((width >> 2) << 2);
1350                     width -= width_mmx;
1351                     if (width_mmx)
1352                     {
1353                        _asm
1354                        {
1355                           mov esi, sptr
1356                           mov edi, dp
1357                           mov ecx, width_mmx
1358                           sub edi, 31
1359                           sub esi, 3
1360loop1_pass0:
1361                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1362                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
1363                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1364                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1365                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1366                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
1367                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
1368                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
1369                           movq [edi], mm0     ; move to memory v3
1370                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
1371                           movq [edi+8], mm3   ; move to memory v2
1372                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
1373                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
1374                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
1375                           movq [edi+16], mm2  ; move to memory v1
1376                           movq [edi+24], mm4  ; move to memory v0
1377                           sub esi, 4
1378                           sub edi, 32
1379                           sub ecx, 4
1380                           jnz loop1_pass0
1381                           EMMS
1382                        }
1383                     }
1384
1385                     sptr -= width_mmx;
1386                     dp -= width_mmx*8;
1387                     for (i = width; i; i--)
1388                     {
1389                        int j;
1390
1391                       /* I simplified this part in version 1.0.4e
1392                        * here and in several other instances where
1393                        * pixel_bytes == 1  -- GR-P
1394                        *
1395                        * Original code:
1396                        *
1397                        * png_byte v[8];
1398                        * png_memcpy(v, sptr, pixel_bytes);
1399                        * for (j = 0; j < png_pass_inc[pass]; j++)
1400                        * {
1401                        *    png_memcpy(dp, v, pixel_bytes);
1402                        *    dp -= pixel_bytes;
1403                        * }
1404                        * sptr -= pixel_bytes;
1405                        *
1406                        * Replacement code is in the next three lines:
1407                        */
1408
1409                        for (j = 0; j < png_pass_inc[pass]; j++)
1410                           *dp-- = *sptr;
1411                        sptr--;
1412                     }
1413                  }
1414                  else if (((pass == 2) || (pass == 3)) && width)
1415                  {
1416                     int width_mmx = ((width >> 2) << 2);
1417                     width -= width_mmx;
1418                     if (width_mmx)
1419                     {
1420                        _asm
1421                        {
1422                           mov esi, sptr
1423                           mov edi, dp
1424                           mov ecx, width_mmx
1425                           sub edi, 15
1426                           sub esi, 3
1427loop1_pass2:
1428                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1429                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1430                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1431                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1432                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
1433                           movq [edi], mm0     ; move to memory v2 and v3
1434                           sub esi, 4
1435                           movq [edi+8], mm1   ; move to memory v1     and v0
1436                           sub edi, 16
1437                           sub ecx, 4
1438                           jnz loop1_pass2
1439                           EMMS
1440                        }
1441                     }
1442
1443                     sptr -= width_mmx;
1444                     dp -= width_mmx*4;
1445                     for (i = width; i; i--)
1446                     {
1447                        int j;
1448
1449                        for (j = 0; j < png_pass_inc[pass]; j++)
1450                        {
1451                           *dp-- = *sptr;
1452                        }
1453                        sptr --;
1454                     }
1455                  }
1456                  else if (width) /* && ((pass == 4) || (pass == 5))) */
1457                  {
1458                     int width_mmx = ((width >> 3) << 3);
1459                     width -= width_mmx;
1460                     if (width_mmx)
1461                     {
1462                        _asm
1463                        {
1464                           mov esi, sptr
1465                           mov edi, dp
1466                           mov ecx, width_mmx
1467                           sub edi, 15
1468                           sub esi, 7
1469loop1_pass4:
1470                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
1471                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
1472                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
1473                           /*movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3 */
1474                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
1475                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
1476                           sub esi, 8
1477                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
1478                           /*sub esi, 4 */
1479                           sub edi, 16
1480                           sub ecx, 8
1481                           jnz loop1_pass4
1482                           EMMS
1483                        }
1484                     }
1485
1486                     sptr -= width_mmx;
1487                     dp -= width_mmx*2;
1488                     for (i = width; i; i--)
1489                     {
1490                        int j;
1491
1492                        for (j = 0; j < png_pass_inc[pass]; j++)
1493                        {
1494                           *dp-- = *sptr;
1495                        }
1496                        sptr --;
1497                     }
1498                  }
1499               } /* end of pixel_bytes == 1 */
1500
1501               else if (pixel_bytes == 2)
1502               {
1503                  if (((pass == 0) || (pass == 1)) && width)
1504                  {
1505                     int width_mmx = ((width >> 1) << 1);
1506                     width -= width_mmx;
1507                     if (width_mmx)
1508                     {
1509                        _asm
1510                        {
1511                           mov esi, sptr
1512                           mov edi, dp
1513                           mov ecx, width_mmx
1514                           sub esi, 2
1515                           sub edi, 30
1516loop2_pass0:
1517                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1518                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1519                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1520                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1521                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1522                           movq [edi], mm0
1523                           movq [edi + 8], mm0
1524                           movq [edi + 16], mm1
1525                           movq [edi + 24], mm1
1526                           sub esi, 4
1527                           sub edi, 32
1528                           sub ecx, 2
1529                           jnz loop2_pass0
1530                           EMMS
1531                        }
1532                     }
1533
1534                     sptr -= (width_mmx*2 - 2);            /* sign fixed */
1535                     dp -= (width_mmx*16 - 2);            /* sign fixed */
1536                     for (i = width; i; i--)
1537                     {
1538                        png_byte v[8];
1539                        int j;
1540                        sptr -= 2;
1541                        png_memcpy(v, sptr, 2);
1542                        for (j = 0; j < png_pass_inc[pass]; j++)
1543                        {
1544                           dp -= 2;
1545                           png_memcpy(dp, v, 2);
1546                        }
1547                     }
1548                  }
1549                  else if (((pass == 2) || (pass == 3)) && width)
1550                  {
1551                     int width_mmx = ((width >> 1) << 1) ;
1552                     width -= width_mmx;
1553                     if (width_mmx)
1554                     {
1555                        _asm
1556                        {
1557                           mov esi, sptr
1558                           mov edi, dp
1559                           mov ecx, width_mmx
1560                           sub esi, 2
1561                           sub edi, 14
1562loop2_pass2:
1563                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1564                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1565                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1566                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1567                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1568                           movq [edi], mm0
1569                           sub esi, 4
1570                           movq [edi + 8], mm1
1571                           /*sub esi, 4 */
1572                           sub edi, 16
1573                           sub ecx, 2
1574                           jnz loop2_pass2
1575                           EMMS
1576                        }
1577                     }
1578
1579                     sptr -= (width_mmx*2 - 2);            /* sign fixed */
1580                     dp -= (width_mmx*8 - 2);            /* sign fixed */
1581                     for (i = width; i; i--)
1582                     {
1583                        png_byte v[8];
1584                        int j;
1585                        sptr -= 2;
1586                        png_memcpy(v, sptr, 2);
1587                        for (j = 0; j < png_pass_inc[pass]; j++)
1588                        {
1589                           dp -= 2;
1590                           png_memcpy(dp, v, 2);
1591                        }
1592                     }
1593                  }
1594                  else if (width)  /* pass == 4 or 5 */
1595                  {
1596                     int width_mmx = ((width >> 1) << 1) ;
1597                     width -= width_mmx;
1598                     if (width_mmx)
1599                     {
1600                        _asm
1601                        {
1602                           mov esi, sptr
1603                           mov edi, dp
1604                           mov ecx, width_mmx
1605                           sub esi, 2
1606                           sub edi, 6
1607loop2_pass4:
1608                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1609                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1610                           sub esi, 4
1611                           movq [edi], mm0
1612                           sub edi, 8
1613                           sub ecx, 2
1614                           jnz loop2_pass4
1615                           EMMS
1616                        }
1617                     }
1618
1619                     sptr -= (width_mmx*2 - 2);            /* sign fixed */
1620                     dp -= (width_mmx*4 - 2);            /* sign fixed */
1621                     for (i = width; i; i--)
1622                     {
1623                        png_byte v[8];
1624                        int j;
1625                        sptr -= 2;
1626                        png_memcpy(v, sptr, 2);
1627                        for (j = 0; j < png_pass_inc[pass]; j++)
1628                        {
1629                           dp -= 2;
1630                           png_memcpy(dp, v, 2);
1631                        }
1632                     }
1633                  }
1634               } /* end of pixel_bytes == 2 */
1635
1636               else if (pixel_bytes == 4)
1637               {
1638                  if (((pass == 0) || (pass == 1)) && width)
1639                  {
1640                     int width_mmx = ((width >> 1) << 1) ;
1641                     width -= width_mmx;
1642                     if (width_mmx)
1643                     {
1644                        _asm
1645                        {
1646                           mov esi, sptr
1647                           mov edi, dp
1648                           mov ecx, width_mmx
1649                           sub esi, 4
1650                           sub edi, 60
1651loop4_pass0:
1652                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
1653                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
1654                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
1655                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
1656                           movq [edi], mm0
1657                           movq [edi + 8], mm0
1658                           movq [edi + 16], mm0
1659                           movq [edi + 24], mm0
1660                           movq [edi+32], mm1
1661                           movq [edi + 40], mm1
1662                           movq [edi+ 48], mm1
1663                           sub esi, 8
1664                           movq [edi + 56], mm1
1665                           sub edi, 64
1666                           sub ecx, 2
1667                           jnz loop4_pass0
1668                           EMMS
1669                        }
1670                     }
1671
1672                     sptr -= (width_mmx*4 - 4);            /* sign fixed */
1673                     dp -= (width_mmx*32 - 4);            /* sign fixed */
1674                     for (i = width; i; i--)
1675                     {
1676                        png_byte v[8];
1677                        int j;
1678                        sptr -= 4;
1679                        png_memcpy(v, sptr, 4);
1680                        for (j = 0; j < png_pass_inc[pass]; j++)
1681                        {
1682                           dp -= 4;
1683                           png_memcpy(dp, v, 4);
1684                        }
1685                     }
1686                  }
1687                  else if (((pass == 2) || (pass == 3)) && width)
1688                  {
1689                     int width_mmx = ((width >> 1) << 1) ;
1690                     width -= width_mmx;
1691                     if (width_mmx)
1692                     {
1693                        _asm
1694                        {
1695                           mov esi, sptr
1696                           mov edi, dp
1697                           mov ecx, width_mmx
1698                           sub esi, 4
1699                           sub edi, 28
1700loop4_pass2:
1701                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1702                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1703                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1704                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1705                           movq [edi], mm0
1706                           movq [edi + 8], mm0
1707                           movq [edi+16], mm1
1708                           movq [edi + 24], mm1
1709                           sub esi, 8
1710                           sub edi, 32
1711                           sub ecx, 2
1712                           jnz loop4_pass2
1713                           EMMS
1714                        }
1715                     }
1716
1717                     sptr -= (width_mmx*4 - 4);            /* sign fixed */
1718                     dp -= (width_mmx*16 - 4);            /* sign fixed */
1719                     for (i = width; i; i--)
1720                     {
1721                        png_byte v[8];
1722                        int j;
1723                        sptr -= 4;
1724                        png_memcpy(v, sptr, 4);
1725                        for (j = 0; j < png_pass_inc[pass]; j++)
1726                        {
1727                           dp -= 4;
1728                           png_memcpy(dp, v, 4);
1729                        }
1730                     }
1731                  }
1732                  else if (width)  /* pass == 4 or 5 */
1733                  {
1734                     int width_mmx = ((width >> 1) << 1) ;
1735                     width -= width_mmx;
1736                     if (width_mmx)
1737                     {
1738                        _asm
1739                        {
1740                           mov esi, sptr
1741                           mov edi, dp
1742                           mov ecx, width_mmx
1743                           sub esi, 4
1744                           sub edi, 12
1745loop4_pass4:
1746                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1747                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1748                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1749                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1750                           movq [edi], mm0
1751                           sub esi, 8
1752                           movq [edi + 8], mm1
1753                           sub edi, 16
1754                           sub ecx, 2
1755                           jnz loop4_pass4
1756                           EMMS
1757                        }
1758                     }
1759
1760                     sptr -= (width_mmx*4 - 4);          /* sign fixed */
1761                     dp -= (width_mmx*8 - 4);            /* sign fixed */
1762                     for (i = width; i; i--)
1763                     {
1764                        png_byte v[8];
1765                        int j;
1766                        sptr -= 4;
1767                        png_memcpy(v, sptr, 4);
1768                        for (j = 0; j < png_pass_inc[pass]; j++)
1769                        {
1770                           dp -= 4;
1771                           png_memcpy(dp, v, 4);
1772                        }
1773                     }
1774                  }
1775
1776               } /* end of pixel_bytes == 4 */
1777
1778               else if (pixel_bytes == 6)
1779               {
1780                  for (i = width; i; i--)
1781                  {
1782                     png_byte v[8];
1783                     int j;
1784                     png_memcpy(v, sptr, 6);
1785                     for (j = 0; j < png_pass_inc[pass]; j++)
1786                     {
1787                        png_memcpy(dp, v, 6);
1788                        dp -= 6;
1789                     }
1790                     sptr -= 6;
1791                  }
1792               } /* end of pixel_bytes == 6 */
1793
1794               else
1795               {
1796                  for (i = width; i; i--)
1797                  {
1798                     png_byte v[8];
1799                     int j;
1800                     png_memcpy(v, sptr, pixel_bytes);
1801                     for (j = 0; j < png_pass_inc[pass]; j++)
1802                     {
1803                        png_memcpy(dp, v, pixel_bytes);
1804                        dp -= pixel_bytes;
1805                     }
1806                     sptr-= pixel_bytes;
1807                  }
1808               }
1809            } /* end of mmx_supported */
1810
1811            else /* MMX not supported:  use modified C code - takes advantage
1812                  * of inlining of memcpy for a constant */
1813            {
1814               if (pixel_bytes == 1)
1815               {
1816                  for (i = width; i; i--)
1817                  {
1818                     int j;
1819                     for (j = 0; j < png_pass_inc[pass]; j++)
1820                        *dp-- = *sptr;
1821                     sptr--;
1822                  }
1823               }
1824               else if (pixel_bytes == 3)
1825               {
1826                  for (i = width; i; i--)
1827                  {
1828                     png_byte v[8];
1829                     int j;
1830                     png_memcpy(v, sptr, pixel_bytes);
1831                     for (j = 0; j < png_pass_inc[pass]; j++)
1832                     {
1833                        png_memcpy(dp, v, pixel_bytes);
1834                        dp -= pixel_bytes;
1835                     }
1836                     sptr -= pixel_bytes;
1837                  }
1838               }
1839               else if (pixel_bytes == 2)
1840               {
1841                  for (i = width; i; i--)
1842                  {
1843                     png_byte v[8];
1844                     int j;
1845                     png_memcpy(v, sptr, pixel_bytes);
1846                     for (j = 0; j < png_pass_inc[pass]; j++)
1847                     {
1848                        png_memcpy(dp, v, pixel_bytes);
1849                        dp -= pixel_bytes;
1850                     }
1851                     sptr -= pixel_bytes;
1852                  }
1853               }
1854               else if (pixel_bytes == 4)
1855               {
1856                  for (i = width; i; i--)
1857                  {
1858                     png_byte v[8];
1859                     int j;
1860                     png_memcpy(v, sptr, pixel_bytes);
1861                     for (j = 0; j < png_pass_inc[pass]; j++)
1862                     {
1863                        png_memcpy(dp, v, pixel_bytes);
1864                        dp -= pixel_bytes;
1865                     }
1866                     sptr -= pixel_bytes;
1867                  }
1868               }
1869               else if (pixel_bytes == 6)
1870               {
1871                  for (i = width; i; i--)
1872                  {
1873                     png_byte v[8];
1874                     int j;
1875                     png_memcpy(v, sptr, pixel_bytes);
1876                     for (j = 0; j < png_pass_inc[pass]; j++)
1877                     {
1878                        png_memcpy(dp, v, pixel_bytes);
1879                        dp -= pixel_bytes;
1880                     }
1881                     sptr -= pixel_bytes;
1882                  }
1883               }
1884               else
1885               {
1886                  for (i = width; i; i--)
1887                  {
1888                     png_byte v[8];
1889                     int j;
1890                     png_memcpy(v, sptr, pixel_bytes);
1891                     for (j = 0; j < png_pass_inc[pass]; j++)
1892                     {
1893                        png_memcpy(dp, v, pixel_bytes);
1894                        dp -= pixel_bytes;
1895                     }
1896                     sptr -= pixel_bytes;
1897                  }
1898               }
1899
1900            } /* end of MMX not supported */
1901            break;
1902         }
1903      } /* end switch (row_info->pixel_depth) */
1904
1905      row_info->width = final_width;
1906
1907      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1908   }
1909
1910}
1911
1912#endif /* PNG_READ_INTERLACING_SUPPORTED */
1913
1914
1915/* These variables are utilized in the functions below.  They are declared */
1916/* globally here to ensure alignment on 8-byte boundaries. */
1917
1918union uAll {
1919   __int64 use;
1920   double  align;
1921} LBCarryMask = {0x0101010101010101},
1922  HBClearMask = {0x7f7f7f7f7f7f7f7f},
1923  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1924
1925
1926/* Optimized code for PNG Average filter decoder */
1927void /* PRIVATE */
1928png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1929                            , png_bytep prev_row)
1930{
1931   int bpp;
1932   png_uint_32 FullLength;
1933   png_uint_32 MMXLength;
1934   /*png_uint_32 len; */
1935   int diff;
1936
1937   bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
1938   FullLength  = row_info->rowbytes; /* # of bytes to filter */
1939   _asm {
1940         /* Init address pointers and offset */
1941         mov edi, row          /* edi ==> Avg(x) */
1942         xor ebx, ebx          /* ebx ==> x */
1943         mov edx, edi
1944         mov esi, prev_row           /* esi ==> Prior(x) */
1945         sub edx, bpp          /* edx ==> Raw(x-bpp) */
1946
1947         xor eax, eax
1948         /* Compute the Raw value for the first bpp bytes */
1949         /*    Raw(x) = Avg(x) + (Prior(x)/2) */
1950davgrlp:
1951         mov al, [esi + ebx]   /* Load al with Prior(x) */
1952         inc ebx
1953         shr al, 1             /* divide by 2 */
1954         add al, [edi+ebx-1]   /* Add Avg(x); -1 to offset inc ebx */
1955         cmp ebx, bpp
1956         mov [edi+ebx-1], al    /* Write back Raw(x); */
1957                            /* mov does not affect flags; -1 to offset inc ebx */
1958         jb davgrlp
1959         /* get # of bytes to alignment */
1960         mov diff, edi         /* take start of row */
1961         add diff, ebx         /* add bpp */
1962         add diff, 0xf         /* add 7 + 8 to incr past alignment boundary */
1963         and diff, 0xfffffff8  /* mask to alignment boundary */
1964         sub diff, edi         /* subtract from start ==> value ebx at alignment */
1965         jz davggo
1966         /* fix alignment */
1967         /* Compute the Raw value for the bytes upto the alignment boundary */
1968         /*    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
1969         xor ecx, ecx
1970davglp1:
1971         xor eax, eax
1972         mov cl, [esi + ebx]        /* load cl with Prior(x) */
1973         mov al, [edx + ebx]  /* load al with Raw(x-bpp) */
1974         add ax, cx
1975         inc ebx
1976         shr ax, 1            /* divide by 2 */
1977         add al, [edi+ebx-1]  /* Add Avg(x); -1 to offset inc ebx */
1978         cmp ebx, diff              /* Check if at alignment boundary */
1979         mov [edi+ebx-1], al        /* Write back Raw(x); */
1980                            /* mov does not affect flags; -1 to offset inc ebx */
1981         jb davglp1               /* Repeat until at alignment boundary */
1982davggo:
1983         mov eax, FullLength
1984         mov ecx, eax
1985         sub eax, ebx          /* subtract alignment fix */
1986         and eax, 0x00000007   /* calc bytes over mult of 8 */
1987         sub ecx, eax          /* drop over bytes from original length */
1988         mov MMXLength, ecx
1989   } /* end _asm block */
1990   /* Now do the math for the rest of the row */
1991   switch ( bpp )
1992   {
1993      case 3:
1994      {
1995         ActiveMask.use  = 0x0000000000ffffff;
1996         ShiftBpp.use = 24;    /* == 3 * 8 */
1997         ShiftRem.use = 40;    /* == 64 - 24 */
1998         _asm {
1999            /* Re-init address pointers and offset */
2000            movq mm7, ActiveMask
2001            mov ebx, diff      /* ebx ==> x = offset to alignment boundary */
2002            movq mm5, LBCarryMask
2003            mov edi, row       /* edi ==> Avg(x) */
2004            movq mm4, HBClearMask
2005            mov esi, prev_row        /* esi ==> Prior(x) */
2006            /* PRIME the pump (load the first Raw(x-bpp) data set */
2007            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
2008                               /* (we correct position in loop below) */
2009davg3lp:
2010            movq mm0, [edi + ebx]      /* Load mm0 with Avg(x) */
2011            /* Add (Prev_row/2) to Average */
2012            movq mm3, mm5
2013            psrlq mm2, ShiftRem      /* Correct position Raw(x-bpp) data */
2014            movq mm1, [esi + ebx]    /* Load mm1 with Prior(x) */
2015            movq mm6, mm7
2016            pand mm3, mm1      /* get lsb for each prev_row byte */
2017            psrlq mm1, 1       /* divide prev_row bytes by 2 */
2018            pand  mm1, mm4     /* clear invalid bit 7 of each byte */
2019            paddb mm0, mm1     /* add (Prev_row/2) to Avg for each byte */
2020            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2021            movq mm1, mm3      /* now use mm1 for getting LBCarrys */
2022            pand mm1, mm2      /* get LBCarrys for each byte where both */
2023                               /* lsb's were == 1 (Only valid for active group) */
2024            psrlq mm2, 1       /* divide raw bytes by 2 */
2025            pand  mm2, mm4     /* clear invalid bit 7 of each byte */
2026            paddb mm2, mm1     /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2027            pand mm2, mm6      /* Leave only Active Group 1 bytes to add to Avg */
2028            paddb mm0, mm2     /* add (Raw/2) + LBCarrys to Avg for each Active */
2029                               /*  byte */
2030            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2031            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 3-5 */
2032            movq mm2, mm0        /* mov updated Raws to mm2 */
2033            psllq mm2, ShiftBpp  /* shift data to position correctly */
2034            movq mm1, mm3        /* now use mm1 for getting LBCarrys */
2035            pand mm1, mm2      /* get LBCarrys for each byte where both */
2036                               /* lsb's were == 1 (Only valid for active group) */
2037            psrlq mm2, 1       /* divide raw bytes by 2 */
2038            pand  mm2, mm4     /* clear invalid bit 7 of each byte */
2039            paddb mm2, mm1     /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2040            pand mm2, mm6      /* Leave only Active Group 2 bytes to add to Avg */
2041            paddb mm0, mm2     /* add (Raw/2) + LBCarrys to Avg for each Active */
2042                               /*  byte */
2043
2044            /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2045            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover the last two */
2046                                 /* bytes */
2047            movq mm2, mm0        /* mov updated Raws to mm2 */
2048            psllq mm2, ShiftBpp  /* shift data to position correctly */
2049                              /* Data only needs to be shifted once here to */
2050                              /* get the correct x-bpp offset. */
2051            movq mm1, mm3     /* now use mm1 for getting LBCarrys */
2052            pand mm1, mm2     /* get LBCarrys for each byte where both */
2053                              /* lsb's were == 1 (Only valid for active group) */
2054            psrlq mm2, 1      /* divide raw bytes by 2 */
2055            pand  mm2, mm4    /* clear invalid bit 7 of each byte */
2056            paddb mm2, mm1    /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2057            pand mm2, mm6     /* Leave only Active Group 2 bytes to add to Avg */
2058            add ebx, 8
2059            paddb mm0, mm2    /* add (Raw/2) + LBCarrys to Avg for each Active */
2060                              /* byte */
2061
2062            /* Now ready to write back to memory */
2063            movq [edi + ebx - 8], mm0
2064            /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
2065            cmp ebx, MMXLength
2066            movq mm2, mm0     /* mov updated Raw(x) to mm2 */
2067            jb davg3lp
2068         } /* end _asm block */
2069      }
2070      break;
2071
2072      case 6:
2073      case 4:
2074      case 7:
2075      case 5:
2076      {
2077         ActiveMask.use  = 0xffffffffffffffff;  /* use shift below to clear */
2078                                                /* appropriate inactive bytes */
2079         ShiftBpp.use = bpp << 3;
2080         ShiftRem.use = 64 - ShiftBpp.use;
2081         _asm {
2082            movq mm4, HBClearMask
2083            /* Re-init address pointers and offset */
2084            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
2085            /* Load ActiveMask and clear all bytes except for 1st active group */
2086            movq mm7, ActiveMask
2087            mov edi, row         /* edi ==> Avg(x) */
2088            psrlq mm7, ShiftRem
2089            mov esi, prev_row    /* esi ==> Prior(x) */
2090            movq mm6, mm7
2091            movq mm5, LBCarryMask
2092            psllq mm6, ShiftBpp  /* Create mask for 2nd active group */
2093            /* PRIME the pump (load the first Raw(x-bpp) data set */
2094            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
2095                                 /* (we correct position in loop below) */
2096davg4lp:
2097            movq mm0, [edi + ebx]
2098            psrlq mm2, ShiftRem  /* shift data to position correctly */
2099            movq mm1, [esi + ebx]
2100            /* Add (Prev_row/2) to Average */
2101            movq mm3, mm5
2102            pand mm3, mm1     /* get lsb for each prev_row byte */
2103            psrlq mm1, 1      /* divide prev_row bytes by 2 */
2104            pand  mm1, mm4    /* clear invalid bit 7 of each byte */
2105            paddb mm0, mm1    /* add (Prev_row/2) to Avg for each byte */
2106            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2107            movq mm1, mm3     /* now use mm1 for getting LBCarrys */
2108            pand mm1, mm2     /* get LBCarrys for each byte where both */
2109                              /* lsb's were == 1 (Only valid for active group) */
2110            psrlq mm2, 1      /* divide raw bytes by 2 */
2111            pand  mm2, mm4    /* clear invalid bit 7 of each byte */
2112            paddb mm2, mm1    /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2113            pand mm2, mm7     /* Leave only Active Group 1 bytes to add to Avg */
2114            paddb mm0, mm2    /* add (Raw/2) + LBCarrys to Avg for each Active */
2115                              /* byte */
2116            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2117            movq mm2, mm0     /* mov updated Raws to mm2 */
2118            psllq mm2, ShiftBpp /* shift data to position correctly */
2119            add ebx, 8
2120            movq mm1, mm3     /* now use mm1 for getting LBCarrys */
2121            pand mm1, mm2     /* get LBCarrys for each byte where both */
2122                              /* lsb's were == 1 (Only valid for active group) */
2123            psrlq mm2, 1      /* divide raw bytes by 2 */
2124            pand  mm2, mm4    /* clear invalid bit 7 of each byte */
2125            paddb mm2, mm1    /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2126            pand mm2, mm6     /* Leave only Active Group 2 bytes to add to Avg */
2127            paddb mm0, mm2    /* add (Raw/2) + LBCarrys to Avg for each Active */
2128                              /* byte */
2129            cmp ebx, MMXLength
2130            /* Now ready to write back to memory */
2131            movq [edi + ebx - 8], mm0
2132            /* Prep Raw(x-bpp) for next loop */
2133            movq mm2, mm0     /* mov updated Raws to mm2 */
2134            jb davg4lp
2135         } /* end _asm block */
2136      }
2137      break;
2138      case 2:
2139      {
2140         ActiveMask.use  = 0x000000000000ffff;
2141         ShiftBpp.use = 16;   /* == 2 * 8     [BUGFIX] */
2142         ShiftRem.use = 48;   /* == 64 - 16   [BUGFIX] */
2143         _asm {
2144            /* Load ActiveMask */
2145            movq mm7, ActiveMask
2146            /* Re-init address pointers and offset */
2147            mov ebx, diff     /* ebx ==> x = offset to alignment boundary */
2148            movq mm5, LBCarryMask
2149            mov edi, row      /* edi ==> Avg(x) */
2150            movq mm4, HBClearMask
2151            mov esi, prev_row  /* esi ==> Prior(x) */
2152            /* PRIME the pump (load the first Raw(x-bpp) data set */
2153            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
2154                              /* (we correct position in loop below) */
2155davg2lp:
2156            movq mm0, [edi + ebx]
2157            psrlq mm2, ShiftRem  /* shift data to position correctly   [BUGFIX] */
2158            movq mm1, [esi + ebx]
2159            /* Add (Prev_row/2) to Average */
2160            movq mm3, mm5
2161            pand mm3, mm1     /* get lsb for each prev_row byte */
2162            psrlq mm1, 1      /* divide prev_row bytes by 2 */
2163            pand  mm1, mm4    /* clear invalid bit 7 of each byte */
2164            movq mm6, mm7
2165            paddb mm0, mm1    /* add (Prev_row/2) to Avg for each byte */
2166            /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2167            movq mm1, mm3     /* now use mm1 for getting LBCarrys */
2168            pand mm1, mm2     /* get LBCarrys for each byte where both */
2169                              /* lsb's were == 1 (Only valid for active group) */
2170            psrlq mm2, 1      /* divide raw bytes by 2 */
2171            pand  mm2, mm4    /* clear invalid bit 7 of each byte */
2172            paddb mm2, mm1    /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2173            pand mm2, mm6     /* Leave only Active Group 1 bytes to add to Avg */
2174            paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2175            /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2176            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 2 & 3 */
2177            movq mm2, mm0       /* mov updated Raws to mm2 */
2178            psllq mm2, ShiftBpp /* shift data to position correctly */
2179            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
2180            pand mm1, mm2       /* get LBCarrys for each byte where both */
2181                                /* lsb's were == 1 (Only valid for active group) */
2182            psrlq mm2, 1        /* divide raw bytes by 2 */
2183            pand  mm2, mm4      /* clear invalid bit 7 of each byte */
2184            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2185            pand mm2, mm6       /* Leave only Active Group 2 bytes to add to Avg */
2186            paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2187
2188            /* Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry */
2189            psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 4 & 5 */
2190            movq mm2, mm0       /* mov updated Raws to mm2 */
2191            psllq mm2, ShiftBpp /* shift data to position correctly */
2192                                /* Data only needs to be shifted once here to */
2193                                /* get the correct x-bpp offset. */
2194            movq mm1, mm3       /* now use mm1 for getting LBCarrys */
2195            pand mm1, mm2       /* get LBCarrys for each byte where both */
2196                                /* lsb's were == 1 (Only valid for active group) */
2197            psrlq mm2, 1        /* divide raw bytes by 2 */
2198            pand  mm2, mm4      /* clear invalid bit 7 of each byte */
2199            paddb mm2, mm1      /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2200            pand mm2, mm6       /* Leave only Active Group 2 bytes to add to Avg */
2201            paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2202
2203            /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
2204            psllq mm6, ShiftBpp  /* shift the mm6 mask to cover bytes 6 & 7 */
2205            movq mm2, mm0        /* mov updated Raws to mm2 */
2206            psllq mm2, ShiftBpp  /* shift data to position correctly */
2207                                 /* Data only needs to be shifted once here to */
2208                                 /* get the correct x-bpp offset. */
2209            add ebx, 8
2210            movq mm1, mm3    /* now use mm1 for getting LBCarrys */
2211            pand mm1, mm2    /* get LBCarrys for each byte where both */
2212                             /* lsb's were == 1 (Only valid for active group) */
2213            psrlq mm2, 1     /* divide raw bytes by 2 */
2214            pand  mm2, mm4   /* clear invalid bit 7 of each byte */
2215            paddb mm2, mm1   /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2216            pand mm2, mm6    /* Leave only Active Group 2 bytes to add to Avg */
2217            paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2218
2219            cmp ebx, MMXLength
2220            /* Now ready to write back to memory */
2221            movq [edi + ebx - 8], mm0
2222            /* Prep Raw(x-bpp) for next loop */
2223            movq mm2, mm0    /* mov updated Raws to mm2 */
2224            jb davg2lp
2225        } /* end _asm block */
2226      }
2227      break;
2228
2229      case 1:                 /* bpp == 1 */
2230      {
2231         _asm {
2232            /* Re-init address pointers and offset */
2233            mov ebx, diff     /* ebx ==> x = offset to alignment boundary */
2234            mov edi, row      /* edi ==> Avg(x) */
2235            cmp ebx, FullLength  /* Test if offset at end of array */
2236            jnb davg1end
2237            /* Do Paeth decode for remaining bytes */
2238            mov esi, prev_row    /* esi ==> Prior(x) */
2239            mov edx, edi
2240            xor ecx, ecx         /* zero ecx before using cl & cx in loop below */
2241            sub edx, bpp         /* edx ==> Raw(x-bpp) */
2242davg1lp:
2243            /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2244            xor eax, eax
2245            mov cl, [esi + ebx]  /* load cl with Prior(x) */
2246            mov al, [edx + ebx]  /* load al with Raw(x-bpp) */
2247            add ax, cx
2248            inc ebx
2249            shr ax, 1            /* divide by 2 */
2250            add al, [edi+ebx-1]  /* Add Avg(x); -1 to offset inc ebx */
2251            cmp ebx, FullLength  /* Check if at end of array */
2252            mov [edi+ebx-1], al  /* Write back Raw(x); */
2253                         /* mov does not affect flags; -1 to offset inc ebx */
2254            jb davg1lp
2255davg1end:
2256         } /* end _asm block */
2257      }
2258      return;
2259
2260      case 8:             /* bpp == 8 */
2261      {
2262         _asm {
2263            /* Re-init address pointers and offset */
2264            mov ebx, diff           /* ebx ==> x = offset to alignment boundary */
2265            movq mm5, LBCarryMask
2266            mov edi, row            /* edi ==> Avg(x) */
2267            movq mm4, HBClearMask
2268            mov esi, prev_row       /* esi ==> Prior(x) */
2269            /* PRIME the pump (load the first Raw(x-bpp) data set */
2270            movq mm2, [edi + ebx - 8]  /* Load previous aligned 8 bytes */
2271                                /* (NO NEED to correct position in loop below) */
2272davg8lp:
2273            movq mm0, [edi + ebx]
2274            movq mm3, mm5
2275            movq mm1, [esi + ebx]
2276            add ebx, 8
2277            pand mm3, mm1       /* get lsb for each prev_row byte */
2278            psrlq mm1, 1        /* divide prev_row bytes by 2 */
2279            pand mm3, mm2       /* get LBCarrys for each byte where both */
2280                                /* lsb's were == 1 */
2281            psrlq mm2, 1        /* divide raw bytes by 2 */
2282            pand  mm1, mm4      /* clear invalid bit 7 of each byte */
2283            paddb mm0, mm3      /* add LBCarrys to Avg for each byte */
2284            pand  mm2, mm4      /* clear invalid bit 7 of each byte */
2285            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
2286            paddb mm0, mm2      /* add (Raw/2) to Avg for each byte */
2287            cmp ebx, MMXLength
2288            movq [edi + ebx - 8], mm0
2289            movq mm2, mm0       /* reuse as Raw(x-bpp) */
2290            jb davg8lp
2291        } /* end _asm block */
2292      }
2293      break;
2294      default:                  /* bpp greater than 8 */
2295      {
2296        _asm {
2297            movq mm5, LBCarryMask
2298            /* Re-init address pointers and offset */
2299            mov ebx, diff       /* ebx ==> x = offset to alignment boundary */
2300            mov edi, row        /* edi ==> Avg(x) */
2301            movq mm4, HBClearMask
2302            mov edx, edi
2303            mov esi, prev_row   /* esi ==> Prior(x) */
2304            sub edx, bpp        /* edx ==> Raw(x-bpp) */
2305davgAlp:
2306            movq mm0, [edi + ebx]
2307            movq mm3, mm5
2308            movq mm1, [esi + ebx]
2309            pand mm3, mm1       /* get lsb for each prev_row byte */
2310            movq mm2, [edx + ebx]
2311            psrlq mm1, 1        /* divide prev_row bytes by 2 */
2312            pand mm3, mm2       /* get LBCarrys for each byte where both */
2313                                /* lsb's were == 1 */
2314            psrlq mm2, 1        /* divide raw bytes by 2 */
2315            pand  mm1, mm4      /* clear invalid bit 7 of each byte */
2316            paddb mm0, mm3      /* add LBCarrys to Avg for each byte */
2317            pand  mm2, mm4      /* clear invalid bit 7 of each byte */
2318            paddb mm0, mm1      /* add (Prev_row/2) to Avg for each byte */
2319            add ebx, 8
2320            paddb mm0, mm2      /* add (Raw/2) to Avg for each byte */
2321            cmp ebx, MMXLength
2322            movq [edi + ebx - 8], mm0
2323            jb davgAlp
2324        } /* end _asm block */
2325      }
2326      break;
2327   }                         /* end switch ( bpp ) */
2328
2329   _asm {
2330         /* MMX acceleration complete now do clean-up */
2331         /* Check if any remaining bytes left to decode */
2332         mov ebx, MMXLength    /* ebx ==> x = offset bytes remaining after MMX */
2333         mov edi, row          /* edi ==> Avg(x) */
2334         cmp ebx, FullLength   /* Test if offset at end of array */
2335         jnb davgend
2336         /* Do Paeth decode for remaining bytes */
2337         mov esi, prev_row     /* esi ==> Prior(x) */
2338         mov edx, edi
2339         xor ecx, ecx          /* zero ecx before using cl & cx in loop below */
2340         sub edx, bpp          /* edx ==> Raw(x-bpp) */
2341davglp2:
2342         /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2343         xor eax, eax
2344         mov cl, [esi + ebx]   /* load cl with Prior(x) */
2345         mov al, [edx + ebx]   /* load al with Raw(x-bpp) */
2346         add ax, cx
2347         inc ebx
2348         shr ax, 1              /* divide by 2 */
2349         add al, [edi+ebx-1]    /* Add Avg(x); -1 to offset inc ebx */
2350         cmp ebx, FullLength    /* Check if at end of array */
2351         mov [edi+ebx-1], al    /* Write back Raw(x); */
2352                          /* mov does not affect flags; -1 to offset inc ebx */
2353         jb davglp2
2354davgend:
2355         emms             /* End MMX instructions; prep for possible FP instrs. */
2356   } /* end _asm block */
2357}
2358
2359/* Optimized code for PNG Paeth filter decoder */
2360void /* PRIVATE */
2361png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2362                              png_bytep prev_row)
2363{
2364   png_uint_32 FullLength;
2365   png_uint_32 MMXLength;
2366   /*png_uint_32 len; */
2367   int bpp;
2368   int diff;
2369   /*int ptemp; */
2370   int patemp, pbtemp, pctemp;
2371
2372   bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
2373   FullLength  = row_info->rowbytes; /* # of bytes to filter */
2374   _asm
2375   {
2376         xor ebx, ebx        /* ebx ==> x offset */
2377         mov edi, row
2378         xor edx, edx        /* edx ==> x-bpp offset */
2379         mov esi, prev_row
2380         xor eax, eax
2381
2382         /* Compute the Raw value for the first bpp bytes */
2383         /* Note: the formula works out to be always */
2384         /*   Paeth(x) = Raw(x) + Prior(x)      where x < bpp */
2385dpthrlp:
2386         mov al, [edi + ebx]
2387         add al, [esi + ebx]
2388         inc ebx
2389         cmp ebx, bpp
2390         mov [edi + ebx - 1], al
2391         jb dpthrlp
2392         /* get # of bytes to alignment */
2393         mov diff, edi         /* take start of row */
2394         add diff, ebx         /* add bpp */
2395         xor ecx, ecx
2396         add diff, 0xf         /* add 7 + 8 to incr past alignment boundary */
2397         and diff, 0xfffffff8  /* mask to alignment boundary */
2398         sub diff, edi         /* subtract from start ==> value ebx at alignment */
2399         jz dpthgo
2400         /* fix alignment */
2401dpthlp1:
2402         xor eax, eax
2403         /* pav = p - a = (a + b - c) - a = b - c */
2404         mov al, [esi + ebx]   /* load Prior(x) into al */
2405         mov cl, [esi + edx]   /* load Prior(x-bpp) into cl */
2406         sub eax, ecx          /* subtract Prior(x-bpp) */
2407         mov patemp, eax       /* Save pav for later use */
2408         xor eax, eax
2409         /* pbv = p - b = (a + b - c) - b = a - c */
2410         mov al, [edi + edx]   /* load Raw(x-bpp) into al */
2411         sub eax, ecx          /* subtract Prior(x-bpp) */
2412         mov ecx, eax
2413         /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2414         add eax, patemp       /* pcv = pav + pbv */
2415         /* pc = abs(pcv) */
2416         test eax, 0x80000000
2417         jz dpthpca
2418         neg eax               /* reverse sign of neg values */
2419dpthpca:
2420         mov pctemp, eax       /* save pc for later use */
2421         /* pb = abs(pbv) */
2422         test ecx, 0x80000000
2423         jz dpthpba
2424         neg ecx               /* reverse sign of neg values */
2425dpthpba:
2426         mov pbtemp, ecx       /* save pb for later use */
2427         /* pa = abs(pav) */
2428         mov eax, patemp
2429         test eax, 0x80000000
2430         jz dpthpaa
2431         neg eax               /* reverse sign of neg values */
2432dpthpaa:
2433         mov patemp, eax       /* save pa for later use */
2434         /* test if pa <= pb */
2435         cmp eax, ecx
2436         jna dpthabb
2437         /* pa > pb; now test if pb <= pc */
2438         cmp ecx, pctemp
2439         jna dpthbbc
2440         /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2441         mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
2442         jmp dpthpaeth
2443dpthbbc:
2444         /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
2445         mov cl, [esi + ebx]   /* load Prior(x) into cl */
2446         jmp dpthpaeth
2447dpthabb:
2448         /* pa <= pb; now test if pa <= pc */
2449         cmp eax, pctemp
2450         jna dpthabc
2451         /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2452         mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
2453         jmp dpthpaeth
2454dpthabc:
2455         /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
2456         mov cl, [edi + edx]  /* load Raw(x-bpp) into cl */
2457dpthpaeth:
2458         inc ebx
2459         inc edx
2460         /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
2461         add [edi + ebx - 1], cl
2462         cmp ebx, diff
2463         jb dpthlp1
2464dpthgo:
2465         mov ecx, FullLength
2466         mov eax, ecx
2467         sub eax, ebx          /* subtract alignment fix */
2468         and eax, 0x00000007   /* calc bytes over mult of 8 */
2469         sub ecx, eax          /* drop over bytes from original length */
2470         mov MMXLength, ecx
2471   } /* end _asm block */
2472   /* Now do the math for the rest of the row */
2473   switch ( bpp )
2474   {
2475      case 3:
2476      {
2477         ActiveMask.use = 0x0000000000ffffff;
2478         ActiveMaskEnd.use = 0xffff000000000000;
2479         ShiftBpp.use = 24;    /* == bpp(3) * 8 */
2480         ShiftRem.use = 40;    /* == 64 - 24 */
2481         _asm
2482         {
2483            mov ebx, diff
2484            mov edi, row
2485            mov esi, prev_row
2486            pxor mm0, mm0
2487            /* PRIME the pump (load the first Raw(x-bpp) data set */
2488            movq mm1, [edi+ebx-8]
2489dpth3lp:
2490            psrlq mm1, ShiftRem     /* shift last 3 bytes to 1st 3 bytes */
2491            movq mm2, [esi + ebx]   /* load b=Prior(x) */
2492            punpcklbw mm1, mm0      /* Unpack High bytes of a */
2493            movq mm3, [esi+ebx-8]   /* Prep c=Prior(x-bpp) bytes */
2494            punpcklbw mm2, mm0      /* Unpack High bytes of b */
2495            psrlq mm3, ShiftRem     /* shift last 3 bytes to 1st 3 bytes */
2496            /* pav = p - a = (a + b - c) - a = b - c */
2497            movq mm4, mm2
2498            punpcklbw mm3, mm0      /* Unpack High bytes of c */
2499            /* pbv = p - b = (a + b - c) - b = a - c */
2500            movq mm5, mm1
2501            psubw mm4, mm3
2502            pxor mm7, mm7
2503            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2504            movq mm6, mm4
2505            psubw mm5, mm3
2506
2507            /* pa = abs(p-a) = abs(pav) */
2508            /* pb = abs(p-b) = abs(pbv) */
2509            /* pc = abs(p-c) = abs(pcv) */
2510            pcmpgtw mm0, mm4    /* Create mask pav bytes < 0 */
2511            paddw mm6, mm5
2512            pand mm0, mm4       /* Only pav bytes < 0 in mm7 */
2513            pcmpgtw mm7, mm5    /* Create mask pbv bytes < 0 */
2514            psubw mm4, mm0
2515            pand mm7, mm5       /* Only pbv bytes < 0 in mm0 */
2516            psubw mm4, mm0
2517            psubw mm5, mm7
2518            pxor mm0, mm0
2519            pcmpgtw mm0, mm6    /* Create mask pcv bytes < 0 */
2520            pand mm0, mm6       /* Only pav bytes < 0 in mm7 */
2521            psubw mm5, mm7
2522            psubw mm6, mm0
2523            /*  test pa <= pb */
2524            movq mm7, mm4
2525            psubw mm6, mm0
2526            pcmpgtw mm7, mm5    /* pa > pb? */
2527            movq mm0, mm7
2528            /* use mm7 mask to merge pa & pb */
2529            pand mm5, mm7
2530            /* use mm0 mask copy to merge a & b */
2531            pand mm2, mm0
2532            pandn mm7, mm4
2533            pandn mm0, mm1
2534            paddw mm7, mm5
2535            paddw mm0, mm2
2536            /*  test  ((pa <= pb)? pa:pb) <= pc */
2537            pcmpgtw mm7, mm6       /* pab > pc? */
2538            pxor mm1, mm1
2539            pand mm3, mm7
2540            pandn mm7, mm0
2541            paddw mm7, mm3
2542            pxor mm0, mm0
2543            packuswb mm7, mm1
2544            movq mm3, [esi + ebx]   /* load c=Prior(x-bpp) */
2545            pand mm7, ActiveMask
2546            movq mm2, mm3           /* load b=Prior(x) step 1 */
2547            paddb mm7, [edi + ebx]  /* add Paeth predictor with Raw(x) */
2548            punpcklbw mm3, mm0      /* Unpack High bytes of c */
2549            movq [edi + ebx], mm7   /* write back updated value */
2550            movq mm1, mm7           /* Now mm1 will be used as Raw(x-bpp) */
2551            /* Now do Paeth for 2nd set of bytes (3-5) */
2552            psrlq mm2, ShiftBpp     /* load b=Prior(x) step 2 */
2553            punpcklbw mm1, mm0      /* Unpack High bytes of a */
2554            pxor mm7, mm7
2555            punpcklbw mm2, mm0      /* Unpack High bytes of b */
2556            /* pbv = p - b = (a + b - c) - b = a - c */
2557            movq mm5, mm1
2558            /* pav = p - a = (a + b - c) - a = b - c */
2559            movq mm4, mm2
2560            psubw mm5, mm3
2561            psubw mm4, mm3
2562            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
2563            /*       pav + pbv = pbv + pav */
2564            movq mm6, mm5
2565            paddw mm6, mm4
2566
2567            /* pa = abs(p-a) = abs(pav) */
2568            /* pb = abs(p-b) = abs(pbv) */
2569            /* pc = abs(p-c) = abs(pcv) */
2570            pcmpgtw mm0, mm5       /* Create mask pbv bytes < 0 */
2571            pcmpgtw mm7, mm4       /* Create mask pav bytes < 0 */
2572            pand mm0, mm5          /* Only pbv bytes < 0 in mm0 */
2573            pand mm7, mm4          /* Only pav bytes < 0 in mm7 */
2574            psubw mm5, mm0
2575            psubw mm4, mm7
2576            psubw mm5, mm0
2577            psubw mm4, mm7
2578            pxor mm0, mm0
2579            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
2580            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
2581            psubw mm6, mm0
2582            /*  test pa <= pb */
2583            movq mm7, mm4
2584            psubw mm6, mm0
2585            pcmpgtw mm7, mm5       /* pa > pb? */
2586            movq mm0, mm7
2587            /* use mm7 mask to merge pa & pb */
2588            pand mm5, mm7
2589            /* use mm0 mask copy to merge a & b */
2590            pand mm2, mm0
2591            pandn mm7, mm4
2592            pandn mm0, mm1
2593            paddw mm7, mm5
2594            paddw mm0, mm2
2595            /*  test  ((pa <= pb)? pa:pb) <= pc */
2596            pcmpgtw mm7, mm6       /* pab > pc? */
2597            movq mm2, [esi + ebx]  /* load b=Prior(x) */
2598            pand mm3, mm7
2599            pandn mm7, mm0
2600            pxor mm1, mm1
2601            paddw mm7, mm3
2602            pxor mm0, mm0
2603            packuswb mm7, mm1
2604            movq mm3, mm2           /* load c=Prior(x-bpp) step 1 */
2605            pand mm7, ActiveMask
2606            punpckhbw mm2, mm0      /* Unpack High bytes of b */
2607            psllq mm7, ShiftBpp     /* Shift bytes to 2nd group of 3 bytes */
2608             /* pav = p - a = (a + b - c) - a = b - c */
2609            movq mm4, mm2
2610            paddb mm7, [edi + ebx]  /* add Paeth predictor with Raw(x) */
2611            psllq mm3, ShiftBpp     /* load c=Prior(x-bpp) step 2 */
2612            movq [edi + ebx], mm7   /* write back updated value */
2613            movq mm1, mm7
2614            punpckhbw mm3, mm0      /* Unpack High bytes of c */
2615            psllq mm1, ShiftBpp     /* Shift bytes */
2616                                    /* Now mm1 will be used as Raw(x-bpp) */
2617            /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
2618            pxor mm7, mm7
2619            punpckhbw mm1, mm0      /* Unpack High bytes of a */
2620            psubw mm4, mm3
2621            /* pbv = p - b = (a + b - c) - b = a - c */
2622            movq mm5, mm1
2623            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2624            movq mm6, mm4
2625            psubw mm5, mm3
2626            pxor mm0, mm0
2627            paddw mm6, mm5
2628
2629            /* pa = abs(p-a) = abs(pav) */
2630            /* pb = abs(p-b) = abs(pbv) */
2631            /* pc = abs(p-c) = abs(pcv) */
2632            pcmpgtw mm0, mm4    /* Create mask pav bytes < 0 */
2633            pcmpgtw mm7, mm5    /* Create mask pbv bytes < 0 */
2634            pand mm0, mm4       /* Only pav bytes < 0 in mm7 */
2635            pand mm7, mm5       /* Only pbv bytes < 0 in mm0 */
2636            psubw mm4, mm0
2637            psubw mm5, mm7
2638            psubw mm4, mm0
2639            psubw mm5, mm7
2640            pxor mm0, mm0
2641            pcmpgtw mm0, mm6    /* Create mask pcv bytes < 0 */
2642            pand mm0, mm6       /* Only pav bytes < 0 in mm7 */
2643            psubw mm6, mm0
2644            /*  test pa <= pb */
2645            movq mm7, mm4
2646            psubw mm6, mm0
2647            pcmpgtw mm7, mm5    /* pa > pb? */
2648            movq mm0, mm7
2649            /* use mm0 mask copy to merge a & b */
2650            pand mm2, mm0
2651            /* use mm7 mask to merge pa & pb */
2652            pand mm5, mm7
2653            pandn mm0, mm1
2654            pandn mm7, mm4
2655            paddw mm0, mm2
2656            paddw mm7, mm5
2657            /*  test  ((pa <= pb)? pa:pb) <= pc */
2658            pcmpgtw mm7, mm6    /* pab > pc? */
2659            pand mm3, mm7
2660            pandn mm7, mm0
2661            paddw mm7, mm3
2662            pxor mm1, mm1
2663            packuswb mm1, mm7
2664            /* Step ebx to next set of 8 bytes and repeat loop til done */
2665            add ebx, 8
2666            pand mm1, ActiveMaskEnd
2667            paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2668
2669            cmp ebx, MMXLength
2670            pxor mm0, mm0              /* pxor does not affect flags */
2671            movq [edi + ebx - 8], mm1  /* write back updated value */
2672                                 /* mm1 will be used as Raw(x-bpp) next loop */
2673                           /* mm3 ready to be used as Prior(x-bpp) next loop */
2674            jb dpth3lp
2675         } /* end _asm block */
2676      }
2677      break;
2678
2679      case 6:
2680      case 7:
2681      case 5:
2682      {
2683         ActiveMask.use  = 0x00000000ffffffff;
2684         ActiveMask2.use = 0xffffffff00000000;
2685         ShiftBpp.use = bpp << 3;    /* == bpp * 8 */
2686         ShiftRem.use = 64 - ShiftBpp.use;
2687         _asm
2688         {
2689            mov ebx, diff
2690            mov edi, row
2691            mov esi, prev_row
2692            /* PRIME the pump (load the first Raw(x-bpp) data set */
2693            movq mm1, [edi+ebx-8]
2694            pxor mm0, mm0
2695dpth6lp:
2696            /* Must shift to position Raw(x-bpp) data */
2697            psrlq mm1, ShiftRem
2698            /* Do first set of 4 bytes */
2699            movq mm3, [esi+ebx-8]      /* read c=Prior(x-bpp) bytes */
2700            punpcklbw mm1, mm0      /* Unpack Low bytes of a */
2701            movq mm2, [esi + ebx]   /* load b=Prior(x) */
2702            punpcklbw mm2, mm0      /* Unpack Low bytes of b */
2703            /* Must shift to position Prior(x-bpp) data */
2704            psrlq mm3, ShiftRem
2705            /* pav = p - a = (a + b - c) - a = b - c */
2706            movq mm4, mm2
2707            punpcklbw mm3, mm0      /* Unpack Low bytes of c */
2708            /* pbv = p - b = (a + b - c) - b = a - c */
2709            movq mm5, mm1
2710            psubw mm4, mm3
2711            pxor mm7, mm7
2712            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2713            movq mm6, mm4
2714            psubw mm5, mm3
2715            /* pa = abs(p-a) = abs(pav) */
2716            /* pb = abs(p-b) = abs(pbv) */
2717            /* pc = abs(p-c) = abs(pcv) */
2718            pcmpgtw mm0, mm4    /* Create mask pav bytes < 0 */
2719            paddw mm6, mm5
2720            pand mm0, mm4       /* Only pav bytes < 0 in mm7 */
2721            pcmpgtw mm7, mm5    /* Create mask pbv bytes < 0 */
2722            psubw mm4, mm0
2723            pand mm7, mm5       /* Only pbv bytes < 0 in mm0 */
2724            psubw mm4, mm0
2725            psubw mm5, mm7
2726            pxor mm0, mm0
2727            pcmpgtw mm0, mm6    /* Create mask pcv bytes < 0 */
2728            pand mm0, mm6       /* Only pav bytes < 0 in mm7 */
2729            psubw mm5, mm7
2730            psubw mm6, mm0
2731            /*  test pa <= pb */
2732            movq mm7, mm4
2733            psubw mm6, mm0
2734            pcmpgtw mm7, mm5    /* pa > pb? */
2735            movq mm0, mm7
2736            /* use mm7 mask to merge pa & pb */
2737            pand mm5, mm7
2738            /* use mm0 mask copy to merge a & b */
2739            pand mm2, mm0
2740            pandn mm7, mm4
2741            pandn mm0, mm1
2742            paddw mm7, mm5
2743            paddw mm0, mm2
2744            /*  test  ((pa <= pb)? pa:pb) <= pc */
2745            pcmpgtw mm7, mm6    /* pab > pc? */
2746            pxor mm1, mm1
2747            pand mm3, mm7
2748            pandn mm7, mm0
2749            paddw mm7, mm3
2750            pxor mm0, mm0
2751            packuswb mm7, mm1
2752            movq mm3, [esi + ebx - 8]  /* load c=Prior(x-bpp) */
2753            pand mm7, ActiveMask
2754            psrlq mm3, ShiftRem
2755            movq mm2, [esi + ebx]      /* load b=Prior(x) step 1 */
2756            paddb mm7, [edi + ebx]     /* add Paeth predictor with Raw(x) */
2757            movq mm6, mm2
2758            movq [edi + ebx], mm7      /* write back updated value */
2759            movq mm1, [edi+ebx-8]
2760            psllq mm6, ShiftBpp
2761            movq mm5, mm7
2762            psrlq mm1, ShiftRem
2763            por mm3, mm6
2764            psllq mm5, ShiftBpp
2765            punpckhbw mm3, mm0         /* Unpack High bytes of c */
2766            por mm1, mm5
2767            /* Do second set of 4 bytes */
2768            punpckhbw mm2, mm0         /* Unpack High bytes of b */
2769            punpckhbw mm1, mm0         /* Unpack High bytes of a */
2770            /* pav = p - a = (a + b - c) - a = b - c */
2771            movq mm4, mm2
2772            /* pbv = p - b = (a + b - c) - b = a - c */
2773            movq mm5, mm1
2774            psubw mm4, mm3
2775            pxor mm7, mm7
2776            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2777            movq mm6, mm4
2778            psubw mm5, mm3
2779            /* pa = abs(p-a) = abs(pav) */
2780            /* pb = abs(p-b) = abs(pbv) */
2781            /* pc = abs(p-c) = abs(pcv) */
2782            pcmpgtw mm0, mm4       /* Create mask pav bytes < 0 */
2783            paddw mm6, mm5
2784            pand mm0, mm4          /* Only pav bytes < 0 in mm7 */
2785            pcmpgtw mm7, mm5       /* Create mask pbv bytes < 0 */
2786            psubw mm4, mm0
2787            pand mm7, mm5          /* Only pbv bytes < 0 in mm0 */
2788            psubw mm4, mm0
2789            psubw mm5, mm7
2790            pxor mm0, mm0
2791            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
2792            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
2793            psubw mm5, mm7
2794            psubw mm6, mm0
2795            /*  test pa <= pb */
2796            movq mm7, mm4
2797            psubw mm6, mm0
2798            pcmpgtw mm7, mm5       /* pa > pb? */
2799            movq mm0, mm7
2800            /* use mm7 mask to merge pa & pb */
2801            pand mm5, mm7
2802            /* use mm0 mask copy to merge a & b */
2803            pand mm2, mm0
2804            pandn mm7, mm4
2805            pandn mm0, mm1
2806            paddw mm7, mm5
2807            paddw mm0, mm2
2808            /*  test  ((pa <= pb)? pa:pb) <= pc */
2809            pcmpgtw mm7, mm6           /* pab > pc? */
2810            pxor mm1, mm1
2811            pand mm3, mm7
2812            pandn mm7, mm0
2813            pxor mm1, mm1
2814            paddw mm7, mm3
2815            pxor mm0, mm0
2816            /* Step ex to next set of 8 bytes and repeat loop til done */
2817            add ebx, 8
2818            packuswb mm1, mm7
2819            paddb mm1, [edi + ebx - 8]     /* add Paeth predictor with Raw(x) */
2820            cmp ebx, MMXLength
2821            movq [edi + ebx - 8], mm1      /* write back updated value */
2822                                /* mm1 will be used as Raw(x-bpp) next loop */
2823            jb dpth6lp
2824         } /* end _asm block */
2825      }
2826      break;
2827
2828      case 4:
2829      {
2830         ActiveMask.use  = 0x00000000ffffffff;
2831         _asm {
2832            mov ebx, diff
2833            mov edi, row
2834            mov esi, prev_row
2835            pxor mm0, mm0
2836            /* PRIME the pump (load the first Raw(x-bpp) data set */
2837            movq mm1, [edi+ebx-8]    /* Only time should need to read */
2838                                     /*  a=Raw(x-bpp) bytes */
2839dpth4lp:
2840            /* Do first set of 4 bytes */
2841            movq mm3, [esi+ebx-8]    /* read c=Prior(x-bpp) bytes */
2842            punpckhbw mm1, mm0       /* Unpack Low bytes of a */
2843            movq mm2, [esi + ebx]    /* load b=Prior(x) */
2844            punpcklbw mm2, mm0       /* Unpack High bytes of b */
2845            /* pav = p - a = (a + b - c) - a = b - c */
2846            movq mm4, mm2
2847            punpckhbw mm3, mm0       /* Unpack High bytes of c */
2848            /* pbv = p - b = (a + b - c) - b = a - c */
2849            movq mm5, mm1
2850            psubw mm4, mm3
2851            pxor mm7, mm7
2852            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2853            movq mm6, mm4
2854            psubw mm5, mm3
2855            /* pa = abs(p-a) = abs(pav) */
2856            /* pb = abs(p-b) = abs(pbv) */
2857            /* pc = abs(p-c) = abs(pcv) */
2858            pcmpgtw mm0, mm4       /* Create mask pav bytes < 0 */
2859            paddw mm6, mm5
2860            pand mm0, mm4          /* Only pav bytes < 0 in mm7 */
2861            pcmpgtw mm7, mm5       /* Create mask pbv bytes < 0 */
2862            psubw mm4, mm0
2863            pand mm7, mm5          /* Only pbv bytes < 0 in mm0 */
2864            psubw mm4, mm0
2865            psubw mm5, mm7
2866            pxor mm0, mm0
2867            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
2868            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
2869            psubw mm5, mm7
2870            psubw mm6, mm0
2871            /*  test pa <= pb */
2872            movq mm7, mm4
2873            psubw mm6, mm0
2874            pcmpgtw mm7, mm5       /* pa > pb? */
2875            movq mm0, mm7
2876            /* use mm7 mask to merge pa & pb */
2877            pand mm5, mm7
2878            /* use mm0 mask copy to merge a & b */
2879            pand mm2, mm0
2880            pandn mm7, mm4
2881            pandn mm0, mm1
2882            paddw mm7, mm5
2883            paddw mm0, mm2
2884            /*  test  ((pa <= pb)? pa:pb) <= pc */
2885            pcmpgtw mm7, mm6       /* pab > pc? */
2886            pxor mm1, mm1
2887            pand mm3, mm7
2888            pandn mm7, mm0
2889            paddw mm7, mm3
2890            pxor mm0, mm0
2891            packuswb mm7, mm1
2892            movq mm3, [esi + ebx]      /* load c=Prior(x-bpp) */
2893            pand mm7, ActiveMask
2894            movq mm2, mm3              /* load b=Prior(x) step 1 */
2895            paddb mm7, [edi + ebx]     /* add Paeth predictor with Raw(x) */
2896            punpcklbw mm3, mm0         /* Unpack High bytes of c */
2897            movq [edi + ebx], mm7      /* write back updated value */
2898            movq mm1, mm7              /* Now mm1 will be used as Raw(x-bpp) */
2899            /* Do second set of 4 bytes */
2900            punpckhbw mm2, mm0         /* Unpack Low bytes of b */
2901            punpcklbw mm1, mm0         /* Unpack Low bytes of a */
2902            /* pav = p - a = (a + b - c) - a = b - c */
2903            movq mm4, mm2
2904            /* pbv = p - b = (a + b - c) - b = a - c */
2905            movq mm5, mm1
2906            psubw mm4, mm3
2907            pxor mm7, mm7
2908            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2909            movq mm6, mm4
2910            psubw mm5, mm3
2911            /* pa = abs(p-a) = abs(pav) */
2912            /* pb = abs(p-b) = abs(pbv) */
2913            /* pc = abs(p-c) = abs(pcv) */
2914            pcmpgtw mm0, mm4       /* Create mask pav bytes < 0 */
2915            paddw mm6, mm5
2916            pand mm0, mm4          /* Only pav bytes < 0 in mm7 */
2917            pcmpgtw mm7, mm5       /* Create mask pbv bytes < 0 */
2918            psubw mm4, mm0
2919            pand mm7, mm5          /* Only pbv bytes < 0 in mm0 */
2920            psubw mm4, mm0
2921            psubw mm5, mm7
2922            pxor mm0, mm0
2923            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
2924            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
2925            psubw mm5, mm7
2926            psubw mm6, mm0
2927            /*  test pa <= pb */
2928            movq mm7, mm4
2929            psubw mm6, mm0
2930            pcmpgtw mm7, mm5       /* pa > pb? */
2931            movq mm0, mm7
2932            /* use mm7 mask to merge pa & pb */
2933            pand mm5, mm7
2934            /* use mm0 mask copy to merge a & b */
2935            pand mm2, mm0
2936            pandn mm7, mm4
2937            pandn mm0, mm1
2938            paddw mm7, mm5
2939            paddw mm0, mm2
2940            /*  test  ((pa <= pb)? pa:pb) <= pc */
2941            pcmpgtw mm7, mm6       /* pab > pc? */
2942            pxor mm1, mm1
2943            pand mm3, mm7
2944            pandn mm7, mm0
2945            pxor mm1, mm1
2946            paddw mm7, mm3
2947            pxor mm0, mm0
2948            /* Step ex to next set of 8 bytes and repeat loop til done */
2949            add ebx, 8
2950            packuswb mm1, mm7
2951            paddb mm1, [edi + ebx - 8]     /* add Paeth predictor with Raw(x) */
2952            cmp ebx, MMXLength
2953            movq [edi + ebx - 8], mm1      /* write back updated value */
2954                                /* mm1 will be used as Raw(x-bpp) next loop */
2955            jb dpth4lp
2956         } /* end _asm block */
2957      }
2958      break;
2959      case 8:                          /* bpp == 8 */
2960      {
2961         ActiveMask.use  = 0x00000000ffffffff;
2962         _asm {
2963            mov ebx, diff
2964            mov edi, row
2965            mov esi, prev_row
2966            pxor mm0, mm0
2967            /* PRIME the pump (load the first Raw(x-bpp) data set */
2968            movq mm1, [edi+ebx-8]      /* Only time should need to read */
2969                                       /*  a=Raw(x-bpp) bytes */
2970dpth8lp:
2971            /* Do first set of 4 bytes */
2972            movq mm3, [esi+ebx-8]      /* read c=Prior(x-bpp) bytes */
2973            punpcklbw mm1, mm0         /* Unpack Low bytes of a */
2974            movq mm2, [esi + ebx]      /* load b=Prior(x) */
2975            punpcklbw mm2, mm0         /* Unpack Low bytes of b */
2976            /* pav = p - a = (a + b - c) - a = b - c */
2977            movq mm4, mm2
2978            punpcklbw mm3, mm0         /* Unpack Low bytes of c */
2979            /* pbv = p - b = (a + b - c) - b = a - c */
2980            movq mm5, mm1
2981            psubw mm4, mm3
2982            pxor mm7, mm7
2983            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2984            movq mm6, mm4
2985            psubw mm5, mm3
2986            /* pa = abs(p-a) = abs(pav) */
2987            /* pb = abs(p-b) = abs(pbv) */
2988            /* pc = abs(p-c) = abs(pcv) */
2989            pcmpgtw mm0, mm4       /* Create mask pav bytes < 0 */
2990            paddw mm6, mm5
2991            pand mm0, mm4          /* Only pav bytes < 0 in mm7 */
2992            pcmpgtw mm7, mm5       /* Create mask pbv bytes < 0 */
2993            psubw mm4, mm0
2994            pand mm7, mm5          /* Only pbv bytes < 0 in mm0 */
2995            psubw mm4, mm0
2996            psubw mm5, mm7
2997            pxor mm0, mm0
2998            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
2999            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
3000            psubw mm5, mm7
3001            psubw mm6, mm0
3002            /*  test pa <= pb */
3003            movq mm7, mm4
3004            psubw mm6, mm0
3005            pcmpgtw mm7, mm5       /* pa > pb? */
3006            movq mm0, mm7
3007            /* use mm7 mask to merge pa & pb */
3008            pand mm5, mm7
3009            /* use mm0 mask copy to merge a & b */
3010            pand mm2, mm0
3011            pandn mm7, mm4
3012            pandn mm0, mm1
3013            paddw mm7, mm5
3014            paddw mm0, mm2
3015            /*  test  ((pa <= pb)? pa:pb) <= pc */
3016            pcmpgtw mm7, mm6       /* pab > pc? */
3017            pxor mm1, mm1
3018            pand mm3, mm7
3019            pandn mm7, mm0
3020            paddw mm7, mm3
3021            pxor mm0, mm0
3022            packuswb mm7, mm1
3023            movq mm3, [esi+ebx-8]    /* read c=Prior(x-bpp) bytes */
3024            pand mm7, ActiveMask
3025            movq mm2, [esi + ebx]    /* load b=Prior(x) */
3026            paddb mm7, [edi + ebx]   /* add Paeth predictor with Raw(x) */
3027            punpckhbw mm3, mm0       /* Unpack High bytes of c */
3028            movq [edi + ebx], mm7    /* write back updated value */
3029            movq mm1, [edi+ebx-8]    /* read a=Raw(x-bpp) bytes */
3030
3031            /* Do second set of 4 bytes */
3032            punpckhbw mm2, mm0       /* Unpack High bytes of b */
3033            punpckhbw mm1, mm0       /* Unpack High bytes of a */
3034            /* pav = p - a = (a + b - c) - a = b - c */
3035            movq mm4, mm2
3036            /* pbv = p - b = (a + b - c) - b = a - c */
3037            movq mm5, mm1
3038            psubw mm4, mm3
3039            pxor mm7, mm7
3040            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3041            movq mm6, mm4
3042            psubw mm5, mm3
3043            /* pa = abs(p-a) = abs(pav) */
3044            /* pb = abs(p-b) = abs(pbv) */
3045            /* pc = abs(p-c) = abs(pcv) */
3046            pcmpgtw mm0, mm4       /* Create mask pav bytes < 0 */
3047            paddw mm6, mm5
3048            pand mm0, mm4          /* Only pav bytes < 0 in mm7 */
3049            pcmpgtw mm7, mm5       /* Create mask pbv bytes < 0 */
3050            psubw mm4, mm0
3051            pand mm7, mm5          /* Only pbv bytes < 0 in mm0 */
3052            psubw mm4, mm0
3053            psubw mm5, mm7
3054            pxor mm0, mm0
3055            pcmpgtw mm0, mm6       /* Create mask pcv bytes < 0 */
3056            pand mm0, mm6          /* Only pav bytes < 0 in mm7 */
3057            psubw mm5, mm7
3058            psubw mm6, mm0
3059            /*  test pa <= pb */
3060            movq mm7, mm4
3061            psubw mm6, mm0
3062            pcmpgtw mm7, mm5       /* pa > pb? */
3063            movq mm0, mm7
3064            /* use mm7 mask to merge pa & pb */
3065            pand mm5, mm7
3066            /* use mm0 mask copy to merge a & b */
3067            pand mm2, mm0
3068            pandn mm7, mm4
3069            pandn mm0, mm1
3070            paddw mm7, mm5
3071            paddw mm0, mm2
3072            /*  test  ((pa <= pb)? pa:pb) <= pc */
3073            pcmpgtw mm7, mm6       /* pab > pc? */
3074            pxor mm1, mm1
3075            pand mm3, mm7
3076            pandn mm7, mm0
3077            pxor mm1, mm1
3078            paddw mm7, mm3
3079            pxor mm0, mm0
3080            /* Step ex to next set of 8 bytes and repeat loop til done */
3081            add ebx, 8
3082            packuswb mm1, mm7
3083            paddb mm1, [edi + ebx - 8]     /* add Paeth predictor with Raw(x) */
3084            cmp ebx, MMXLength
3085            movq [edi + ebx - 8], mm1      /* write back updated value */
3086                            /* mm1 will be used as Raw(x-bpp) next loop */
3087            jb dpth8lp
3088         } /* end _asm block */
3089      }
3090      break;
3091
3092      case 1:                /* bpp = 1 */
3093      case 2:                /* bpp = 2 */
3094      default:               /* bpp > 8 */
3095      {
3096         _asm {
3097            mov ebx, diff
3098            cmp ebx, FullLength
3099            jnb dpthdend
3100            mov edi, row
3101            mov esi, prev_row
3102            /* Do Paeth decode for remaining bytes */
3103            mov edx, ebx
3104            xor ecx, ecx        /* zero ecx before using cl & cx in loop below */
3105            sub edx, bpp        /* Set edx = ebx - bpp */
3106dpthdlp:
3107            xor eax, eax
3108            /* pav = p - a = (a + b - c) - a = b - c */
3109            mov al, [esi + ebx]        /* load Prior(x) into al */
3110            mov cl, [esi + edx]        /* load Prior(x-bpp) into cl */
3111            sub eax, ecx                 /* subtract Prior(x-bpp) */
3112            mov patemp, eax                 /* Save pav for later use */
3113            xor eax, eax
3114            /* pbv = p - b = (a + b - c) - b = a - c */
3115            mov al, [edi + edx]        /* load Raw(x-bpp) into al */
3116            sub eax, ecx                 /* subtract Prior(x-bpp) */
3117            mov ecx, eax
3118            /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3119            add eax, patemp                 /* pcv = pav + pbv */
3120            /* pc = abs(pcv) */
3121            test eax, 0x80000000
3122            jz dpthdpca
3123            neg eax                     /* reverse sign of neg values */
3124dpthdpca:
3125            mov pctemp, eax             /* save pc for later use */
3126            /* pb = abs(pbv) */
3127            test ecx, 0x80000000
3128            jz dpthdpba
3129            neg ecx                     /* reverse sign of neg values */
3130dpthdpba:
3131            mov pbtemp, ecx             /* save pb for later use */
3132            /* pa = abs(pav) */
3133            mov eax, patemp
3134            test eax, 0x80000000
3135            jz dpthdpaa
3136            neg eax                     /* reverse sign of neg values */
3137dpthdpaa:
3138            mov patemp, eax             /* save pa for later use */
3139            /* test if pa <= pb */
3140            cmp eax, ecx
3141            jna dpthdabb
3142            /* pa > pb; now test if pb <= pc */
3143            cmp ecx, pctemp
3144            jna dpthdbbc
3145            /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3146            mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
3147            jmp dpthdpaeth
3148dpthdbbc:
3149            /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3150            mov cl, [esi + ebx]        /* load Prior(x) into cl */
3151            jmp dpthdpaeth
3152dpthdabb:
3153            /* pa <= pb; now test if pa <= pc */
3154            cmp eax, pctemp
3155            jna dpthdabc
3156            /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3157            mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
3158            jmp dpthdpaeth
3159dpthdabc:
3160            /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3161            mov cl, [edi + edx]  /* load Raw(x-bpp) into cl */
3162dpthdpaeth:
3163            inc ebx
3164            inc edx
3165            /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3166            add [edi + ebx - 1], cl
3167            cmp ebx, FullLength
3168            jb dpthdlp
3169dpthdend:
3170         } /* end _asm block */
3171      }
3172      return;                   /* No need to go further with this one */
3173   }                         /* end switch ( bpp ) */
3174   _asm
3175   {
3176         /* MMX acceleration complete now do clean-up */
3177         /* Check if any remaining bytes left to decode */
3178         mov ebx, MMXLength
3179         cmp ebx, FullLength
3180         jnb dpthend
3181         mov edi, row
3182         mov esi, prev_row
3183         /* Do Paeth decode for remaining bytes */
3184         mov edx, ebx
3185         xor ecx, ecx         /* zero ecx before using cl & cx in loop below */
3186         sub edx, bpp         /* Set edx = ebx - bpp */
3187dpthlp2:
3188         xor eax, eax
3189         /* pav = p - a = (a + b - c) - a = b - c */
3190         mov al, [esi + ebx]  /* load Prior(x) into al */
3191         mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
3192         sub eax, ecx         /* subtract Prior(x-bpp) */
3193         mov patemp, eax      /* Save pav for later use */
3194         xor eax, eax
3195         /* pbv = p - b = (a + b - c) - b = a - c */
3196         mov al, [edi + edx]  /* load Raw(x-bpp) into al */
3197         sub eax, ecx         /* subtract Prior(x-bpp) */
3198         mov ecx, eax
3199         /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3200         add eax, patemp      /* pcv = pav + pbv */
3201         /* pc = abs(pcv) */
3202         test eax, 0x80000000
3203         jz dpthpca2
3204         neg eax              /* reverse sign of neg values */
3205dpthpca2:
3206         mov pctemp, eax      /* save pc for later use */
3207         /* pb = abs(pbv) */
3208         test ecx, 0x80000000
3209         jz dpthpba2
3210         neg ecx              /* reverse sign of neg values */
3211dpthpba2:
3212         mov pbtemp, ecx      /* save pb for later use */
3213         /* pa = abs(pav) */
3214         mov eax, patemp
3215         test eax, 0x80000000
3216         jz dpthpaa2
3217         neg eax              /* reverse sign of neg values */
3218dpthpaa2:
3219         mov patemp, eax      /* save pa for later use */
3220         /* test if pa <= pb */
3221         cmp eax, ecx
3222         jna dpthabb2
3223         /* pa > pb; now test if pb <= pc */
3224         cmp ecx, pctemp
3225         jna dpthbbc2
3226         /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3227         mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
3228         jmp dpthpaeth2
3229dpthbbc2:
3230         /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3231         mov cl, [esi + ebx]        /* load Prior(x) into cl */
3232         jmp dpthpaeth2
3233dpthabb2:
3234         /* pa <= pb; now test if pa <= pc */
3235         cmp eax, pctemp
3236         jna dpthabc2
3237         /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3238         mov cl, [esi + edx]  /* load Prior(x-bpp) into cl */
3239         jmp dpthpaeth2
3240dpthabc2:
3241         /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3242         mov cl, [edi + edx]  /* load Raw(x-bpp) into cl */
3243dpthpaeth2:
3244         inc ebx
3245         inc edx
3246         /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3247         add [edi + ebx - 1], cl
3248         cmp ebx, FullLength
3249         jb dpthlp2
3250dpthend:
3251         emms             /* End MMX instructions; prep for possible FP instrs. */
3252   } /* end _asm block */
3253}
3254
3255/* Optimized code for PNG Sub filter decoder */
3256void /* PRIVATE */
3257png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3258{
3259   /*int test; */
3260   int bpp;
3261   png_uint_32 FullLength;
3262   png_uint_32 MMXLength;
3263   int diff;
3264
3265   bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3266   FullLength  = row_info->rowbytes - bpp; /* # of bytes to filter */
3267   _asm {
3268        mov edi, row
3269        mov esi, edi               /* lp = row */
3270        add edi, bpp               /* rp = row + bpp */
3271        xor eax, eax
3272        /* get # of bytes to alignment */
3273        mov diff, edi               /* take start of row */
3274        add diff, 0xf               /* add 7 + 8 to incr past */
3275                                        /* alignment boundary */
3276        xor ebx, ebx
3277        and diff, 0xfffffff8        /* mask to alignment boundary */
3278        sub diff, edi               /* subtract from start ==> value */
3279                                        /*  ebx at alignment */
3280        jz dsubgo
3281        /* fix alignment */
3282dsublp1:
3283        mov al, [esi+ebx]
3284        add [edi+ebx], al
3285        inc ebx
3286        cmp ebx, diff
3287        jb dsublp1
3288dsubgo:
3289        mov ecx, FullLength
3290        mov edx, ecx
3291        sub edx, ebx                  /* subtract alignment fix */
3292        and edx, 0x00000007           /* calc bytes over mult of 8 */
3293        sub ecx, edx                  /* drop over bytes from length */
3294        mov MMXLength, ecx
3295   } /* end _asm block */
3296
3297   /* Now do the math for the rest of the row */
3298   switch ( bpp )
3299   {
3300        case 3:
3301        {
3302         ActiveMask.use  = 0x0000ffffff000000;
3303         ShiftBpp.use = 24;       /* == 3 * 8 */
3304         ShiftRem.use  = 40;      /* == 64 - 24 */
3305         _asm {
3306            mov edi, row
3307            movq mm7, ActiveMask  /* Load ActiveMask for 2nd active byte group */
3308            mov esi, edi              /* lp = row */
3309            add edi, bpp          /* rp = row + bpp */
3310            movq mm6, mm7
3311            mov ebx, diff
3312            psllq mm6, ShiftBpp   /* Move mask in mm6 to cover 3rd active */
3313                                  /* byte group */
3314            /* PRIME the pump (load the first Raw(x-bpp) data set */
3315            movq mm1, [edi+ebx-8]
3316dsub3lp:
3317            psrlq mm1, ShiftRem   /* Shift data for adding 1st bpp bytes */
3318                          /* no need for mask; shift clears inactive bytes */
3319            /* Add 1st active group */
3320            movq mm0, [edi+ebx]
3321            paddb mm0, mm1
3322            /* Add 2nd active group */
3323            movq mm1, mm0         /* mov updated Raws to mm1 */
3324            psllq mm1, ShiftBpp   /* shift data to position correctly */
3325            pand mm1, mm7         /* mask to use only 2nd active group */
3326            paddb mm0, mm1
3327            /* Add 3rd active group */
3328            movq mm1, mm0         /* mov updated Raws to mm1 */
3329            psllq mm1, ShiftBpp   /* shift data to position correctly */
3330            pand mm1, mm6         /* mask to use only 3rd active group */
3331            add ebx, 8
3332            paddb mm0, mm1
3333            cmp ebx, MMXLength
3334            movq [edi+ebx-8], mm0     /* Write updated Raws back to array */
3335            /* Prep for doing 1st add at top of loop */
3336            movq mm1, mm0
3337            jb dsub3lp
3338         } /* end _asm block */
3339      }
3340      break;
3341
3342      case 1:
3343      {
3344         /* Placed here just in case this is a duplicate of the */
3345         /* non-MMX code for the SUB filter in png_read_filter_row below */
3346         //
3347         /*         png_bytep rp; */
3348         /*         png_bytep lp; */
3349         /*         png_uint_32 i; */
3350         /*         bpp = (row_info->pixel_depth + 7) >> 3; */
3351         /*         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
3352         /*            i < row_info->rowbytes; i++, rp++, lp++) */
3353         /*      { */
3354         /*            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
3355         /*      } */
3356         _asm {
3357            mov ebx, diff
3358            mov edi, row
3359            cmp ebx, FullLength
3360            jnb dsub1end
3361            mov esi, edi          /* lp = row */
3362            xor eax, eax
3363            add edi, bpp      /* rp = row + bpp */
3364dsub1lp:
3365            mov al, [esi+ebx]
3366            add [edi+ebx], al
3367            inc ebx
3368            cmp ebx, FullLength
3369            jb dsub1lp
3370dsub1end:
3371         } /* end _asm block */
3372      }
3373      return;
3374
3375      case 6:
3376      case 7:
3377      case 4:
3378      case 5:
3379      {
3380         ShiftBpp.use = bpp << 3;
3381         ShiftRem.use = 64 - ShiftBpp.use;
3382         _asm {
3383            mov edi, row
3384            mov ebx, diff
3385            mov esi, edi               /* lp = row */
3386            add edi, bpp           /* rp = row + bpp */
3387            /* PRIME the pump (load the first Raw(x-bpp) data set */
3388            movq mm1, [edi+ebx-8]
3389dsub4lp:
3390            psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3391                          /* no need for mask; shift clears inactive bytes */
3392            movq mm0, [edi+ebx]
3393            paddb mm0, mm1
3394            /* Add 2nd active group */
3395            movq mm1, mm0          /* mov updated Raws to mm1 */
3396            psllq mm1, ShiftBpp    /* shift data to position correctly */
3397                                   /* there is no need for any mask */
3398                                   /* since shift clears inactive bits/bytes */
3399            add ebx, 8
3400            paddb mm0, mm1
3401            cmp ebx, MMXLength
3402            movq [edi+ebx-8], mm0
3403            movq mm1, mm0          /* Prep for doing 1st add at top of loop */
3404            jb dsub4lp
3405         } /* end _asm block */
3406      }
3407      break;
3408
3409      case 2:
3410      {
3411         ActiveMask.use  = 0x00000000ffff0000;
3412         ShiftBpp.use = 16;       /* == 2 * 8 */
3413         ShiftRem.use = 48;       /* == 64 - 16 */
3414         _asm {
3415            movq mm7, ActiveMask  /* Load ActiveMask for 2nd active byte group */
3416            mov ebx, diff
3417            movq mm6, mm7
3418            mov edi, row
3419            psllq mm6, ShiftBpp     /* Move mask in mm6 to cover 3rd active */
3420                                    /*  byte group */
3421            mov esi, edi            /* lp = row */
3422            movq mm5, mm6
3423            add edi, bpp            /* rp = row + bpp */
3424            psllq mm5, ShiftBpp     /* Move mask in mm5 to cover 4th active */
3425                                    /*  byte group */
3426            /* PRIME the pump (load the first Raw(x-bpp) data set */
3427            movq mm1, [edi+ebx-8]
3428dsub2lp:
3429            /* Add 1st active group */
3430            psrlq mm1, ShiftRem     /* Shift data for adding 1st bpp bytes */
3431                                    /* no need for mask; shift clears inactive */
3432                                    /*  bytes */
3433            movq mm0, [edi+ebx]
3434            paddb mm0, mm1
3435            /* Add 2nd active group */
3436            movq mm1, mm0           /* mov updated Raws to mm1 */
3437            psllq mm1, ShiftBpp     /* shift data to position correctly */
3438            pand mm1, mm7           /* mask to use only 2nd active group */
3439            paddb mm0, mm1
3440            /* Add 3rd active group */
3441            movq mm1, mm0           /* mov updated Raws to mm1 */
3442            psllq mm1, ShiftBpp     /* shift data to position correctly */
3443            pand mm1, mm6           /* mask to use only 3rd active group */
3444            paddb mm0, mm1
3445            /* Add 4th active group */
3446            movq mm1, mm0           /* mov updated Raws to mm1 */
3447            psllq mm1, ShiftBpp     /* shift data to position correctly */
3448            pand mm1, mm5           /* mask to use only 4th active group */
3449            add ebx, 8
3450            paddb mm0, mm1
3451            cmp ebx, MMXLength
3452            movq [edi+ebx-8], mm0   /* Write updated Raws back to array */
3453            movq mm1, mm0           /* Prep for doing 1st add at top of loop */
3454            jb dsub2lp
3455         } /* end _asm block */
3456      }
3457      break;
3458      case 8:
3459      {
3460         _asm {
3461            mov edi, row
3462            mov ebx, diff
3463            mov esi, edi            /* lp = row */
3464            add edi, bpp            /* rp = row + bpp */
3465            mov ecx, MMXLength
3466            movq mm7, [edi+ebx-8]   /* PRIME the pump (load the first */
3467                                    /* Raw(x-bpp) data set */
3468            and ecx, 0x0000003f     /* calc bytes over mult of 64 */
3469dsub8lp:
3470            movq mm0, [edi+ebx]     /* Load Sub(x) for 1st 8 bytes */
3471            paddb mm0, mm7
3472            movq mm1, [edi+ebx+8]   /* Load Sub(x) for 2nd 8 bytes */
3473            movq [edi+ebx], mm0    /* Write Raw(x) for 1st 8 bytes */
3474                                   /* Now mm0 will be used as Raw(x-bpp) for */
3475                                   /* the 2nd group of 8 bytes.  This will be */
3476                                   /* repeated for each group of 8 bytes with */
3477                                   /* the 8th group being used as the Raw(x-bpp) */
3478                                   /* for the 1st group of the next loop. */
3479            paddb mm1, mm0
3480            movq mm2, [edi+ebx+16]  /* Load Sub(x) for 3rd 8 bytes */
3481            movq [edi+ebx+8], mm1   /* Write Raw(x) for 2nd 8 bytes */
3482            paddb mm2, mm1
3483            movq mm3, [edi+ebx+24]  /* Load Sub(x) for 4th 8 bytes */
3484            movq [edi+ebx+16], mm2  /* Write Raw(x) for 3rd 8 bytes */
3485            paddb mm3, mm2
3486            movq mm4, [edi+ebx+32]  /* Load Sub(x) for 5th 8 bytes */
3487            movq [edi+ebx+24], mm3  /* Write Raw(x) for 4th 8 bytes */
3488            paddb mm4, mm3
3489            movq mm5, [edi+ebx+40]  /* Load Sub(x) for 6th 8 bytes */
3490            movq [edi+ebx+32], mm4  /* Write Raw(x) for 5th 8 bytes */
3491            paddb mm5, mm4
3492            movq mm6, [edi+ebx+48]  /* Load Sub(x) for 7th 8 bytes */
3493            movq [edi+ebx+40], mm5  /* Write Raw(x) for 6th 8 bytes */
3494            paddb mm6, mm5
3495            movq mm7, [edi+ebx+56]  /* Load Sub(x) for 8th 8 bytes */
3496            movq [edi+ebx+48], mm6  /* Write Raw(x) for 7th 8 bytes */
3497            add ebx, 64
3498            paddb mm7, mm6
3499            cmp ebx, ecx
3500            movq [edi+ebx-8], mm7   /* Write Raw(x) for 8th 8 bytes */
3501            jb dsub8lp
3502            cmp ebx, MMXLength
3503            jnb dsub8lt8
3504dsub8lpA:
3505            movq mm0, [edi+ebx]
3506            add ebx, 8
3507            paddb mm0, mm7
3508            cmp ebx, MMXLength
3509            movq [edi+ebx-8], mm0   /* use -8 to offset early add to ebx */
3510            movq mm7, mm0           /* Move calculated Raw(x) data to mm1 to */
3511                                    /* be the new Raw(x-bpp) for the next loop */
3512            jb dsub8lpA
3513dsub8lt8:
3514         } /* end _asm block */
3515      }
3516      break;
3517
3518      default:                /* bpp greater than 8 bytes */
3519      {
3520         _asm {
3521            mov ebx, diff
3522            mov edi, row
3523            mov esi, edi           /* lp = row */
3524            add edi, bpp           /* rp = row + bpp */
3525dsubAlp:
3526            movq mm0, [edi+ebx]
3527            movq mm1, [esi+ebx]
3528            add ebx, 8
3529            paddb mm0, mm1
3530            cmp ebx, MMXLength
3531            movq [edi+ebx-8], mm0  /* mov does not affect flags; -8 to offset */
3532                                   /*  add ebx */
3533            jb dsubAlp
3534         } /* end _asm block */
3535      }
3536      break;
3537
3538   } /* end switch ( bpp ) */
3539
3540   _asm {
3541        mov ebx, MMXLength
3542        mov edi, row
3543        cmp ebx, FullLength
3544        jnb dsubend
3545        mov esi, edi               /* lp = row */
3546        xor eax, eax
3547        add edi, bpp               /* rp = row + bpp */
3548dsublp2:
3549        mov al, [esi+ebx]
3550        add [edi+ebx], al
3551        inc ebx
3552        cmp ebx, FullLength
3553        jb dsublp2
3554dsubend:
3555        emms             /* End MMX instructions; prep for possible FP instrs. */
3556   } /* end _asm block */
3557}
3558
3559/* Optimized code for PNG Up filter decoder */
3560void /* PRIVATE */
3561png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3562   png_bytep prev_row)
3563{
3564   png_uint_32 len;
3565   len  = row_info->rowbytes;       /* # of bytes to filter */
3566   _asm {
3567      mov edi, row
3568      /* get # of bytes to alignment */
3569      mov ecx, edi
3570      xor ebx, ebx
3571      add ecx, 0x7
3572      xor eax, eax
3573      and ecx, 0xfffffff8
3574      mov esi, prev_row
3575      sub ecx, edi
3576      jz dupgo
3577      /* fix alignment */
3578duplp1:
3579      mov al, [edi+ebx]
3580      add al, [esi+ebx]
3581      inc ebx
3582      cmp ebx, ecx
3583      mov [edi + ebx-1], al  /* mov does not affect flags; -1 to offset inc ebx */
3584      jb duplp1
3585dupgo:
3586      mov ecx, len
3587      mov edx, ecx
3588      sub edx, ebx                  /* subtract alignment fix */
3589      and edx, 0x0000003f           /* calc bytes over mult of 64 */
3590      sub ecx, edx                  /* drop over bytes from length */
3591      /* Unrolled loop - use all MMX registers and interleave to reduce */
3592      /* number of branch instructions (loops) and reduce partial stalls */
3593duploop:
3594      movq mm1, [esi+ebx]
3595      movq mm0, [edi+ebx]
3596      movq mm3, [esi+ebx+8]
3597      paddb mm0, mm1
3598      movq mm2, [edi+ebx+8]
3599      movq [edi+ebx], mm0
3600      paddb mm2, mm3
3601      movq mm5, [esi+ebx+16]
3602      movq [edi+ebx+8], mm2
3603      movq mm4, [edi+ebx+16]
3604      movq mm7, [esi+ebx+24]
3605      paddb mm4, mm5
3606      movq mm6, [edi+ebx+24]
3607      movq [edi+ebx+16], mm4
3608      paddb mm6, mm7
3609      movq mm1, [esi+ebx+32]
3610      movq [edi+ebx+24], mm6
3611      movq mm0, [edi+ebx+32]
3612      movq mm3, [esi+ebx+40]
3613      paddb mm0, mm1
3614      movq mm2, [edi+ebx+40]
3615      movq [edi+ebx+32], mm0
3616      paddb mm2, mm3
3617      movq mm5, [esi+ebx+48]
3618      movq [edi+ebx+40], mm2
3619      movq mm4, [edi+ebx+48]
3620      movq mm7, [esi+ebx+56]
3621      paddb mm4, mm5
3622      movq mm6, [edi+ebx+56]
3623      movq [edi+ebx+48], mm4
3624      add ebx, 64
3625      paddb mm6, mm7
3626      cmp ebx, ecx
3627      movq [edi+ebx-8], mm6 /* (+56)movq does not affect flags; */
3628                                     /* -8 to offset add ebx */
3629      jb duploop
3630
3631      cmp edx, 0                     /* Test for bytes over mult of 64 */
3632      jz dupend
3633
3634
3635      /* 2 lines added by lcreeve@netins.net */
3636      /* (mail 11 Jul 98 in png-implement list) */
3637      cmp edx, 8 /*test for less than 8 bytes */
3638      jb duplt8
3639
3640
3641      add ecx, edx
3642      and edx, 0x00000007           /* calc bytes over mult of 8 */
3643      sub ecx, edx                  /* drop over bytes from length */
3644      jz duplt8
3645      /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
3646duplpA:
3647      movq mm1, [esi+ebx]
3648      movq mm0, [edi+ebx]
3649      add ebx, 8
3650      paddb mm0, mm1
3651      cmp ebx, ecx
3652      movq [edi+ebx-8], mm0 /* movq does not affect flags; -8 to offset add ebx */
3653      jb duplpA
3654      cmp edx, 0            /* Test for bytes over mult of 8 */
3655      jz dupend
3656duplt8:
3657      xor eax, eax
3658      add ecx, edx          /* move over byte count into counter */
3659      /* Loop using x86 registers to update remaining bytes */
3660duplp2:
3661      mov al, [edi + ebx]
3662      add al, [esi + ebx]
3663      inc ebx
3664      cmp ebx, ecx
3665      mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
3666      jb duplp2
3667dupend:
3668      /* Conversion of filtered row completed */
3669      emms          /* End MMX instructions; prep for possible FP instrs. */
3670   } /* end _asm block */
3671}
3672
3673
3674/* Optimized png_read_filter_row routines */
3675void /* PRIVATE */
3676png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3677   row, png_bytep prev_row, int filter)
3678{
3679#ifdef PNG_DEBUG
3680   char filnm[10];
3681#endif
3682
3683   if (mmx_supported == 2) {
3684#if !defined(PNG_1_0_X)
3685       /* this should have happened in png_init_mmx_flags() already */
3686       png_warning(png_ptr, "asm_flags may not have been initialized");
3687#endif
3688       png_mmx_support();
3689   }
3690
3691#ifdef PNG_DEBUG
3692   png_debug(1, "in png_read_filter_row\n");
3693   switch (filter)
3694   {
3695      case 0: sprintf(filnm, "none");
3696         break;
3697#if !defined(PNG_1_0_X)
3698      case 1: sprintf(filnm, "sub-%s",
3699        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3700         break;
3701      case 2: sprintf(filnm, "up-%s",
3702        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3703         break;
3704      case 3: sprintf(filnm, "avg-%s",
3705        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3706         break;
3707      case 4: sprintf(filnm, "Paeth-%s",
3708        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3709         break;
3710#else
3711      case 1: sprintf(filnm, "sub");
3712         break;
3713      case 2: sprintf(filnm, "up");
3714         break;
3715      case 3: sprintf(filnm, "avg");
3716         break;
3717      case 4: sprintf(filnm, "Paeth");
3718         break;
3719#endif
3720      default: sprintf(filnm, "unknw");
3721         break;
3722   }
3723   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3724   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3725      (int)((row_info->pixel_depth + 7) >> 3));
3726   png_debug1(0,"len=%8d, ", row_info->rowbytes);
3727#endif /* PNG_DEBUG */
3728
3729   switch (filter)
3730   {
3731      case PNG_FILTER_VALUE_NONE:
3732         break;
3733
3734      case PNG_FILTER_VALUE_SUB:
3735      {
3736#if !defined(PNG_1_0_X)
3737         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3738             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3739             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3740#else
3741         if (mmx_supported)
3742#endif
3743         {
3744            png_read_filter_row_mmx_sub(row_info, row);
3745         }
3746         else
3747         {
3748            png_uint_32 i;
3749            png_uint_32 istop = row_info->rowbytes;
3750            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3751            png_bytep rp = row + bpp;
3752            png_bytep lp = row;
3753
3754            for (i = bpp; i < istop; i++)
3755            {
3756               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3757               rp++;
3758            }
3759         }
3760         break;
3761      }
3762
3763      case PNG_FILTER_VALUE_UP:
3764      {
3765#if !defined(PNG_1_0_X)
3766         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3767             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3768             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3769#else
3770         if (mmx_supported)
3771#endif
3772         {
3773            png_read_filter_row_mmx_up(row_info, row, prev_row);
3774         }
3775         else
3776         {
3777            png_uint_32 i;
3778            png_uint_32 istop = row_info->rowbytes;
3779            png_bytep rp = row;
3780            png_bytep pp = prev_row;
3781
3782            for (i = 0; i < istop; ++i)
3783            {
3784               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3785               rp++;
3786            }
3787         }
3788         break;
3789      }
3790
3791      case PNG_FILTER_VALUE_AVG:
3792      {
3793#if !defined(PNG_1_0_X)
3794         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3795             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3796             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3797#else
3798         if (mmx_supported)
3799#endif
3800         {
3801            png_read_filter_row_mmx_avg(row_info, row, prev_row);
3802         }
3803         else
3804         {
3805            png_uint_32 i;
3806            png_bytep rp = row;
3807            png_bytep pp = prev_row;
3808            png_bytep lp = row;
3809            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3810            png_uint_32 istop = row_info->rowbytes - bpp;
3811
3812            for (i = 0; i < bpp; i++)
3813            {
3814               *rp = (png_byte)(((int)(*rp) +
3815                  ((int)(*pp++) >> 1)) & 0xff);
3816               rp++;
3817            }
3818
3819            for (i = 0; i < istop; i++)
3820            {
3821               *rp = (png_byte)(((int)(*rp) +
3822                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3823               rp++;
3824            }
3825         }
3826         break;
3827      }
3828
3829      case PNG_FILTER_VALUE_PAETH:
3830      {
3831#if !defined(PNG_1_0_X)
3832         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3833             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3834             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3835#else
3836         if (mmx_supported)
3837#endif
3838         {
3839            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3840         }
3841         else
3842         {
3843            png_uint_32 i;
3844            png_bytep rp = row;
3845            png_bytep pp = prev_row;
3846            png_bytep lp = row;
3847            png_bytep cp = prev_row;
3848            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3849            png_uint_32 istop=row_info->rowbytes - bpp;
3850
3851            for (i = 0; i < bpp; i++)
3852            {
3853               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3854               rp++;
3855            }
3856
3857            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
3858            {
3859               int a, b, c, pa, pb, pc, p;
3860
3861               a = *lp++;
3862               b = *pp++;
3863               c = *cp++;
3864
3865               p = b - c;
3866               pc = a - c;
3867
3868#ifdef PNG_USE_ABS
3869               pa = abs(p);
3870               pb = abs(pc);
3871               pc = abs(p + pc);
3872#else
3873               pa = p < 0 ? -p : p;
3874               pb = pc < 0 ? -pc : pc;
3875               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3876#endif
3877
3878               /*
3879                  if (pa <= pb && pa <= pc)
3880                     p = a;
3881                  else if (pb <= pc)
3882                     p = b;
3883                  else
3884                     p = c;
3885                */
3886
3887               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3888
3889               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3890               rp++;
3891            }
3892         }
3893         break;
3894      }
3895
3896      default:
3897         png_warning(png_ptr, "Ignoring bad row filter type");
3898         *row=0;
3899         break;
3900   }
3901}
3902
3903#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */
3904