1/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
5 * libpng 1.0.8 - July 24, 2000
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 */
14
15/* $Id: pngvcrd.c 14574 2005-10-29 16:27:43Z bonefish $ */
16
17#define PNG_INTERNAL
18#include "png.h"
19
20#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
21
22/*
23   One of these might need to be defined.
24#define DISABLE_PNGVCRD_COMBINE
25#define DISABLE_PNGVCRD_INTERLACE
26*/
27
/* Cached MMX capability flag.  It starts at 2, which png_combine_row()
   treats as "not probed yet": on seeing 2 it replaces the value with the
   result of mmxsupport() (1 = MMX available, 0 = not available). */
static int mmx_supported=2;

/* Forward declaration of png_read_filter_row_c(); its definition is not
   part of this section of the file. */
void /* PRIVATE */
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
   png_bytep row, png_bytep prev_row, int filter);
33
34static int mmxsupport()
35{
36  int mmx_supported_local = 0;
37  _asm {
38    push ebx          //CPUID will trash these
39    push ecx
40    push edx
41    pushfd            //Save Eflag to stack
42    pop eax           //Get Eflag from stack into eax
43    mov ecx, eax      //Make another copy of Eflag in ecx
44    xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
45    push eax          //Save modified Eflag back to stack
46
47    popfd             //Restored modified value back to Eflag reg
48    pushfd            //Save Eflag to stack
49    pop eax           //Get Eflag from stack
50    xor eax, ecx      //Compare the new Eflag with the original Eflag
51    jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
52                      //skip following instructions and jump to
53                      //NOT_SUPPORTED label
54
55    xor eax, eax      //Set eax to zero
56
57    _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
58    _asm _emit 0xa2
59
60    cmp eax, 1        //make sure eax return non-zero value
61    jl NOT_SUPPORTED  //If eax is zero, mmx not supported
62
63    xor eax, eax      //set eax to zero
64    inc eax           //Now increment eax to 1.  This instruction is
65                      //faster than the instruction "mov eax, 1"
66
67    _asm _emit 0x0f   //CPUID instruction
68    _asm _emit 0xa2
69
70    and edx, 0x00800000  //mask out all bits but mmx bit(24)
71    cmp edx, 0        // 0 = mmx not supported
72    jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
73
74    mov  mmx_supported_local, 1  //set return value to 1
75
76NOT_SUPPORTED:
77    mov  eax, mmx_supported_local  //move return value to eax
78    pop edx          //CPUID trashed these
79    pop ecx
80    pop ebx
81  }
82
83  //mmx_supported_local=0; // test code for force don't support MMX
84  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
85
86  return mmx_supported_local;
87}
88
89/* Combines the row recently read in with the previous row.
90   This routine takes care of alpha and transparency if requested.
91   This routine also handles the two methods of progressive display
92   of interlaced images, depending on the mask value.
93   The mask value describes which pixels are to be combined with
94   the row.  The pattern always repeats every 8 pixels, so just 8
95   bits are needed.  A one indicates the pixel is to be combined; a
96   zero indicates the pixel is to be skipped.  This is in addition
97   to any alpha or transparency value associated with the pixel.  If
98   you want all pixels to be combined, pass 0xff (255) in mask.  */
99
100/* Use this routine for x86 platform - uses faster MMX routine if machine
101   supports MMX */
102
103void /* PRIVATE */
104png_combine_row(png_structp png_ptr, png_bytep row, int mask)
105{
106#ifdef PNG_USE_LOCAL_ARRAYS
107   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
108#endif
109#ifdef DISABLE_PNGVCRD_COMBINE
110   int save_mmx_supported = mmx_supported;
111#endif
112
113   png_debug(1,"in png_combine_row_asm\n");
114
115#ifdef DISABLE_PNGVCRD_COMBINE
116   if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
117       mmx_supported = 0;
118   else
119#endif
120       if (mmx_supported == 2)
121           mmx_supported = mmxsupport();
122
123   if (mask == 0xff)
124   {
125      png_memcpy(row, png_ptr->row_buf + 1,
126       (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
127   }
128   /* GRR:  add "else if (mask == 0)" case?
129    *       or does png_combine_row() not even get called in that case? */
130   else
131   {
132      switch (png_ptr->row_info.pixel_depth)
133      {
134         case 1:
135         {
136            png_bytep sp;
137            png_bytep dp;
138            int s_inc, s_start, s_end;
139            int m;
140            int shift;
141            png_uint_32 i;
142
143            sp = png_ptr->row_buf + 1;
144            dp = row;
145            m = 0x80;
146#if defined(PNG_READ_PACKSWAP_SUPPORTED)
147            if (png_ptr->transformations & PNG_PACKSWAP)
148            {
149                s_start = 0;
150                s_end = 7;
151                s_inc = 1;
152            }
153            else
154#endif
155            {
156                s_start = 7;
157                s_end = 0;
158                s_inc = -1;
159            }
160
161            shift = s_start;
162
163            for (i = 0; i < png_ptr->width; i++)
164            {
165               if (m & mask)
166               {
167                  int value;
168
169                  value = (*sp >> shift) & 0x1;
170                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
171                  *dp |= (png_byte)(value << shift);
172               }
173
174               if (shift == s_end)
175               {
176                  shift = s_start;
177                  sp++;
178                  dp++;
179               }
180               else
181                  shift += s_inc;
182
183               if (m == 1)
184                  m = 0x80;
185               else
186                  m >>= 1;
187            }
188            break;
189         }
190
191         case 2:
192         {
193            png_bytep sp;
194            png_bytep dp;
195            int s_start, s_end, s_inc;
196            int m;
197            int shift;
198            png_uint_32 i;
199            int value;
200
201            sp = png_ptr->row_buf + 1;
202            dp = row;
203            m = 0x80;
204#if defined(PNG_READ_PACKSWAP_SUPPORTED)
205            if (png_ptr->transformations & PNG_PACKSWAP)
206            {
207               s_start = 0;
208               s_end = 6;
209               s_inc = 2;
210            }
211            else
212#endif
213            {
214               s_start = 6;
215               s_end = 0;
216               s_inc = -2;
217            }
218
219            shift = s_start;
220
221            for (i = 0; i < png_ptr->width; i++)
222            {
223               if (m & mask)
224               {
225                  value = (*sp >> shift) & 0x3;
226                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
227                  *dp |= (png_byte)(value << shift);
228               }
229
230               if (shift == s_end)
231               {
232                  shift = s_start;
233                  sp++;
234                  dp++;
235               }
236               else
237                  shift += s_inc;
238               if (m == 1)
239                  m = 0x80;
240               else
241                  m >>= 1;
242            }
243            break;
244         }
245
246         case 4:
247         {
248            png_bytep sp;
249            png_bytep dp;
250            int s_start, s_end, s_inc;
251            int m;
252            int shift;
253            png_uint_32 i;
254            int value;
255
256            sp = png_ptr->row_buf + 1;
257            dp = row;
258            m = 0x80;
259#if defined(PNG_READ_PACKSWAP_SUPPORTED)
260            if (png_ptr->transformations & PNG_PACKSWAP)
261            {
262               s_start = 0;
263               s_end = 4;
264               s_inc = 4;
265            }
266            else
267#endif
268            {
269               s_start = 4;
270               s_end = 0;
271               s_inc = -4;
272            }
273            shift = s_start;
274
275            for (i = 0; i < png_ptr->width; i++)
276            {
277               if (m & mask)
278               {
279                  value = (*sp >> shift) & 0xf;
280                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
281                  *dp |= (png_byte)(value << shift);
282               }
283
284               if (shift == s_end)
285               {
286                  shift = s_start;
287                  sp++;
288                  dp++;
289               }
290               else
291                  shift += s_inc;
292               if (m == 1)
293                  m = 0x80;
294               else
295                  m >>= 1;
296            }
297            break;
298         }
299
300         case 8:
301         {
302            png_bytep srcptr;
303            png_bytep dstptr;
304            png_uint_32 len;
305            int m;
306            int diff, unmask;
307
308            __int64 mask0=0x0102040810204080;
309
310            if (mmx_supported)
311            {
312               srcptr = png_ptr->row_buf + 1;
313               dstptr = row;
314               m = 0x80;
315               unmask = ~mask;
316               len  = png_ptr->width &~7;  //reduce to multiple of 8
317               diff = png_ptr->width & 7;  //amount lost
318
319               _asm
320               {
321                  movd       mm7, unmask   //load bit pattern
322                  psubb      mm6,mm6       //zero mm6
323                  punpcklbw  mm7,mm7
324                  punpcklwd  mm7,mm7
325                  punpckldq  mm7,mm7       //fill register with 8 masks
326
327                  movq       mm0,mask0
328
329                  pand       mm0,mm7       //nonzero if keep byte
330                  pcmpeqb    mm0,mm6       //zeros->1s, v versa
331
332                  mov        ecx,len       //load length of line (pixels)
333                  mov        esi,srcptr    //load source
334                  mov        ebx,dstptr    //load dest
335                  cmp        ecx,0         //lcr
336                  je         mainloop8end
337
338mainloop8:
339                  movq       mm4,[esi]
340                  pand       mm4,mm0
341                  movq       mm6,mm0
342                  pandn      mm6,[ebx]
343                  por        mm4,mm6
344                  movq       [ebx],mm4
345
346                  add        esi,8         //inc by 8 bytes processed
347                  add        ebx,8
348                  sub        ecx,8         //dec by 8 pixels processed
349
350                  ja         mainloop8
351mainloop8end:
352
353                  mov        ecx,diff
354                  cmp        ecx,0
355                  jz         end8
356
357                  mov        edx,mask
358                  sal        edx,24        //make low byte the high byte
359
360secondloop8:
361                  sal        edx,1         //move high bit to CF
362                  jnc        skip8         //if CF = 0
363                  mov        al,[esi]
364                  mov        [ebx],al
365skip8:
366                  inc        esi
367                  inc        ebx
368
369                  dec        ecx
370                  jnz        secondloop8
371end8:
372                  emms
373               }
374            }
375            else /* mmx not supported - use modified C routine */
376            {
377               register unsigned int incr1, initial_val, final_val;
378               png_size_t pixel_bytes;
379               png_uint_32 i;
380               register int disp = png_pass_inc[png_ptr->pass];
381               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
382
383               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
384               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
385                  pixel_bytes;
386               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
387               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
388               final_val = png_ptr->width*pixel_bytes;
389               incr1 = (disp)*pixel_bytes;
390               for (i = initial_val; i < final_val; i += incr1)
391               {
392                  png_memcpy(dstptr, srcptr, pixel_bytes);
393                  srcptr += incr1;
394                  dstptr += incr1;
395               }
396            } /* end of else */
397
398            break;
399         }       // end 8 bpp
400
401         case 16:
402         {
403            png_bytep srcptr;
404            png_bytep dstptr;
405            png_uint_32 len;
406            int unmask, diff;
407            __int64 mask1=0x0101020204040808,
408                    mask0=0x1010202040408080;
409
410            if (mmx_supported)
411            {
412               srcptr = png_ptr->row_buf + 1;
413               dstptr = row;
414
415               unmask = ~mask;
416               len     = (png_ptr->width)&~7;
417               diff = (png_ptr->width)&7;
418               _asm
419               {
420                  movd       mm7, unmask       //load bit pattern
421                  psubb      mm6,mm6           //zero mm6
422                  punpcklbw  mm7,mm7
423                  punpcklwd  mm7,mm7
424                  punpckldq  mm7,mm7           //fill register with 8 masks
425
426                  movq       mm0,mask0
427                  movq       mm1,mask1
428
429                  pand       mm0,mm7
430                  pand       mm1,mm7
431
432                  pcmpeqb    mm0,mm6
433                  pcmpeqb    mm1,mm6
434
435                  mov        ecx,len           //load length of line
436                  mov        esi,srcptr        //load source
437                  mov        ebx,dstptr        //load dest
438                  cmp        ecx,0             //lcr
439                  jz         mainloop16end
440
441mainloop16:
442                  movq       mm4,[esi]
443                  pand       mm4,mm0
444                  movq       mm6,mm0
445                  movq       mm7,[ebx]
446                  pandn      mm6,mm7
447                  por        mm4,mm6
448                  movq       [ebx],mm4
449
450                  movq       mm5,[esi+8]
451                  pand       mm5,mm1
452                  movq       mm7,mm1
453                  movq       mm6,[ebx+8]
454                  pandn      mm7,mm6
455                  por        mm5,mm7
456                  movq       [ebx+8],mm5
457
458                  add        esi,16            //inc by 16 bytes processed
459                  add        ebx,16
460                  sub        ecx,8             //dec by 8 pixels processed
461
462                  ja         mainloop16
463
464mainloop16end:
465                  mov        ecx,diff
466                  cmp        ecx,0
467                  jz         end16
468
469                  mov        edx,mask
470                  sal        edx,24            //make low byte the high byte
471secondloop16:
472                  sal        edx,1             //move high bit to CF
473                  jnc        skip16            //if CF = 0
474                  mov        ax,[esi]
475                  mov        [ebx],ax
476skip16:
477                  add        esi,2
478                  add        ebx,2
479
480                  dec        ecx
481                  jnz        secondloop16
482end16:
483                  emms
484               }
485            }
486            else /* mmx not supported - use modified C routine */
487            {
488               register unsigned int incr1, initial_val, final_val;
489               png_size_t pixel_bytes;
490               png_uint_32 i;
491               register int disp = png_pass_inc[png_ptr->pass];
492               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
493
494               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
495               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
496                  pixel_bytes;
497               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
498               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
499               final_val = png_ptr->width*pixel_bytes;
500               incr1 = (disp)*pixel_bytes;
501               for (i = initial_val; i < final_val; i += incr1)
502               {
503                  png_memcpy(dstptr, srcptr, pixel_bytes);
504                  srcptr += incr1;
505                  dstptr += incr1;
506               }
507            } /* end of else */
508
509            break;
510         }       // end 16 bpp
511
512         case 24:
513         {
514            png_bytep srcptr;
515            png_bytep dstptr;
516            png_uint_32 len;
517            int unmask, diff;
518
519            __int64 mask2=0x0101010202020404,  //24bpp
520                    mask1=0x0408080810101020,
521                    mask0=0x2020404040808080;
522
523            srcptr = png_ptr->row_buf + 1;
524            dstptr = row;
525
526            unmask = ~mask;
527            len     = (png_ptr->width)&~7;
528            diff = (png_ptr->width)&7;
529
530            if (mmx_supported)
531            {
532               _asm
533               {
534                  movd       mm7, unmask       //load bit pattern
535                  psubb      mm6,mm6           //zero mm6
536                  punpcklbw  mm7,mm7
537                  punpcklwd  mm7,mm7
538                  punpckldq  mm7,mm7           //fill register with 8 masks
539
540                  movq       mm0,mask0
541                  movq       mm1,mask1
542                  movq       mm2,mask2
543
544                  pand       mm0,mm7
545                  pand       mm1,mm7
546                  pand       mm2,mm7
547
548                  pcmpeqb    mm0,mm6
549                  pcmpeqb    mm1,mm6
550                  pcmpeqb    mm2,mm6
551
552                  mov        ecx,len           //load length of line
553                  mov        esi,srcptr        //load source
554                  mov        ebx,dstptr        //load dest
555                  cmp        ecx,0
556                  jz         mainloop24end
557
558mainloop24:
559                  movq       mm4,[esi]
560                  pand       mm4,mm0
561                  movq       mm6,mm0
562                  movq       mm7,[ebx]
563                  pandn      mm6,mm7
564                  por        mm4,mm6
565                  movq       [ebx],mm4
566
567
568                  movq       mm5,[esi+8]
569                  pand       mm5,mm1
570                  movq       mm7,mm1
571                  movq       mm6,[ebx+8]
572                  pandn      mm7,mm6
573                  por        mm5,mm7
574                  movq       [ebx+8],mm5
575
576                  movq       mm6,[esi+16]
577                  pand       mm6,mm2
578                  movq       mm4,mm2
579                  movq       mm7,[ebx+16]
580                  pandn      mm4,mm7
581                  por        mm6,mm4
582                  movq       [ebx+16],mm6
583
584                  add        esi,24            //inc by 24 bytes processed
585                  add        ebx,24
586                  sub        ecx,8             //dec by 8 pixels processed
587
588                  ja         mainloop24
589
590mainloop24end:
591                  mov        ecx,diff
592                  cmp        ecx,0
593                  jz         end24
594
595                  mov        edx,mask
596                  sal        edx,24            //make low byte the high byte
597secondloop24:
598                  sal        edx,1             //move high bit to CF
599                  jnc        skip24            //if CF = 0
600                  mov        ax,[esi]
601                  mov        [ebx],ax
602                  xor        eax,eax
603                  mov        al,[esi+2]
604                  mov        [ebx+2],al
605skip24:
606                  add        esi,3
607                  add        ebx,3
608
609                  dec        ecx
610                  jnz        secondloop24
611
612end24:
613                  emms
614               }
615            }
616            else /* mmx not supported - use modified C routine */
617            {
618               register unsigned int incr1, initial_val, final_val;
619               png_size_t pixel_bytes;
620               png_uint_32 i;
621               register int disp = png_pass_inc[png_ptr->pass];
622               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
623
624               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
625               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
626                  pixel_bytes;
627               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
628               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
629               final_val = png_ptr->width*pixel_bytes;
630               incr1 = (disp)*pixel_bytes;
631               for (i = initial_val; i < final_val; i += incr1)
632               {
633                  png_memcpy(dstptr, srcptr, pixel_bytes);
634                  srcptr += incr1;
635                  dstptr += incr1;
636               }
637            } /* end of else */
638
639            break;
640         }       // end 24 bpp
641
642         case 32:
643         {
644            png_bytep srcptr;
645            png_bytep dstptr;
646            png_uint_32 len;
647            int unmask, diff;
648
649            __int64 mask3=0x0101010102020202,  //32bpp
650                    mask2=0x0404040408080808,
651                    mask1=0x1010101020202020,
652                    mask0=0x4040404080808080;
653
654            srcptr = png_ptr->row_buf + 1;
655            dstptr = row;
656
657            unmask = ~mask;
658            len     = (png_ptr->width)&~7;
659            diff = (png_ptr->width)&7;
660
661            if (mmx_supported)
662            {
663               _asm
664               {
665                  movd       mm7, unmask       //load bit pattern
666                  psubb      mm6,mm6           //zero mm6
667                  punpcklbw  mm7,mm7
668                  punpcklwd  mm7,mm7
669                  punpckldq  mm7,mm7           //fill register with 8 masks
670
671                  movq       mm0,mask0
672                  movq       mm1,mask1
673                  movq       mm2,mask2
674                  movq       mm3,mask3
675
676                  pand       mm0,mm7
677                  pand       mm1,mm7
678                  pand       mm2,mm7
679                  pand       mm3,mm7
680
681                  pcmpeqb    mm0,mm6
682                  pcmpeqb    mm1,mm6
683                  pcmpeqb    mm2,mm6
684                  pcmpeqb    mm3,mm6
685
686                  mov        ecx,len           //load length of line
687                  mov        esi,srcptr        //load source
688                  mov        ebx,dstptr        //load dest
689
690                  cmp        ecx,0             //lcr
691                  jz         mainloop32end
692
693mainloop32:
694                  movq       mm4,[esi]
695                  pand       mm4,mm0
696                  movq       mm6,mm0
697                  movq       mm7,[ebx]
698                  pandn      mm6,mm7
699                  por        mm4,mm6
700                  movq       [ebx],mm4
701
702                  movq       mm5,[esi+8]
703                  pand       mm5,mm1
704                  movq       mm7,mm1
705                  movq       mm6,[ebx+8]
706                  pandn      mm7,mm6
707                  por        mm5,mm7
708                  movq       [ebx+8],mm5
709
710                  movq       mm6,[esi+16]
711                  pand       mm6,mm2
712                  movq       mm4,mm2
713                  movq       mm7,[ebx+16]
714                  pandn      mm4,mm7
715                  por        mm6,mm4
716                  movq       [ebx+16],mm6
717
718                  movq       mm7,[esi+24]
719                  pand       mm7,mm3
720                  movq       mm5,mm3
721                  movq       mm4,[ebx+24]
722                  pandn      mm5,mm4
723                  por        mm7,mm5
724                  movq       [ebx+24],mm7
725
726                  add        esi,32            //inc by 32 bytes processed
727                  add        ebx,32
728                  sub        ecx,8             //dec by 8 pixels processed
729
730                  ja         mainloop32
731
732mainloop32end:
733                  mov        ecx,diff
734                  cmp        ecx,0
735                  jz         end32
736
737                  mov        edx,mask
738                  sal        edx,24            //make low byte the high byte
739secondloop32:
740                  sal        edx,1             //move high bit to CF
741                  jnc        skip32            //if CF = 0
742                  mov        eax,[esi]
743                  mov        [ebx],eax
744skip32:
745                  add        esi,4
746                  add        ebx,4
747
748                  dec        ecx
749                  jnz        secondloop32
750
751end32:
752                  emms
753               }
754            }
755            else /* mmx _not supported - Use modified C routine */
756            {
757               register unsigned int incr1, initial_val, final_val;
758               png_size_t pixel_bytes;
759               png_uint_32 i;
760               register int disp = png_pass_inc[png_ptr->pass];
761               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
762
763               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
764               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
765                  pixel_bytes;
766               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
767               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
768               final_val = png_ptr->width*pixel_bytes;
769               incr1 = (disp)*pixel_bytes;
770               for (i = initial_val; i < final_val; i += incr1)
771               {
772                  png_memcpy(dstptr, srcptr, pixel_bytes);
773                  srcptr += incr1;
774                  dstptr += incr1;
775               }
776            } /* end of else */
777
778            break;
779         }       // end 32 bpp
780
781         case 48:
782         {
783            png_bytep srcptr;
784            png_bytep dstptr;
785            png_uint_32 len;
786            int unmask, diff;
787
788            __int64 mask5=0x0101010101010202,
789                    mask4=0x0202020204040404,
790                    mask3=0x0404080808080808,
791                    mask2=0x1010101010102020,
792                    mask1=0x2020202040404040,
793                    mask0=0x4040808080808080;
794
795            if (mmx_supported)
796            {
797               srcptr = png_ptr->row_buf + 1;
798               dstptr = row;
799
800               unmask = ~mask;
801               len     = (png_ptr->width)&~7;
802               diff = (png_ptr->width)&7;
803               _asm
804               {
805                  movd       mm7, unmask       //load bit pattern
806                  psubb      mm6,mm6           //zero mm6
807                  punpcklbw  mm7,mm7
808                  punpcklwd  mm7,mm7
809                  punpckldq  mm7,mm7           //fill register with 8 masks
810
811                  movq       mm0,mask0
812                  movq       mm1,mask1
813                  movq       mm2,mask2
814                  movq       mm3,mask3
815                  movq       mm4,mask4
816                  movq       mm5,mask5
817
818                  pand       mm0,mm7
819                  pand       mm1,mm7
820                  pand       mm2,mm7
821                  pand       mm3,mm7
822                  pand       mm4,mm7
823                  pand       mm5,mm7
824
825                  pcmpeqb    mm0,mm6
826                  pcmpeqb    mm1,mm6
827                  pcmpeqb    mm2,mm6
828                  pcmpeqb    mm3,mm6
829                  pcmpeqb    mm4,mm6
830                  pcmpeqb    mm5,mm6
831
832                  mov        ecx,len           //load length of line
833                  mov        esi,srcptr        //load source
834                  mov        ebx,dstptr        //load dest
835
836                  cmp        ecx,0
837                  jz         mainloop48end
838
839mainloop48:
840                  movq       mm7,[esi]
841                  pand       mm7,mm0
842                  movq       mm6,mm0
843                  pandn      mm6,[ebx]
844                  por        mm7,mm6
845                  movq       [ebx],mm7
846
847                  movq       mm6,[esi+8]
848                  pand       mm6,mm1
849                  movq       mm7,mm1
850                  pandn      mm7,[ebx+8]
851                  por        mm6,mm7
852                  movq       [ebx+8],mm6
853
854                  movq       mm6,[esi+16]
855                  pand       mm6,mm2
856                  movq       mm7,mm2
857                  pandn      mm7,[ebx+16]
858                  por        mm6,mm7
859                  movq       [ebx+16],mm6
860
861                  movq       mm7,[esi+24]
862                  pand       mm7,mm3
863                  movq       mm6,mm3
864                  pandn      mm6,[ebx+24]
865                  por        mm7,mm6
866                  movq       [ebx+24],mm7
867
868                  movq       mm6,[esi+32]
869                  pand       mm6,mm4
870                  movq       mm7,mm4
871                  pandn      mm7,[ebx+32]
872                  por        mm6,mm7
873                  movq       [ebx+32],mm6
874
875                  movq       mm7,[esi+40]
876                  pand       mm7,mm5
877                  movq       mm6,mm5
878                  pandn      mm6,[ebx+40]
879                  por        mm7,mm6
880                  movq       [ebx+40],mm7
881
882                  add        esi,48            //inc by 32 bytes processed
883                  add        ebx,48
884                  sub        ecx,8             //dec by 8 pixels processed
885
886                  ja         mainloop48
887mainloop48end:
888
889                  mov        ecx,diff
890                  cmp        ecx,0
891                  jz         end48
892
893                  mov        edx,mask
894                  sal        edx,24            //make low byte the high byte
895
896secondloop48:
897                  sal        edx,1             //move high bit to CF
898                  jnc        skip48            //if CF = 0
899                  mov        eax,[esi]
900                  mov        [ebx],eax
901skip48:
902                  add        esi,4
903                  add        ebx,4
904
905                  dec        ecx
906                  jnz        secondloop48
907
908end48:
909                  emms
910               }
911            }
912            else /* mmx _not supported - Use modified C routine */
913            {
914               register unsigned int incr1, initial_val, final_val;
915               png_size_t pixel_bytes;
916               png_uint_32 i;
917               register int disp = png_pass_inc[png_ptr->pass];
918               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
919
920               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
921               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
922                  pixel_bytes;
923               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
924               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
925               final_val = png_ptr->width*pixel_bytes;
926               incr1 = (disp)*pixel_bytes;
927               for (i = initial_val; i < final_val; i += incr1)
928               {
929                  png_memcpy(dstptr, srcptr, pixel_bytes);
930                  srcptr += incr1;
931                  dstptr += incr1;
932               }
933            } /* end of else */
934
935            break;
936         }       // end 48 bpp
937
938         default:
939         {
940            png_bytep sptr;
941            png_bytep dp;
942            png_size_t pixel_bytes;
943            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
944            unsigned int i;
945            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
946            register unsigned int incr1, initial_val, final_val;
947
948            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950               pixel_bytes;
951            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
952            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953            final_val = png_ptr->width*pixel_bytes;
954            incr1 = (disp)*pixel_bytes;
955            for (i = initial_val; i < final_val; i += incr1)
956            {
957               png_memcpy(dp, sptr, pixel_bytes);
958               sptr += incr1;
959               dp += incr1;
960            }
961            break;
962         }
963      } /* end switch (png_ptr->row_info.pixel_depth) */
964   } /* end if (non-trivial mask) */
965
966#ifdef DISABLE_PNGVCRD_COMBINE
967   mmx_supported = save_mmx_supported;
968#endif
969
970} /* end png_combine_row() */
971
972
973#if defined(PNG_READ_INTERLACING_SUPPORTED)
974
975void /* PRIVATE */
976png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
977   png_uint_32 transformations)
978{
979#ifdef PNG_USE_LOCAL_ARRAYS
980   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
981#endif
982#ifdef DISABLE_PNGVCRD_INTERLACE
983   int save_mmx_supported = mmx_supported;
984#endif
985
986   png_debug(1,"in png_do_read_interlace\n");
987
988#ifdef DISABLE_PNGVCRD_INTERLACE
989   /* In libpng versions 1.0.3a through 1.0.4d,
990    * a sign error in the post-MMX cleanup code for each pixel_depth resulted
991    * in bad pixels at the beginning of some rows of some images, and also
992    * (due to out-of-range memory reads and writes) caused heap corruption
993    * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e,
994    * and the code appears to work completely correctly, so it is enabled
995    * by default.
996    */
997   if (1)  /* all passes caused a heap problem in the old code */
998      mmx_supported = 0;
999   else
1000#endif
1001       if (mmx_supported == 2)
1002           mmx_supported = mmxsupport();
1003
1004   if (row != NULL && row_info != NULL)
1005   {
1006      png_uint_32 final_width;
1007
1008      final_width = row_info->width * png_pass_inc[pass];
1009
1010      switch (row_info->pixel_depth)
1011      {
1012         case 1:
1013         {
1014            png_bytep sp, dp;
1015            int sshift, dshift;
1016            int s_start, s_end, s_inc;
1017            png_byte v;
1018            png_uint_32 i;
1019            int j;
1020
1021            sp = row + (png_size_t)((row_info->width - 1) >> 3);
1022            dp = row + (png_size_t)((final_width - 1) >> 3);
1023#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1024            if (transformations & PNG_PACKSWAP)
1025            {
1026               sshift = (int)((row_info->width + 7) & 7);
1027               dshift = (int)((final_width + 7) & 7);
1028               s_start = 7;
1029               s_end = 0;
1030               s_inc = -1;
1031            }
1032            else
1033#endif
1034            {
1035               sshift = 7 - (int)((row_info->width + 7) & 7);
1036               dshift = 7 - (int)((final_width + 7) & 7);
1037               s_start = 0;
1038               s_end = 7;
1039               s_inc = 1;
1040            }
1041
1042            for (i = row_info->width; i; i--)
1043            {
1044               v = (png_byte)((*sp >> sshift) & 0x1);
1045               for (j = 0; j < png_pass_inc[pass]; j++)
1046               {
1047                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1048                  *dp |= (png_byte)(v << dshift);
1049                  if (dshift == s_end)
1050                  {
1051                     dshift = s_start;
1052                     dp--;
1053                  }
1054                  else
1055                     dshift += s_inc;
1056               }
1057               if (sshift == s_end)
1058               {
1059                  sshift = s_start;
1060                  sp--;
1061               }
1062               else
1063                  sshift += s_inc;
1064            }
1065            break;
1066         }
1067
1068         case 2:
1069         {
1070            png_bytep sp, dp;
1071            int sshift, dshift;
1072            int s_start, s_end, s_inc;
1073            png_uint_32 i;
1074
1075            sp = row + (png_size_t)((row_info->width - 1) >> 2);
1076            dp = row + (png_size_t)((final_width - 1) >> 2);
1077#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1078            if (transformations & PNG_PACKSWAP)
1079            {
1080               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1081               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1082               s_start = 6;
1083               s_end = 0;
1084               s_inc = -2;
1085            }
1086            else
1087#endif
1088            {
1089               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1090               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1091               s_start = 0;
1092               s_end = 6;
1093               s_inc = 2;
1094            }
1095
1096            for (i = row_info->width; i; i--)
1097            {
1098               png_byte v;
1099               int j;
1100
1101               v = (png_byte)((*sp >> sshift) & 0x3);
1102               for (j = 0; j < png_pass_inc[pass]; j++)
1103               {
1104                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1105                  *dp |= (png_byte)(v << dshift);
1106                  if (dshift == s_end)
1107                  {
1108                     dshift = s_start;
1109                     dp--;
1110                  }
1111                  else
1112                     dshift += s_inc;
1113               }
1114               if (sshift == s_end)
1115               {
1116                  sshift = s_start;
1117                  sp--;
1118               }
1119               else
1120                  sshift += s_inc;
1121            }
1122            break;
1123         }
1124
1125         case 4:
1126         {
1127            png_bytep sp, dp;
1128            int sshift, dshift;
1129            int s_start, s_end, s_inc;
1130            png_uint_32 i;
1131
1132            sp = row + (png_size_t)((row_info->width - 1) >> 1);
1133            dp = row + (png_size_t)((final_width - 1) >> 1);
1134#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1135            if (transformations & PNG_PACKSWAP)
1136            {
1137               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1138               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1139               s_start = 4;
1140               s_end = 0;
1141               s_inc = -4;
1142            }
1143            else
1144#endif
1145            {
1146               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1147               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1148               s_start = 0;
1149               s_end = 4;
1150               s_inc = 4;
1151            }
1152
1153            for (i = row_info->width; i; i--)
1154            {
1155               png_byte v;
1156               int j;
1157
1158               v = (png_byte)((*sp >> sshift) & 0xf);
1159               for (j = 0; j < png_pass_inc[pass]; j++)
1160               {
1161                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1162                  *dp |= (png_byte)(v << dshift);
1163                  if (dshift == s_end)
1164                  {
1165                     dshift = s_start;
1166                     dp--;
1167                  }
1168                  else
1169                     dshift += s_inc;
1170               }
1171               if (sshift == s_end)
1172               {
1173                  sshift = s_start;
1174                  sp--;
1175               }
1176               else
1177                  sshift += s_inc;
1178            }
1179            break;
1180         }
1181
1182         default:         // This is the place where the routine is modified
1183         {
1184            __int64 const4 = 0x0000000000FFFFFF;
1185            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
1186            __int64 const6 = 0x00000000000000FF;
1187            png_bytep sptr, dp;
1188            png_uint_32 i;
1189            png_size_t pixel_bytes;
1190            int width = row_info->width;
1191
1192            pixel_bytes = (row_info->pixel_depth >> 3);
1193
1194            sptr = row + (width - 1) * pixel_bytes;
1195            dp = row + (final_width - 1) * pixel_bytes;
1196            // New code by Nirav Chhatrapati - Intel Corporation
1197            // sign fix by GRR
1198            // NOTE:  there is NO MMX code for 48-bit and 64-bit images
1199
1200            if (mmx_supported) // use MMX routine if machine supports it
1201            {
1202               if (pixel_bytes == 3)
1203               {
1204                  if (((pass == 0) || (pass == 1)) && width)
1205                  {
1206                     _asm
1207                     {
1208                        mov esi, sptr
1209                        mov edi, dp
1210                        mov ecx, width
1211                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
1212loop_pass0:
1213                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1214                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1215                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1216                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1217                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1218                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1219                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1220                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1221                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1222                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
1223                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
1224                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
1225                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
1226                        movq [edi+16] , mm4
1227                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
1228                        movq [edi+8] , mm3
1229                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
1230                        sub esi, 3
1231                        movq [edi], mm0
1232                        sub edi, 24
1233                        //sub esi, 3
1234                        dec ecx
1235                        jnz loop_pass0
1236                        EMMS
1237                     }
1238                  }
1239                  else if (((pass == 2) || (pass == 3)) && width)
1240                  {
1241                     _asm
1242                     {
1243                        mov esi, sptr
1244                        mov edi, dp
1245                        mov ecx, width
1246                        sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
1247loop_pass2:
1248                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1249                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1250                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1251                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1252                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1253                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1254                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1255                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1256                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1257                        movq [edi+4], mm0   ; move to memory
1258                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
1259                        movd [edi], mm0     ; move to memory
1260                        sub esi, 3
1261                        sub edi, 12
1262                        dec ecx
1263                        jnz loop_pass2
1264                        EMMS
1265                     }
1266                  }
1267                  else if (width) /* && ((pass == 4) || (pass == 5)) */
1268                  {
1269                     int width_mmx = ((width >> 1) << 1) - 8;
1270                     if (width_mmx < 0)
1271                         width_mmx = 0;
1272                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1273                     if (width_mmx)
1274                     {
1275                        _asm
1276                        {
1277                           mov esi, sptr
1278                           mov edi, dp
1279                           mov ecx, width_mmx
1280                           sub esi, 3
1281                           sub edi, 9
1282loop_pass4:
1283                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
1284                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
1285                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
1286                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
1287                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
1288                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
1289                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
1290                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
1291                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
1292                           movq [edi], mm0     ; move quad to memory
1293                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
1294                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
1295                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
1296                           movd [edi+8], mm6   ; move double to memory
1297                           sub esi, 6
1298                           sub edi, 12
1299                           sub ecx, 2
1300                           jnz loop_pass4
1301                           EMMS
1302                        }
1303                     }
1304
1305                     sptr -= width_mmx*3;
1306                     dp -= width_mmx*6;
1307                     for (i = width; i; i--)
1308                     {
1309                        png_byte v[8];
1310                        int j;
1311
1312                        png_memcpy(v, sptr, 3);
1313                        for (j = 0; j < png_pass_inc[pass]; j++)
1314                        {
1315                           png_memcpy(dp, v, 3);
1316                           dp -= 3;
1317                        }
1318                        sptr -= 3;
1319                     }
1320                  }
1321               } /* end of pixel_bytes == 3 */
1322
1323               else if (pixel_bytes == 1)
1324               {
1325                  if (((pass == 0) || (pass == 1)) && width)
1326                  {
1327                     int width_mmx = ((width >> 2) << 2);
1328                     width -= width_mmx;
1329                     if (width_mmx)
1330                     {
1331                        _asm
1332                        {
1333                           mov esi, sptr
1334                           mov edi, dp
1335                           mov ecx, width_mmx
1336                           sub edi, 31
1337                           sub esi, 3
1338loop1_pass0:
1339                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1340                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
1341                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1342                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1343                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1344                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
1345                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
1346                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
1347                           movq [edi], mm0     ; move to memory v3
1348                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
1349                           movq [edi+8], mm3   ; move to memory v2
1350                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
1351                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
1352                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
1353                           movq [edi+16], mm2  ; move to memory v1
1354                           movq [edi+24], mm4  ; move to memory v0
1355                           sub esi, 4
1356                           sub edi, 32
1357                           sub ecx, 4
1358                           jnz loop1_pass0
1359                           EMMS
1360                        }
1361                     }
1362
1363                     sptr -= width_mmx;
1364                     dp -= width_mmx*8;
1365                     for (i = width; i; i--)
1366                     {
1367                        int j;
1368
1369                       /* I simplified this part in version 1.0.4e
1370                        * here and in several other instances where
1371                        * pixel_bytes == 1  -- GR-P
1372                        *
1373                        * Original code:
1374                        *
1375                        * png_byte v[8];
1376                        * png_memcpy(v, sptr, pixel_bytes);
1377                        * for (j = 0; j < png_pass_inc[pass]; j++)
1378                        * {
1379                        *    png_memcpy(dp, v, pixel_bytes);
1380                        *    dp -= pixel_bytes;
1381                        * }
1382                        * sptr -= pixel_bytes;
1383                        *
1384                        * Replacement code is in the next three lines:
1385                        */
1386
1387                        for (j = 0; j < png_pass_inc[pass]; j++)
1388                           *dp-- = *sptr;
1389                        sptr--;
1390                     }
1391                  }
1392                  else if (((pass == 2) || (pass == 3)) && width)
1393                  {
1394                     int width_mmx = ((width >> 2) << 2);
1395                     width -= width_mmx;
1396                     if (width_mmx)
1397                     {
1398                        _asm
1399                        {
1400                           mov esi, sptr
1401                           mov edi, dp
1402                           mov ecx, width_mmx
1403                           sub edi, 15
1404                           sub esi, 3
1405loop1_pass2:
1406                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1407                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1408                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1409                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1410                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
1411                           movq [edi], mm0     ; move to memory v2 and v3
1412                           sub esi, 4
1413                           movq [edi+8], mm1   ; move to memory v1     and v0
1414                           sub edi, 16
1415                           sub ecx, 4
1416                           jnz loop1_pass2
1417                           EMMS
1418                        }
1419                     }
1420
1421                     sptr -= width_mmx;
1422                     dp -= width_mmx*4;
1423                     for (i = width; i; i--)
1424                     {
1425                        int j;
1426
1427                        for (j = 0; j < png_pass_inc[pass]; j++)
1428                        {
1429                           *dp-- = *sptr;
1430                        }
1431                        sptr --;
1432                     }
1433                  }
1434                  else if (width) /* && ((pass == 4) || (pass == 5))) */
1435                  {
1436                     int width_mmx = ((width >> 3) << 3);
1437                     width -= width_mmx;
1438                     if (width_mmx)
1439                     {
1440                        _asm
1441                        {
1442                           mov esi, sptr
1443                           mov edi, dp
1444                           mov ecx, width_mmx
1445                           sub edi, 15
1446                           sub esi, 7
1447loop1_pass4:
1448                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
1449                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
1450                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
1451                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
1452                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
1453                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
1454                           sub esi, 8
1455                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
1456                           //sub esi, 4
1457                           sub edi, 16
1458                           sub ecx, 8
1459                           jnz loop1_pass4
1460                           EMMS
1461                        }
1462                     }
1463
1464                     sptr -= width_mmx;
1465                     dp -= width_mmx*2;
1466                     for (i = width; i; i--)
1467                     {
1468                        int j;
1469
1470                        for (j = 0; j < png_pass_inc[pass]; j++)
1471                        {
1472                           *dp-- = *sptr;
1473                        }
1474                        sptr --;
1475                     }
1476                  }
1477               } /* end of pixel_bytes == 1 */
1478
1479               else if (pixel_bytes == 2)
1480               {
1481                  if (((pass == 0) || (pass == 1)) && width)
1482                  {
1483                     int width_mmx = ((width >> 1) << 1);
1484                     width -= width_mmx;
1485                     if (width_mmx)
1486                     {
1487                        _asm
1488                        {
1489                           mov esi, sptr
1490                           mov edi, dp
1491                           mov ecx, width_mmx
1492                           sub esi, 2
1493                           sub edi, 30
1494loop2_pass0:
1495                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1496                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1497                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1498                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1499                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1500                           movq [edi], mm0
1501                           movq [edi + 8], mm0
1502                           movq [edi + 16], mm1
1503                           movq [edi + 24], mm1
1504                           sub esi, 4
1505                           sub edi, 32
1506                           sub ecx, 2
1507                           jnz loop2_pass0
1508                           EMMS
1509                        }
1510                     }
1511
1512                     sptr -= (width_mmx*2 - 2);            // sign fixed
1513                     dp -= (width_mmx*16 - 2);            // sign fixed
1514                     for (i = width; i; i--)
1515                     {
1516                        png_byte v[8];
1517                        int j;
1518                        sptr -= 2;
1519                        png_memcpy(v, sptr, 2);
1520                        for (j = 0; j < png_pass_inc[pass]; j++)
1521                        {
1522                           dp -= 2;
1523                           png_memcpy(dp, v, 2);
1524                        }
1525                     }
1526                  }
1527                  else if (((pass == 2) || (pass == 3)) && width)
1528                  {
1529                     int width_mmx = ((width >> 1) << 1) ;
1530                     width -= width_mmx;
1531                     if (width_mmx)
1532                     {
1533                        _asm
1534                        {
1535                           mov esi, sptr
1536                           mov edi, dp
1537                           mov ecx, width_mmx
1538                           sub esi, 2
1539                           sub edi, 14
1540loop2_pass2:
1541                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1542                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1543                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1544                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1545                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1546                           movq [edi], mm0
1547                           sub esi, 4
1548                           movq [edi + 8], mm1
1549                           //sub esi, 4
1550                           sub edi, 16
1551                           sub ecx, 2
1552                           jnz loop2_pass2
1553                           EMMS
1554                        }
1555                     }
1556
1557                     sptr -= (width_mmx*2 - 2);            // sign fixed
1558                     dp -= (width_mmx*8 - 2);            // sign fixed
1559                     for (i = width; i; i--)
1560                     {
1561                        png_byte v[8];
1562                        int j;
1563                        sptr -= 2;
1564                        png_memcpy(v, sptr, 2);
1565                        for (j = 0; j < png_pass_inc[pass]; j++)
1566                        {
1567                           dp -= 2;
1568                           png_memcpy(dp, v, 2);
1569                        }
1570                     }
1571                  }
1572                  else if (width)  // pass == 4 or 5
1573                  {
1574                     int width_mmx = ((width >> 1) << 1) ;
1575                     width -= width_mmx;
1576                     if (width_mmx)
1577                     {
1578                        _asm
1579                        {
1580                           mov esi, sptr
1581                           mov edi, dp
1582                           mov ecx, width_mmx
1583                           sub esi, 2
1584                           sub edi, 6
1585loop2_pass4:
1586                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1587                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1588                           sub esi, 4
1589                           movq [edi], mm0
1590                           sub edi, 8
1591                           sub ecx, 2
1592                           jnz loop2_pass4
1593                           EMMS
1594                        }
1595                     }
1596
1597                     sptr -= (width_mmx*2 - 2);            // sign fixed
1598                     dp -= (width_mmx*4 - 2);            // sign fixed
1599                     for (i = width; i; i--)
1600                     {
1601                        png_byte v[8];
1602                        int j;
1603                        sptr -= 2;
1604                        png_memcpy(v, sptr, 2);
1605                        for (j = 0; j < png_pass_inc[pass]; j++)
1606                        {
1607                           dp -= 2;
1608                           png_memcpy(dp, v, 2);
1609                        }
1610                     }
1611                  }
1612               } /* end of pixel_bytes == 2 */
1613
1614               else if (pixel_bytes == 4)
1615               {
1616                  if (((pass == 0) || (pass == 1)) && width)
1617                  {
1618                     int width_mmx = ((width >> 1) << 1) ;
1619                     width -= width_mmx;
1620                     if (width_mmx)
1621                     {
1622                        _asm
1623                        {
1624                           mov esi, sptr
1625                           mov edi, dp
1626                           mov ecx, width_mmx
1627                           sub esi, 4
1628                           sub edi, 60
1629loop4_pass0:
1630                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
1631                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
1632                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
1633                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
1634                           movq [edi], mm0
1635                           movq [edi + 8], mm0
1636                           movq [edi + 16], mm0
1637                           movq [edi + 24], mm0
1638                           movq [edi+32], mm1
1639                           movq [edi + 40], mm1
1640                           movq [edi+ 48], mm1
1641                           sub esi, 8
1642                           movq [edi + 56], mm1
1643                           sub edi, 64
1644                           sub ecx, 2
1645                           jnz loop4_pass0
1646                           EMMS
1647                        }
1648                     }
1649
1650                     sptr -= (width_mmx*4 - 4);            // sign fixed
1651                     dp -= (width_mmx*32 - 4);            // sign fixed
1652                     for (i = width; i; i--)
1653                     {
1654                        png_byte v[8];
1655                        int j;
1656                        sptr -= 4;
1657                        png_memcpy(v, sptr, 4);
1658                        for (j = 0; j < png_pass_inc[pass]; j++)
1659                        {
1660                           dp -= 4;
1661                           png_memcpy(dp, v, 4);
1662                        }
1663                     }
1664                  }
1665                  else if (((pass == 2) || (pass == 3)) && width)
1666                  {
1667                     int width_mmx = ((width >> 1) << 1) ;
1668                     width -= width_mmx;
1669                     if (width_mmx)
1670                     {
1671                        _asm
1672                        {
1673                           mov esi, sptr
1674                           mov edi, dp
1675                           mov ecx, width_mmx
1676                           sub esi, 4
1677                           sub edi, 28
1678loop4_pass2:
1679                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1680                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1681                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1682                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1683                           movq [edi], mm0
1684                           movq [edi + 8], mm0
1685                           movq [edi+16], mm1
1686                           movq [edi + 24], mm1
1687                           sub esi, 8
1688                           sub edi, 32
1689                           sub ecx, 2
1690                           jnz loop4_pass2
1691                           EMMS
1692                        }
1693                     }
1694
1695                     sptr -= (width_mmx*4 - 4);            // sign fixed
1696                     dp -= (width_mmx*16 - 4);            // sign fixed
1697                     for (i = width; i; i--)
1698                     {
1699                        png_byte v[8];
1700                        int j;
1701                        sptr -= 4;
1702                        png_memcpy(v, sptr, 4);
1703                        for (j = 0; j < png_pass_inc[pass]; j++)
1704                        {
1705                           dp -= 4;
1706                           png_memcpy(dp, v, 4);
1707                        }
1708                     }
1709                  }
1710                  else if (width)  // pass == 4 or 5
1711                  {
1712                     int width_mmx = ((width >> 1) << 1) ;
1713                     width -= width_mmx;
1714                     if (width_mmx)
1715                     {
1716                        _asm
1717                        {
1718                           mov esi, sptr
1719                           mov edi, dp
1720                           mov ecx, width_mmx
1721                           sub esi, 4
1722                           sub edi, 12
1723loop4_pass4:
1724                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1725                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1726                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1727                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1728                           movq [edi], mm0
1729                           sub esi, 8
1730                           movq [edi + 8], mm1
1731                           sub edi, 16
1732                           sub ecx, 2
1733                           jnz loop4_pass4
1734                           EMMS
1735                        }
1736                     }
1737
1738                     sptr -= (width_mmx*4 - 4);          // sign fixed
1739                     dp -= (width_mmx*8 - 4);            // sign fixed
1740                     for (i = width; i; i--)
1741                     {
1742                        png_byte v[8];
1743                        int j;
1744                        sptr -= 4;
1745                        png_memcpy(v, sptr, 4);
1746                        for (j = 0; j < png_pass_inc[pass]; j++)
1747                        {
1748                           dp -= 4;
1749                           png_memcpy(dp, v, 4);
1750                        }
1751                     }
1752                  }
1753
1754               } /* end of pixel_bytes == 4 */
1755
1756               else if (pixel_bytes == 6)
1757               {
1758                  for (i = width; i; i--)
1759                  {
1760                     png_byte v[8];
1761                     int j;
1762                     png_memcpy(v, sptr, 6);
1763                     for (j = 0; j < png_pass_inc[pass]; j++)
1764                     {
1765                        png_memcpy(dp, v, 6);
1766                        dp -= 6;
1767                     }
1768                     sptr -= 6;
1769                  }
1770               } /* end of pixel_bytes == 6 */
1771
1772               else
1773               {
1774                  for (i = width; i; i--)
1775                  {
1776                     png_byte v[8];
1777                     int j;
1778                     png_memcpy(v, sptr, pixel_bytes);
1779                     for (j = 0; j < png_pass_inc[pass]; j++)
1780                     {
1781                        png_memcpy(dp, v, pixel_bytes);
1782                        dp -= pixel_bytes;
1783                     }
1784                     sptr-= pixel_bytes;
1785                  }
1786               }
1787            } /* end of mmx_supported */
1788
1789            else /* MMX not supported:  use modified C code - takes advantage
1790                  * of inlining of memcpy for a constant */
1791            {
1792               if (pixel_bytes == 1)
1793               {
1794                  for (i = width; i; i--)
1795                  {
1796                     int j;
1797                     for (j = 0; j < png_pass_inc[pass]; j++)
1798                        *dp-- = *sptr;
1799                     sptr--;
1800                  }
1801               }
1802               else if (pixel_bytes == 3)
1803               {
1804                  for (i = width; i; i--)
1805                  {
1806                     png_byte v[8];
1807                     int j;
1808                     png_memcpy(v, sptr, pixel_bytes);
1809                     for (j = 0; j < png_pass_inc[pass]; j++)
1810                     {
1811                        png_memcpy(dp, v, pixel_bytes);
1812                        dp -= pixel_bytes;
1813                     }
1814                     sptr -= pixel_bytes;
1815                  }
1816               }
1817               else if (pixel_bytes == 2)
1818               {
1819                  for (i = width; i; i--)
1820                  {
1821                     png_byte v[8];
1822                     int j;
1823                     png_memcpy(v, sptr, pixel_bytes);
1824                     for (j = 0; j < png_pass_inc[pass]; j++)
1825                     {
1826                        png_memcpy(dp, v, pixel_bytes);
1827                        dp -= pixel_bytes;
1828                     }
1829                     sptr -= pixel_bytes;
1830                  }
1831               }
1832               else if (pixel_bytes == 4)
1833               {
1834                  for (i = width; i; i--)
1835                  {
1836                     png_byte v[8];
1837                     int j;
1838                     png_memcpy(v, sptr, pixel_bytes);
1839                     for (j = 0; j < png_pass_inc[pass]; j++)
1840                     {
1841                        png_memcpy(dp, v, pixel_bytes);
1842                        dp -= pixel_bytes;
1843                     }
1844                     sptr -= pixel_bytes;
1845                  }
1846               }
1847               else if (pixel_bytes == 6)
1848               {
1849                  for (i = width; i; i--)
1850                  {
1851                     png_byte v[8];
1852                     int j;
1853                     png_memcpy(v, sptr, pixel_bytes);
1854                     for (j = 0; j < png_pass_inc[pass]; j++)
1855                     {
1856                        png_memcpy(dp, v, pixel_bytes);
1857                        dp -= pixel_bytes;
1858                     }
1859                     sptr -= pixel_bytes;
1860                  }
1861               }
1862               else
1863               {
1864                  for (i = width; i; i--)
1865                  {
1866                     png_byte v[8];
1867                     int j;
1868                     png_memcpy(v, sptr, pixel_bytes);
1869                     for (j = 0; j < png_pass_inc[pass]; j++)
1870                     {
1871                        png_memcpy(dp, v, pixel_bytes);
1872                        dp -= pixel_bytes;
1873                     }
1874                     sptr -= pixel_bytes;
1875                  }
1876               }
1877
1878            } /* end of MMX not supported */
1879            break;
1880         }
1881      } /* end switch (row_info->pixel_depth) */
1882
1883      row_info->width = final_width;
1884      row_info->rowbytes = ((final_width *
1885         (png_uint_32)row_info->pixel_depth + 7) >> 3);
1886   }
1887
1888#ifdef DISABLE_PNGVCRD_INTERLACE
1889   mmx_supported = save_mmx_supported;
1890#endif
1891}
1892
1893#endif /* PNG_READ_INTERLACING_SUPPORTED */
1894
1895
1896// These variables are utilized in the functions below.  They are declared
1897// globally here to ensure alignment on 8-byte boundaries.
1898
1899union uAll {
1900   __int64 use;
1901   double  align;
1902} LBCarryMask = {0x0101010101010101},
1903  HBClearMask = {0x7f7f7f7f7f7f7f7f},
1904  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1905
1906
1907// Optimized code for PNG Average filter decoder
1908void /* PRIVATE */
1909png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1910                            , png_bytep prev_row)
1911{
1912   int bpp;
1913   png_uint_32 FullLength;
1914   png_uint_32 MMXLength;
1915   //png_uint_32 len;
1916   int diff;
1917
1918   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1919   FullLength  = row_info->rowbytes; // # of bytes to filter
1920   _asm {
1921         // Init address pointers and offset
1922         mov edi, row          // edi ==> Avg(x)
1923         xor ebx, ebx          // ebx ==> x
1924         mov edx, edi
1925         mov esi, prev_row           // esi ==> Prior(x)
1926         sub edx, bpp          // edx ==> Raw(x-bpp)
1927
1928         xor eax, eax
1929         // Compute the Raw value for the first bpp bytes
1930         //    Raw(x) = Avg(x) + (Prior(x)/2)
1931davgrlp:
1932         mov al, [esi + ebx]   // Load al with Prior(x)
1933         inc ebx
1934         shr al, 1             // divide by 2
1935         add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
1936         cmp ebx, bpp
1937         mov [edi+ebx-1], al    // Write back Raw(x);
1938                            // mov does not affect flags; -1 to offset inc ebx
1939         jb davgrlp
1940         // get # of bytes to alignment
1941         mov diff, edi         // take start of row
1942         add diff, ebx         // add bpp
1943         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
1944         and diff, 0xfffffff8  // mask to alignment boundary
1945         sub diff, edi         // subtract from start ==> value ebx at alignment
1946         jz davggo
1947         // fix alignment
1948         // Compute the Raw value for the bytes upto the alignment boundary
1949         //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1950         xor ecx, ecx
1951davglp1:
1952         xor eax, eax
1953         mov cl, [esi + ebx]        // load cl with Prior(x)
1954         mov al, [edx + ebx]  // load al with Raw(x-bpp)
1955         add ax, cx
1956         inc ebx
1957         shr ax, 1            // divide by 2
1958         add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
1959         cmp ebx, diff              // Check if at alignment boundary
1960         mov [edi+ebx-1], al        // Write back Raw(x);
1961                            // mov does not affect flags; -1 to offset inc ebx
1962         jb davglp1               // Repeat until at alignment boundary
1963davggo:
1964         mov eax, FullLength
1965         mov ecx, eax
1966         sub eax, ebx          // subtract alignment fix
1967         and eax, 0x00000007   // calc bytes over mult of 8
1968         sub ecx, eax          // drop over bytes from original length
1969         mov MMXLength, ecx
1970   } // end _asm block
1971   // Now do the math for the rest of the row
1972   switch ( bpp )
1973   {
1974      case 3:
1975      {
1976         ActiveMask.use  = 0x0000000000ffffff;
1977         ShiftBpp.use = 24;    // == 3 * 8
1978         ShiftRem.use = 40;    // == 64 - 24
1979         _asm {
1980            // Re-init address pointers and offset
1981            movq mm7, ActiveMask
1982            mov ebx, diff      // ebx ==> x = offset to alignment boundary
1983            movq mm5, LBCarryMask
1984            mov edi, row       // edi ==> Avg(x)
1985            movq mm4, HBClearMask
1986            mov esi, prev_row        // esi ==> Prior(x)
1987            // PRIME the pump (load the first Raw(x-bpp) data set
1988            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
1989                               // (we correct position in loop below)
1990davg3lp:
1991            movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
1992            // Add (Prev_row/2) to Average
1993            movq mm3, mm5
1994            psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
1995            movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
1996            movq mm6, mm7
1997            pand mm3, mm1      // get lsb for each prev_row byte
1998            psrlq mm1, 1       // divide prev_row bytes by 2
1999            pand  mm1, mm4     // clear invalid bit 7 of each byte
2000            paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
2001            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2002            movq mm1, mm3      // now use mm1 for getting LBCarrys
2003            pand mm1, mm2      // get LBCarrys for each byte where both
2004                               // lsb's were == 1 (Only valid for active group)
2005            psrlq mm2, 1       // divide raw bytes by 2
2006            pand  mm2, mm4     // clear invalid bit 7 of each byte
2007            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2008            pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
2009            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2010                               //  byte
2011            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2012            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
2013            movq mm2, mm0        // mov updated Raws to mm2
2014            psllq mm2, ShiftBpp  // shift data to position correctly
2015            movq mm1, mm3        // now use mm1 for getting LBCarrys
2016            pand mm1, mm2      // get LBCarrys for each byte where both
2017                               // lsb's were == 1 (Only valid for active group)
2018            psrlq mm2, 1       // divide raw bytes by 2
2019            pand  mm2, mm4     // clear invalid bit 7 of each byte
2020            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2021            pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
2022            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2023                               //  byte
2024
2025            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2026            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
2027                                 // bytes
2028            movq mm2, mm0        // mov updated Raws to mm2
2029            psllq mm2, ShiftBpp  // shift data to position correctly
2030                              // Data only needs to be shifted once here to
2031                              // get the correct x-bpp offset.
2032            movq mm1, mm3     // now use mm1 for getting LBCarrys
2033            pand mm1, mm2     // get LBCarrys for each byte where both
2034                              // lsb's were == 1 (Only valid for active group)
2035            psrlq mm2, 1      // divide raw bytes by 2
2036            pand  mm2, mm4    // clear invalid bit 7 of each byte
2037            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2038            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2039            add ebx, 8
2040            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2041                              // byte
2042
2043            // Now ready to write back to memory
2044            movq [edi + ebx - 8], mm0
2045            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2046            cmp ebx, MMXLength
2047            movq mm2, mm0     // mov updated Raw(x) to mm2
2048            jb davg3lp
2049         } // end _asm block
2050      }
2051      break;
2052
2053      case 6:
2054      case 4:
2055      case 7:
2056      case 5:
2057      {
2058         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
2059                                                // appropriate inactive bytes
2060         ShiftBpp.use = bpp << 3;
2061         ShiftRem.use = 64 - ShiftBpp.use;
2062         _asm {
2063            movq mm4, HBClearMask
2064            // Re-init address pointers and offset
2065            mov ebx, diff       // ebx ==> x = offset to alignment boundary
2066            // Load ActiveMask and clear all bytes except for 1st active group
2067            movq mm7, ActiveMask
2068            mov edi, row         // edi ==> Avg(x)
2069            psrlq mm7, ShiftRem
2070            mov esi, prev_row    // esi ==> Prior(x)
2071            movq mm6, mm7
2072            movq mm5, LBCarryMask
2073            psllq mm6, ShiftBpp  // Create mask for 2nd active group
2074            // PRIME the pump (load the first Raw(x-bpp) data set
2075            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2076                                 // (we correct position in loop below)
2077davg4lp:
2078            movq mm0, [edi + ebx]
2079            psrlq mm2, ShiftRem  // shift data to position correctly
2080            movq mm1, [esi + ebx]
2081            // Add (Prev_row/2) to Average
2082            movq mm3, mm5
2083            pand mm3, mm1     // get lsb for each prev_row byte
2084            psrlq mm1, 1      // divide prev_row bytes by 2
2085            pand  mm1, mm4    // clear invalid bit 7 of each byte
2086            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2087            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2088            movq mm1, mm3     // now use mm1 for getting LBCarrys
2089            pand mm1, mm2     // get LBCarrys for each byte where both
2090                              // lsb's were == 1 (Only valid for active group)
2091            psrlq mm2, 1      // divide raw bytes by 2
2092            pand  mm2, mm4    // clear invalid bit 7 of each byte
2093            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2094            pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
2095            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2096                              // byte
2097            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2098            movq mm2, mm0     // mov updated Raws to mm2
2099            psllq mm2, ShiftBpp // shift data to position correctly
2100            add ebx, 8
2101            movq mm1, mm3     // now use mm1 for getting LBCarrys
2102            pand mm1, mm2     // get LBCarrys for each byte where both
2103                              // lsb's were == 1 (Only valid for active group)
2104            psrlq mm2, 1      // divide raw bytes by 2
2105            pand  mm2, mm4    // clear invalid bit 7 of each byte
2106            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2107            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2108            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2109                              // byte
2110            cmp ebx, MMXLength
2111            // Now ready to write back to memory
2112            movq [edi + ebx - 8], mm0
2113            // Prep Raw(x-bpp) for next loop
2114            movq mm2, mm0     // mov updated Raws to mm2
2115            jb davg4lp
2116         } // end _asm block
2117      }
2118      break;
2119      case 2:
2120      {
2121         ActiveMask.use  = 0x000000000000ffff;
2122         ShiftBpp.use = 24;   // == 3 * 8
2123         ShiftRem.use = 40;   // == 64 - 24
2124         _asm {
2125            // Load ActiveMask
2126            movq mm7, ActiveMask
2127            // Re-init address pointers and offset
2128            mov ebx, diff     // ebx ==> x = offset to alignment boundary
2129            movq mm5, LBCarryMask
2130            mov edi, row      // edi ==> Avg(x)
2131            movq mm4, HBClearMask
2132            mov esi, prev_row  // esi ==> Prior(x)
2133            // PRIME the pump (load the first Raw(x-bpp) data set
2134            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2135                              // (we correct position in loop below)
2136davg2lp:
2137            movq mm0, [edi + ebx]
2138            psllq mm2, ShiftRem  // shift data to position correctly
2139            movq mm1, [esi + ebx]
2140            // Add (Prev_row/2) to Average
2141            movq mm3, mm5
2142            pand mm3, mm1     // get lsb for each prev_row byte
2143            psrlq mm1, 1      // divide prev_row bytes by 2
2144            pand  mm1, mm4    // clear invalid bit 7 of each byte
2145            movq mm6, mm7
2146            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2147            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2148            movq mm1, mm3     // now use mm1 for getting LBCarrys
2149            pand mm1, mm2     // get LBCarrys for each byte where both
2150                              // lsb's were == 1 (Only valid for active group)
2151            psrlq mm2, 1      // divide raw bytes by 2
2152            pand  mm2, mm4    // clear invalid bit 7 of each byte
2153            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2154            pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
2155            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2156            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2157            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2158            movq mm2, mm0       // mov updated Raws to mm2
2159            psllq mm2, ShiftBpp // shift data to position correctly
2160            movq mm1, mm3       // now use mm1 for getting LBCarrys
2161            pand mm1, mm2       // get LBCarrys for each byte where both
2162                                // lsb's were == 1 (Only valid for active group)
2163            psrlq mm2, 1        // divide raw bytes by 2
2164            pand  mm2, mm4      // clear invalid bit 7 of each byte
2165            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2166            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2167            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2168
2169            // Add rdd active group (Raw(x-bpp)/2) to Average with LBCarry
2170            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2171            movq mm2, mm0       // mov updated Raws to mm2
2172            psllq mm2, ShiftBpp // shift data to position correctly
2173                                // Data only needs to be shifted once here to
2174                                // get the correct x-bpp offset.
2175            movq mm1, mm3       // now use mm1 for getting LBCarrys
2176            pand mm1, mm2       // get LBCarrys for each byte where both
2177                                // lsb's were == 1 (Only valid for active group)
2178            psrlq mm2, 1        // divide raw bytes by 2
2179            pand  mm2, mm4      // clear invalid bit 7 of each byte
2180            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2181            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2182            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2183
2184            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2185            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
2186            movq mm2, mm0        // mov updated Raws to mm2
2187            psllq mm2, ShiftBpp  // shift data to position correctly
2188                                 // Data only needs to be shifted once here to
2189                                 // get the correct x-bpp offset.
2190            add ebx, 8
2191            movq mm1, mm3    // now use mm1 for getting LBCarrys
2192            pand mm1, mm2    // get LBCarrys for each byte where both
2193                             // lsb's were == 1 (Only valid for active group)
2194            psrlq mm2, 1     // divide raw bytes by 2
2195            pand  mm2, mm4   // clear invalid bit 7 of each byte
2196            paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
2197            pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
2198            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2199
2200            cmp ebx, MMXLength
2201            // Now ready to write back to memory
2202            movq [edi + ebx - 8], mm0
2203            // Prep Raw(x-bpp) for next loop
2204            movq mm2, mm0    // mov updated Raws to mm2
2205            jb davg2lp
2206        } // end _asm block
2207      }
2208      break;
2209
2210      case 1:                 // bpp == 1
2211      {
2212         _asm {
2213            // Re-init address pointers and offset
2214            mov ebx, diff     // ebx ==> x = offset to alignment boundary
2215            mov edi, row      // edi ==> Avg(x)
2216            cmp ebx, FullLength  // Test if offset at end of array
2217            jnb davg1end
2218            // Do Paeth decode for remaining bytes
2219            mov esi, prev_row    // esi ==> Prior(x)
2220            mov edx, edi
2221            xor ecx, ecx         // zero ecx before using cl & cx in loop below
2222            sub edx, bpp         // edx ==> Raw(x-bpp)
2223davg1lp:
2224            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2225            xor eax, eax
2226            mov cl, [esi + ebx]  // load cl with Prior(x)
2227            mov al, [edx + ebx]  // load al with Raw(x-bpp)
2228            add ax, cx
2229            inc ebx
2230            shr ax, 1            // divide by 2
2231            add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
2232            cmp ebx, FullLength  // Check if at end of array
2233            mov [edi+ebx-1], al  // Write back Raw(x);
2234                         // mov does not affect flags; -1 to offset inc ebx
2235            jb davg1lp
2236davg1end:
2237         } // end _asm block
2238      }
2239      return;
2240
2241      case 8:             // bpp == 8
2242      {
2243         _asm {
2244            // Re-init address pointers and offset
2245            mov ebx, diff           // ebx ==> x = offset to alignment boundary
2246            movq mm5, LBCarryMask
2247            mov edi, row            // edi ==> Avg(x)
2248            movq mm4, HBClearMask
2249            mov esi, prev_row       // esi ==> Prior(x)
2250            // PRIME the pump (load the first Raw(x-bpp) data set
2251            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2252                                // (NO NEED to correct position in loop below)
2253davg8lp:
2254            movq mm0, [edi + ebx]
2255            movq mm3, mm5
2256            movq mm1, [esi + ebx]
2257            add ebx, 8
2258            pand mm3, mm1       // get lsb for each prev_row byte
2259            psrlq mm1, 1        // divide prev_row bytes by 2
2260            pand mm3, mm2       // get LBCarrys for each byte where both
2261                                // lsb's were == 1
2262            psrlq mm2, 1        // divide raw bytes by 2
2263            pand  mm1, mm4      // clear invalid bit 7 of each byte
2264            paddb mm0, mm3      // add LBCarrys to Avg for each byte
2265            pand  mm2, mm4      // clear invalid bit 7 of each byte
2266            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2267            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2268            cmp ebx, MMXLength
2269            movq [edi + ebx - 8], mm0
2270            movq mm2, mm0       // reuse as Raw(x-bpp)
2271            jb davg8lp
2272        } // end _asm block
2273      }
2274      break;
2275      default:                  // bpp greater than 8
2276      {
2277        _asm {
2278            movq mm5, LBCarryMask
2279            // Re-init address pointers and offset
2280            mov ebx, diff       // ebx ==> x = offset to alignment boundary
2281            mov edi, row        // edi ==> Avg(x)
2282            movq mm4, HBClearMask
2283            mov edx, edi
2284            mov esi, prev_row   // esi ==> Prior(x)
2285            sub edx, bpp        // edx ==> Raw(x-bpp)
2286davgAlp:
2287            movq mm0, [edi + ebx]
2288            movq mm3, mm5
2289            movq mm1, [esi + ebx]
2290            pand mm3, mm1       // get lsb for each prev_row byte
2291            movq mm2, [edx + ebx]
2292            psrlq mm1, 1        // divide prev_row bytes by 2
2293            pand mm3, mm2       // get LBCarrys for each byte where both
2294                                // lsb's were == 1
2295            psrlq mm2, 1        // divide raw bytes by 2
2296            pand  mm1, mm4      // clear invalid bit 7 of each byte
2297            paddb mm0, mm3      // add LBCarrys to Avg for each byte
2298            pand  mm2, mm4      // clear invalid bit 7 of each byte
2299            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2300            add ebx, 8
2301            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2302            cmp ebx, MMXLength
2303            movq [edi + ebx - 8], mm0
2304            jb davgAlp
2305        } // end _asm block
2306      }
2307      break;
2308   }                         // end switch ( bpp )
2309
2310   _asm {
2311         // MMX acceleration complete now do clean-up
2312         // Check if any remaining bytes left to decode
2313         mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
2314         mov edi, row          // edi ==> Avg(x)
2315         cmp ebx, FullLength   // Test if offset at end of array
2316         jnb davgend
2317         // Do Paeth decode for remaining bytes
2318         mov esi, prev_row     // esi ==> Prior(x)
2319         mov edx, edi
2320         xor ecx, ecx          // zero ecx before using cl & cx in loop below
2321         sub edx, bpp          // edx ==> Raw(x-bpp)
2322davglp2:
2323         // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2324         xor eax, eax
2325         mov cl, [esi + ebx]   // load cl with Prior(x)
2326         mov al, [edx + ebx]   // load al with Raw(x-bpp)
2327         add ax, cx
2328         inc ebx
2329         shr ax, 1              // divide by 2
2330         add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
2331         cmp ebx, FullLength    // Check if at end of array
2332         mov [edi+ebx-1], al    // Write back Raw(x);
2333                          // mov does not affect flags; -1 to offset inc ebx
2334         jb davglp2
2335davgend:
2336         emms             // End MMX instructions; prep for possible FP instrs.
2337   } // end _asm block
2338}
2339
2340// Optimized code for PNG Paeth filter decoder
2341void /* PRIVATE */
2342png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2343                              png_bytep prev_row)
2344{
2345   png_uint_32 FullLength;
2346   png_uint_32 MMXLength;
2347   //png_uint_32 len;
2348   int bpp;
2349   int diff;
2350   //int ptemp;
2351   int patemp, pbtemp, pctemp;
2352
2353   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2354   FullLength  = row_info->rowbytes; // # of bytes to filter
2355   _asm
2356   {
2357         xor ebx, ebx        // ebx ==> x offset
2358         mov edi, row
2359         xor edx, edx        // edx ==> x-bpp offset
2360         mov esi, prev_row
2361         xor eax, eax
2362
2363         // Compute the Raw value for the first bpp bytes
2364         // Note: the formula works out to be always
2365         //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
2366dpthrlp:
2367         mov al, [edi + ebx]
2368         add al, [esi + ebx]
2369         inc ebx
2370         cmp ebx, bpp
2371         mov [edi + ebx - 1], al
2372         jb dpthrlp
2373         // get # of bytes to alignment
2374         mov diff, edi         // take start of row
2375         add diff, ebx         // add bpp
2376         xor ecx, ecx
2377         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
2378         and diff, 0xfffffff8  // mask to alignment boundary
2379         sub diff, edi         // subtract from start ==> value ebx at alignment
2380         jz dpthgo
2381         // fix alignment
2382dpthlp1:
2383         xor eax, eax
2384         // pav = p - a = (a + b - c) - a = b - c
2385         mov al, [esi + ebx]   // load Prior(x) into al
2386         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
2387         sub eax, ecx          // subtract Prior(x-bpp)
2388         mov patemp, eax       // Save pav for later use
2389         xor eax, eax
2390         // pbv = p - b = (a + b - c) - b = a - c
2391         mov al, [edi + edx]   // load Raw(x-bpp) into al
2392         sub eax, ecx          // subtract Prior(x-bpp)
2393         mov ecx, eax
2394         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2395         add eax, patemp       // pcv = pav + pbv
2396         // pc = abs(pcv)
2397         test eax, 0x80000000
2398         jz dpthpca
2399         neg eax               // reverse sign of neg values
2400dpthpca:
2401         mov pctemp, eax       // save pc for later use
2402         // pb = abs(pbv)
2403         test ecx, 0x80000000
2404         jz dpthpba
2405         neg ecx               // reverse sign of neg values
2406dpthpba:
2407         mov pbtemp, ecx       // save pb for later use
2408         // pa = abs(pav)
2409         mov eax, patemp
2410         test eax, 0x80000000
2411         jz dpthpaa
2412         neg eax               // reverse sign of neg values
2413dpthpaa:
2414         mov patemp, eax       // save pa for later use
2415         // test if pa <= pb
2416         cmp eax, ecx
2417         jna dpthabb
2418         // pa > pb; now test if pb <= pc
2419         cmp ecx, pctemp
2420         jna dpthbbc
2421         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2422         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2423         jmp dpthpaeth
2424dpthbbc:
2425         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2426         mov cl, [esi + ebx]   // load Prior(x) into cl
2427         jmp dpthpaeth
2428dpthabb:
2429         // pa <= pb; now test if pa <= pc
2430         cmp eax, pctemp
2431         jna dpthabc
2432         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2433         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2434         jmp dpthpaeth
2435dpthabc:
2436         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2437         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
2438dpthpaeth:
2439         inc ebx
2440         inc edx
2441         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2442         add [edi + ebx - 1], cl
2443         cmp ebx, diff
2444         jb dpthlp1
2445dpthgo:
2446         mov ecx, FullLength
2447         mov eax, ecx
2448         sub eax, ebx          // subtract alignment fix
2449         and eax, 0x00000007   // calc bytes over mult of 8
2450         sub ecx, eax          // drop over bytes from original length
2451         mov MMXLength, ecx
2452   } // end _asm block
2453   // Now do the math for the rest of the row
2454   switch ( bpp )
2455   {
2456      case 3:
2457      {
2458         ActiveMask.use = 0x0000000000ffffff;
2459         ActiveMaskEnd.use = 0xffff000000000000;
2460         ShiftBpp.use = 24;    // == bpp(3) * 8
2461         ShiftRem.use = 40;    // == 64 - 24
2462         _asm
2463         {
2464            mov ebx, diff
2465            mov edi, row
2466            mov esi, prev_row
2467            pxor mm0, mm0
2468            // PRIME the pump (load the first Raw(x-bpp) data set
2469            movq mm1, [edi+ebx-8]
2470dpth3lp:
2471            psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2472            movq mm2, [esi + ebx]   // load b=Prior(x)
2473            punpcklbw mm1, mm0      // Unpack High bytes of a
2474            movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
2475            punpcklbw mm2, mm0      // Unpack High bytes of b
2476            psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2477            // pav = p - a = (a + b - c) - a = b - c
2478            movq mm4, mm2
2479            punpcklbw mm3, mm0      // Unpack High bytes of c
2480            // pbv = p - b = (a + b - c) - b = a - c
2481            movq mm5, mm1
2482            psubw mm4, mm3
2483            pxor mm7, mm7
2484            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2485            movq mm6, mm4
2486            psubw mm5, mm3
2487
2488            // pa = abs(p-a) = abs(pav)
2489            // pb = abs(p-b) = abs(pbv)
2490            // pc = abs(p-c) = abs(pcv)
2491            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2492            paddw mm6, mm5
2493            pand mm0, mm4       // Only pav bytes < 0 in mm7
2494            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2495            psubw mm4, mm0
2496            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2497            psubw mm4, mm0
2498            psubw mm5, mm7
2499            pxor mm0, mm0
2500            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2501            pand mm0, mm6       // Only pav bytes < 0 in mm7
2502            psubw mm5, mm7
2503            psubw mm6, mm0
2504            //  test pa <= pb
2505            movq mm7, mm4
2506            psubw mm6, mm0
2507            pcmpgtw mm7, mm5    // pa > pb?
2508            movq mm0, mm7
2509            // use mm7 mask to merge pa & pb
2510            pand mm5, mm7
2511            // use mm0 mask copy to merge a & b
2512            pand mm2, mm0
2513            pandn mm7, mm4
2514            pandn mm0, mm1
2515            paddw mm7, mm5
2516            paddw mm0, mm2
2517            //  test  ((pa <= pb)? pa:pb) <= pc
2518            pcmpgtw mm7, mm6       // pab > pc?
2519            pxor mm1, mm1
2520            pand mm3, mm7
2521            pandn mm7, mm0
2522            paddw mm7, mm3
2523            pxor mm0, mm0
2524            packuswb mm7, mm1
2525            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2526            pand mm7, ActiveMask
2527            movq mm2, mm3           // load b=Prior(x) step 1
2528            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2529            punpcklbw mm3, mm0      // Unpack High bytes of c
2530            movq [edi + ebx], mm7   // write back updated value
2531            movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
2532            // Now do Paeth for 2nd set of bytes (3-5)
2533            psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
2534            punpcklbw mm1, mm0      // Unpack High bytes of a
2535            pxor mm7, mm7
2536            punpcklbw mm2, mm0      // Unpack High bytes of b
2537            // pbv = p - b = (a + b - c) - b = a - c
2538            movq mm5, mm1
2539            // pav = p - a = (a + b - c) - a = b - c
2540            movq mm4, mm2
2541            psubw mm5, mm3
2542            psubw mm4, mm3
2543            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2544            //       pav + pbv = pbv + pav
2545            movq mm6, mm5
2546            paddw mm6, mm4
2547
2548            // pa = abs(p-a) = abs(pav)
2549            // pb = abs(p-b) = abs(pbv)
2550            // pc = abs(p-c) = abs(pcv)
2551            pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
2552            pcmpgtw mm7, mm4       // Create mask pav bytes < 0
2553            pand mm0, mm5          // Only pbv bytes < 0 in mm0
2554            pand mm7, mm4          // Only pav bytes < 0 in mm7
2555            psubw mm5, mm0
2556            psubw mm4, mm7
2557            psubw mm5, mm0
2558            psubw mm4, mm7
2559            pxor mm0, mm0
2560            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2561            pand mm0, mm6          // Only pav bytes < 0 in mm7
2562            psubw mm6, mm0
2563            //  test pa <= pb
2564            movq mm7, mm4
2565            psubw mm6, mm0
2566            pcmpgtw mm7, mm5       // pa > pb?
2567            movq mm0, mm7
2568            // use mm7 mask to merge pa & pb
2569            pand mm5, mm7
2570            // use mm0 mask copy to merge a & b
2571            pand mm2, mm0
2572            pandn mm7, mm4
2573            pandn mm0, mm1
2574            paddw mm7, mm5
2575            paddw mm0, mm2
2576            //  test  ((pa <= pb)? pa:pb) <= pc
2577            pcmpgtw mm7, mm6       // pab > pc?
2578            movq mm2, [esi + ebx]  // load b=Prior(x)
2579            pand mm3, mm7
2580            pandn mm7, mm0
2581            pxor mm1, mm1
2582            paddw mm7, mm3
2583            pxor mm0, mm0
2584            packuswb mm7, mm1
2585            movq mm3, mm2           // load c=Prior(x-bpp) step 1
2586            pand mm7, ActiveMask
2587            punpckhbw mm2, mm0      // Unpack High bytes of b
2588            psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
2589             // pav = p - a = (a + b - c) - a = b - c
2590            movq mm4, mm2
2591            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2592            psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
2593            movq [edi + ebx], mm7   // write back updated value
2594            movq mm1, mm7
2595            punpckhbw mm3, mm0      // Unpack High bytes of c
2596            psllq mm1, ShiftBpp     // Shift bytes
2597                                    // Now mm1 will be used as Raw(x-bpp)
2598            // Now do Paeth for 3rd, and final, set of bytes (6-7)
2599            pxor mm7, mm7
2600            punpckhbw mm1, mm0      // Unpack High bytes of a
2601            psubw mm4, mm3
2602            // pbv = p - b = (a + b - c) - b = a - c
2603            movq mm5, mm1
2604            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2605            movq mm6, mm4
2606            psubw mm5, mm3
2607            pxor mm0, mm0
2608            paddw mm6, mm5
2609
2610            // pa = abs(p-a) = abs(pav)
2611            // pb = abs(p-b) = abs(pbv)
2612            // pc = abs(p-c) = abs(pcv)
2613            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2614            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2615            pand mm0, mm4       // Only pav bytes < 0 in mm7
2616            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2617            psubw mm4, mm0
2618            psubw mm5, mm7
2619            psubw mm4, mm0
2620            psubw mm5, mm7
2621            pxor mm0, mm0
2622            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2623            pand mm0, mm6       // Only pav bytes < 0 in mm7
2624            psubw mm6, mm0
2625            //  test pa <= pb
2626            movq mm7, mm4
2627            psubw mm6, mm0
2628            pcmpgtw mm7, mm5    // pa > pb?
2629            movq mm0, mm7
2630            // use mm0 mask copy to merge a & b
2631            pand mm2, mm0
2632            // use mm7 mask to merge pa & pb
2633            pand mm5, mm7
2634            pandn mm0, mm1
2635            pandn mm7, mm4
2636            paddw mm0, mm2
2637            paddw mm7, mm5
2638            //  test  ((pa <= pb)? pa:pb) <= pc
2639            pcmpgtw mm7, mm6    // pab > pc?
2640            pand mm3, mm7
2641            pandn mm7, mm0
2642            paddw mm7, mm3
2643            pxor mm1, mm1
2644            packuswb mm1, mm7
2645            // Step ebx to next set of 8 bytes and repeat loop til done
2646            add ebx, 8
2647            pand mm1, ActiveMaskEnd
2648            paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2649
2650            cmp ebx, MMXLength
2651            pxor mm0, mm0              // pxor does not affect flags
2652            movq [edi + ebx - 8], mm1  // write back updated value
2653                                 // mm1 will be used as Raw(x-bpp) next loop
2654                           // mm3 ready to be used as Prior(x-bpp) next loop
2655            jb dpth3lp
2656         } // end _asm block
2657      }
2658      break;
2659
2660      case 6:
2661      case 7:
2662      case 5:
2663      {
2664         ActiveMask.use  = 0x00000000ffffffff;
2665         ActiveMask2.use = 0xffffffff00000000;
2666         ShiftBpp.use = bpp << 3;    // == bpp * 8
2667         ShiftRem.use = 64 - ShiftBpp.use;
2668         _asm
2669         {
2670            mov ebx, diff
2671            mov edi, row
2672            mov esi, prev_row
2673            // PRIME the pump (load the first Raw(x-bpp) data set
2674            movq mm1, [edi+ebx-8]
2675            pxor mm0, mm0
2676dpth6lp:
2677            // Must shift to position Raw(x-bpp) data
2678            psrlq mm1, ShiftRem
2679            // Do first set of 4 bytes
2680            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2681            punpcklbw mm1, mm0      // Unpack Low bytes of a
2682            movq mm2, [esi + ebx]   // load b=Prior(x)
2683            punpcklbw mm2, mm0      // Unpack Low bytes of b
2684            // Must shift to position Prior(x-bpp) data
2685            psrlq mm3, ShiftRem
2686            // pav = p - a = (a + b - c) - a = b - c
2687            movq mm4, mm2
2688            punpcklbw mm3, mm0      // Unpack Low bytes of c
2689            // pbv = p - b = (a + b - c) - b = a - c
2690            movq mm5, mm1
2691            psubw mm4, mm3
2692            pxor mm7, mm7
2693            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2694            movq mm6, mm4
2695            psubw mm5, mm3
2696            // pa = abs(p-a) = abs(pav)
2697            // pb = abs(p-b) = abs(pbv)
2698            // pc = abs(p-c) = abs(pcv)
2699            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2700            paddw mm6, mm5
2701            pand mm0, mm4       // Only pav bytes < 0 in mm7
2702            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2703            psubw mm4, mm0
2704            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2705            psubw mm4, mm0
2706            psubw mm5, mm7
2707            pxor mm0, mm0
2708            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2709            pand mm0, mm6       // Only pav bytes < 0 in mm7
2710            psubw mm5, mm7
2711            psubw mm6, mm0
2712            //  test pa <= pb
2713            movq mm7, mm4
2714            psubw mm6, mm0
2715            pcmpgtw mm7, mm5    // pa > pb?
2716            movq mm0, mm7
2717            // use mm7 mask to merge pa & pb
2718            pand mm5, mm7
2719            // use mm0 mask copy to merge a & b
2720            pand mm2, mm0
2721            pandn mm7, mm4
2722            pandn mm0, mm1
2723            paddw mm7, mm5
2724            paddw mm0, mm2
2725            //  test  ((pa <= pb)? pa:pb) <= pc
2726            pcmpgtw mm7, mm6    // pab > pc?
2727            pxor mm1, mm1
2728            pand mm3, mm7
2729            pandn mm7, mm0
2730            paddw mm7, mm3
2731            pxor mm0, mm0
2732            packuswb mm7, mm1
2733            movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
2734            pand mm7, ActiveMask
2735            psrlq mm3, ShiftRem
2736            movq mm2, [esi + ebx]      // load b=Prior(x) step 1
2737            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2738            movq mm6, mm2
2739            movq [edi + ebx], mm7      // write back updated value
2740            movq mm1, [edi+ebx-8]
2741            psllq mm6, ShiftBpp
2742            movq mm5, mm7
2743            psrlq mm1, ShiftRem
2744            por mm3, mm6
2745            psllq mm5, ShiftBpp
2746            punpckhbw mm3, mm0         // Unpack High bytes of c
2747            por mm1, mm5
2748            // Do second set of 4 bytes
2749            punpckhbw mm2, mm0         // Unpack High bytes of b
2750            punpckhbw mm1, mm0         // Unpack High bytes of a
2751            // pav = p - a = (a + b - c) - a = b - c
2752            movq mm4, mm2
2753            // pbv = p - b = (a + b - c) - b = a - c
2754            movq mm5, mm1
2755            psubw mm4, mm3
2756            pxor mm7, mm7
2757            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2758            movq mm6, mm4
2759            psubw mm5, mm3
2760            // pa = abs(p-a) = abs(pav)
2761            // pb = abs(p-b) = abs(pbv)
2762            // pc = abs(p-c) = abs(pcv)
2763            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2764            paddw mm6, mm5
2765            pand mm0, mm4          // Only pav bytes < 0 in mm7
2766            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2767            psubw mm4, mm0
2768            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2769            psubw mm4, mm0
2770            psubw mm5, mm7
2771            pxor mm0, mm0
2772            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2773            pand mm0, mm6          // Only pav bytes < 0 in mm7
2774            psubw mm5, mm7
2775            psubw mm6, mm0
2776            //  test pa <= pb
2777            movq mm7, mm4
2778            psubw mm6, mm0
2779            pcmpgtw mm7, mm5       // pa > pb?
2780            movq mm0, mm7
2781            // use mm7 mask to merge pa & pb
2782            pand mm5, mm7
2783            // use mm0 mask copy to merge a & b
2784            pand mm2, mm0
2785            pandn mm7, mm4
2786            pandn mm0, mm1
2787            paddw mm7, mm5
2788            paddw mm0, mm2
2789            //  test  ((pa <= pb)? pa:pb) <= pc
2790            pcmpgtw mm7, mm6           // pab > pc?
2791            pxor mm1, mm1
2792            pand mm3, mm7
2793            pandn mm7, mm0
2794            pxor mm1, mm1
2795            paddw mm7, mm3
2796            pxor mm0, mm0
2797            // Step ebx to next set of 8 bytes and repeat loop til done
2798            add ebx, 8
2799            packuswb mm1, mm7
2800            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2801            cmp ebx, MMXLength
2802            movq [edi + ebx - 8], mm1      // write back updated value
2803                                // mm1 will be used as Raw(x-bpp) next loop
2804            jb dpth6lp
2805         } // end _asm block
2806      }
2807      break;
2808
2809      case 4:
2810      {
2811         ActiveMask.use  = 0x00000000ffffffff;
2812         _asm {
2813            mov ebx, diff
2814            mov edi, row
2815            mov esi, prev_row
2816            pxor mm0, mm0
2817            // PRIME the pump (load the first Raw(x-bpp) data set
2818            movq mm1, [edi+ebx-8]    // Only time should need to read
2819                                     //  a=Raw(x-bpp) bytes
2820dpth4lp:
2821            // Do first set of 4 bytes
2822            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
2823            punpckhbw mm1, mm0       // Unpack Low bytes of a
2824            movq mm2, [esi + ebx]    // load b=Prior(x)
2825            punpcklbw mm2, mm0       // Unpack High bytes of b
2826            // pav = p - a = (a + b - c) - a = b - c
2827            movq mm4, mm2
2828            punpckhbw mm3, mm0       // Unpack High bytes of c
2829            // pbv = p - b = (a + b - c) - b = a - c
2830            movq mm5, mm1
2831            psubw mm4, mm3
2832            pxor mm7, mm7
2833            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2834            movq mm6, mm4
2835            psubw mm5, mm3
2836            // pa = abs(p-a) = abs(pav)
2837            // pb = abs(p-b) = abs(pbv)
2838            // pc = abs(p-c) = abs(pcv)
2839            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2840            paddw mm6, mm5
2841            pand mm0, mm4          // Only pav bytes < 0 in mm7
2842            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2843            psubw mm4, mm0
2844            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2845            psubw mm4, mm0
2846            psubw mm5, mm7
2847            pxor mm0, mm0
2848            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2849            pand mm0, mm6          // Only pav bytes < 0 in mm7
2850            psubw mm5, mm7
2851            psubw mm6, mm0
2852            //  test pa <= pb
2853            movq mm7, mm4
2854            psubw mm6, mm0
2855            pcmpgtw mm7, mm5       // pa > pb?
2856            movq mm0, mm7
2857            // use mm7 mask to merge pa & pb
2858            pand mm5, mm7
2859            // use mm0 mask copy to merge a & b
2860            pand mm2, mm0
2861            pandn mm7, mm4
2862            pandn mm0, mm1
2863            paddw mm7, mm5
2864            paddw mm0, mm2
2865            //  test  ((pa <= pb)? pa:pb) <= pc
2866            pcmpgtw mm7, mm6       // pab > pc?
2867            pxor mm1, mm1
2868            pand mm3, mm7
2869            pandn mm7, mm0
2870            paddw mm7, mm3
2871            pxor mm0, mm0
2872            packuswb mm7, mm1
2873            movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
2874            pand mm7, ActiveMask
2875            movq mm2, mm3              // load b=Prior(x) step 1
2876            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2877            punpcklbw mm3, mm0         // Unpack High bytes of c
2878            movq [edi + ebx], mm7      // write back updated value
2879            movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
2880            // Do second set of 4 bytes
2881            punpckhbw mm2, mm0         // Unpack Low bytes of b
2882            punpcklbw mm1, mm0         // Unpack Low bytes of a
2883            // pav = p - a = (a + b - c) - a = b - c
2884            movq mm4, mm2
2885            // pbv = p - b = (a + b - c) - b = a - c
2886            movq mm5, mm1
2887            psubw mm4, mm3
2888            pxor mm7, mm7
2889            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2890            movq mm6, mm4
2891            psubw mm5, mm3
2892            // pa = abs(p-a) = abs(pav)
2893            // pb = abs(p-b) = abs(pbv)
2894            // pc = abs(p-c) = abs(pcv)
2895            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2896            paddw mm6, mm5
2897            pand mm0, mm4          // Only pav bytes < 0 in mm7
2898            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2899            psubw mm4, mm0
2900            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2901            psubw mm4, mm0
2902            psubw mm5, mm7
2903            pxor mm0, mm0
2904            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2905            pand mm0, mm6          // Only pav bytes < 0 in mm7
2906            psubw mm5, mm7
2907            psubw mm6, mm0
2908            //  test pa <= pb
2909            movq mm7, mm4
2910            psubw mm6, mm0
2911            pcmpgtw mm7, mm5       // pa > pb?
2912            movq mm0, mm7
2913            // use mm7 mask to merge pa & pb
2914            pand mm5, mm7
2915            // use mm0 mask copy to merge a & b
2916            pand mm2, mm0
2917            pandn mm7, mm4
2918            pandn mm0, mm1
2919            paddw mm7, mm5
2920            paddw mm0, mm2
2921            //  test  ((pa <= pb)? pa:pb) <= pc
2922            pcmpgtw mm7, mm6       // pab > pc?
2923            pxor mm1, mm1
2924            pand mm3, mm7
2925            pandn mm7, mm0
2926            pxor mm1, mm1
2927            paddw mm7, mm3
2928            pxor mm0, mm0
2929            // Step ebx to next set of 8 bytes and repeat loop til done
2930            add ebx, 8
2931            packuswb mm1, mm7
2932            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2933            cmp ebx, MMXLength
2934            movq [edi + ebx - 8], mm1      // write back updated value
2935                                // mm1 will be used as Raw(x-bpp) next loop
2936            jb dpth4lp
2937         } // end _asm block
2938      }
2939      break;
2940      case 8:                          // bpp == 8
2941      {
2942         ActiveMask.use  = 0x00000000ffffffff;
2943         _asm {
2944            mov ebx, diff
2945            mov edi, row
2946            mov esi, prev_row
2947            pxor mm0, mm0
2948            // PRIME the pump (load the first Raw(x-bpp) data set
2949            movq mm1, [edi+ebx-8]      // Only time should need to read
2950                                       //  a=Raw(x-bpp) bytes
2951dpth8lp:
2952            // Do first set of 4 bytes
2953            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2954            punpcklbw mm1, mm0         // Unpack Low bytes of a
2955            movq mm2, [esi + ebx]      // load b=Prior(x)
2956            punpcklbw mm2, mm0         // Unpack Low bytes of b
2957            // pav = p - a = (a + b - c) - a = b - c
2958            movq mm4, mm2
2959            punpcklbw mm3, mm0         // Unpack Low bytes of c
2960            // pbv = p - b = (a + b - c) - b = a - c
2961            movq mm5, mm1
2962            psubw mm4, mm3
2963            pxor mm7, mm7
2964            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2965            movq mm6, mm4
2966            psubw mm5, mm3
2967            // pa = abs(p-a) = abs(pav)
2968            // pb = abs(p-b) = abs(pbv)
2969            // pc = abs(p-c) = abs(pcv)
2970            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2971            paddw mm6, mm5
2972            pand mm0, mm4          // Only pav bytes < 0 in mm7
2973            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2974            psubw mm4, mm0
2975            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2976            psubw mm4, mm0
2977            psubw mm5, mm7
2978            pxor mm0, mm0
2979            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2980            pand mm0, mm6          // Only pav bytes < 0 in mm7
2981            psubw mm5, mm7
2982            psubw mm6, mm0
2983            //  test pa <= pb
2984            movq mm7, mm4
2985            psubw mm6, mm0
2986            pcmpgtw mm7, mm5       // pa > pb?
2987            movq mm0, mm7
2988            // use mm7 mask to merge pa & pb
2989            pand mm5, mm7
2990            // use mm0 mask copy to merge a & b
2991            pand mm2, mm0
2992            pandn mm7, mm4
2993            pandn mm0, mm1
2994            paddw mm7, mm5
2995            paddw mm0, mm2
2996            //  test  ((pa <= pb)? pa:pb) <= pc
2997            pcmpgtw mm7, mm6       // pab > pc?
2998            pxor mm1, mm1
2999            pand mm3, mm7
3000            pandn mm7, mm0
3001            paddw mm7, mm3
3002            pxor mm0, mm0
3003            packuswb mm7, mm1
3004            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
3005            pand mm7, ActiveMask
3006            movq mm2, [esi + ebx]    // load b=Prior(x)
3007            paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
3008            punpckhbw mm3, mm0       // Unpack High bytes of c
3009            movq [edi + ebx], mm7    // write back updated value
3010            movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
3011
3012            // Do second set of 4 bytes
3013            punpckhbw mm2, mm0       // Unpack High bytes of b
3014            punpckhbw mm1, mm0       // Unpack High bytes of a
3015            // pav = p - a = (a + b - c) - a = b - c
3016            movq mm4, mm2
3017            // pbv = p - b = (a + b - c) - b = a - c
3018            movq mm5, mm1
3019            psubw mm4, mm3
3020            pxor mm7, mm7
3021            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3022            movq mm6, mm4
3023            psubw mm5, mm3
3024            // pa = abs(p-a) = abs(pav)
3025            // pb = abs(p-b) = abs(pbv)
3026            // pc = abs(p-c) = abs(pcv)
3027            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
3028            paddw mm6, mm5
3029            pand mm0, mm4          // Only pav bytes < 0 in mm7
3030            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
3031            psubw mm4, mm0
3032            pand mm7, mm5          // Only pbv bytes < 0 in mm0
3033            psubw mm4, mm0
3034            psubw mm5, mm7
3035            pxor mm0, mm0
3036            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3037            pand mm0, mm6          // Only pav bytes < 0 in mm7
3038            psubw mm5, mm7
3039            psubw mm6, mm0
3040            //  test pa <= pb
3041            movq mm7, mm4
3042            psubw mm6, mm0
3043            pcmpgtw mm7, mm5       // pa > pb?
3044            movq mm0, mm7
3045            // use mm7 mask to merge pa & pb
3046            pand mm5, mm7
3047            // use mm0 mask copy to merge a & b
3048            pand mm2, mm0
3049            pandn mm7, mm4
3050            pandn mm0, mm1
3051            paddw mm7, mm5
3052            paddw mm0, mm2
3053            //  test  ((pa <= pb)? pa:pb) <= pc
3054            pcmpgtw mm7, mm6       // pab > pc?
3055            pxor mm1, mm1
3056            pand mm3, mm7
3057            pandn mm7, mm0
3058            pxor mm1, mm1
3059            paddw mm7, mm3
3060            pxor mm0, mm0
3061            // Step ebx to next set of 8 bytes and repeat loop til done
3062            add ebx, 8
3063            packuswb mm1, mm7
3064            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
3065            cmp ebx, MMXLength
3066            movq [edi + ebx - 8], mm1      // write back updated value
3067                            // mm1 will be used as Raw(x-bpp) next loop
3068            jb dpth8lp
3069         } // end _asm block
3070      }
3071      break;
3072
3073      case 1:                // bpp = 1
3074      case 2:                // bpp = 2
3075      default:               // bpp > 8
3076      {
3077         _asm {
3078            mov ebx, diff
3079            cmp ebx, FullLength
3080            jnb dpthdend
3081            mov edi, row
3082            mov esi, prev_row
3083            // Do Paeth decode for remaining bytes
3084            mov edx, ebx
3085            xor ecx, ecx        // zero ecx before using cl & cx in loop below
3086            sub edx, bpp        // Set edx = ebx - bpp
3087dpthdlp:
3088            xor eax, eax
3089            // pav = p - a = (a + b - c) - a = b - c
3090            mov al, [esi + ebx]        // load Prior(x) into al
3091            mov cl, [esi + edx]        // load Prior(x-bpp) into cl
3092            sub eax, ecx                 // subtract Prior(x-bpp)
3093            mov patemp, eax                 // Save pav for later use
3094            xor eax, eax
3095            // pbv = p - b = (a + b - c) - b = a - c
3096            mov al, [edi + edx]        // load Raw(x-bpp) into al
3097            sub eax, ecx                 // subtract Prior(x-bpp)
3098            mov ecx, eax
3099            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3100            add eax, patemp                 // pcv = pav + pbv
3101            // pc = abs(pcv)
3102            test eax, 0x80000000
3103            jz dpthdpca
3104            neg eax                     // reverse sign of neg values
3105dpthdpca:
3106            mov pctemp, eax             // save pc for later use
3107            // pb = abs(pbv)
3108            test ecx, 0x80000000
3109            jz dpthdpba
3110            neg ecx                     // reverse sign of neg values
3111dpthdpba:
3112            mov pbtemp, ecx             // save pb for later use
3113            // pa = abs(pav)
3114            mov eax, patemp
3115            test eax, 0x80000000
3116            jz dpthdpaa
3117            neg eax                     // reverse sign of neg values
3118dpthdpaa:
3119            mov patemp, eax             // save pa for later use
3120            // test if pa <= pb
3121            cmp eax, ecx
3122            jna dpthdabb
3123            // pa > pb; now test if pb <= pc
3124            cmp ecx, pctemp
3125            jna dpthdbbc
3126            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3127            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3128            jmp dpthdpaeth
3129dpthdbbc:
3130            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3131            mov cl, [esi + ebx]        // load Prior(x) into cl
3132            jmp dpthdpaeth
3133dpthdabb:
3134            // pa <= pb; now test if pa <= pc
3135            cmp eax, pctemp
3136            jna dpthdabc
3137            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3138            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3139            jmp dpthdpaeth
3140dpthdabc:
3141            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3142            mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3143dpthdpaeth:
3144            inc ebx
3145            inc edx
3146            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3147            add [edi + ebx - 1], cl
3148            cmp ebx, FullLength
3149            jb dpthdlp
3150dpthdend:
3151         } // end _asm block
3152      }
3153      return;                   // No need to go further with this one
3154   }                         // end switch ( bpp )
3155   _asm
3156   {
3157         // MMX acceleration complete now do clean-up
3158         // Check if any remaining bytes left to decode
3159         mov ebx, MMXLength
3160         cmp ebx, FullLength
3161         jnb dpthend
3162         mov edi, row
3163         mov esi, prev_row
3164         // Do Paeth decode for remaining bytes
3165         mov edx, ebx
3166         xor ecx, ecx         // zero ecx before using cl & cx in loop below
3167         sub edx, bpp         // Set edx = ebx - bpp
3168dpthlp2:
3169         xor eax, eax
3170         // pav = p - a = (a + b - c) - a = b - c
3171         mov al, [esi + ebx]  // load Prior(x) into al
3172         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3173         sub eax, ecx         // subtract Prior(x-bpp)
3174         mov patemp, eax      // Save pav for later use
3175         xor eax, eax
3176         // pbv = p - b = (a + b - c) - b = a - c
3177         mov al, [edi + edx]  // load Raw(x-bpp) into al
3178         sub eax, ecx         // subtract Prior(x-bpp)
3179         mov ecx, eax
3180         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3181         add eax, patemp      // pcv = pav + pbv
3182         // pc = abs(pcv)
3183         test eax, 0x80000000
3184         jz dpthpca2
3185         neg eax              // reverse sign of neg values
3186dpthpca2:
3187         mov pctemp, eax      // save pc for later use
3188         // pb = abs(pbv)
3189         test ecx, 0x80000000
3190         jz dpthpba2
3191         neg ecx              // reverse sign of neg values
3192dpthpba2:
3193         mov pbtemp, ecx      // save pb for later use
3194         // pa = abs(pav)
3195         mov eax, patemp
3196         test eax, 0x80000000
3197         jz dpthpaa2
3198         neg eax              // reverse sign of neg values
3199dpthpaa2:
3200         mov patemp, eax      // save pa for later use
3201         // test if pa <= pb
3202         cmp eax, ecx
3203         jna dpthabb2
3204         // pa > pb; now test if pb <= pc
3205         cmp ecx, pctemp
3206         jna dpthbbc2
3207         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3208         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3209         jmp dpthpaeth2
3210dpthbbc2:
3211         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3212         mov cl, [esi + ebx]        // load Prior(x) into cl
3213         jmp dpthpaeth2
3214dpthabb2:
3215         // pa <= pb; now test if pa <= pc
3216         cmp eax, pctemp
3217         jna dpthabc2
3218         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3219         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3220         jmp dpthpaeth2
3221dpthabc2:
3222         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3223         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3224dpthpaeth2:
3225         inc ebx
3226         inc edx
3227         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3228         add [edi + ebx - 1], cl
3229         cmp ebx, FullLength
3230         jb dpthlp2
3231dpthend:
3232         emms             // End MMX instructions; prep for possible FP instrs.
3233   } // end _asm block
3234}
3235
3236// Optimized code for PNG Sub filter decoder
3237void /* PRIVATE */
3238png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3239{
3240   //int test;
3241   int bpp;
3242   png_uint_32 FullLength;
3243   png_uint_32 MMXLength;
3244   int diff;
3245
3246   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3247   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
3248   _asm {
3249        mov edi, row
3250        mov esi, edi               // lp = row
3251        add edi, bpp               // rp = row + bpp
3252        xor eax, eax
3253        // get # of bytes to alignment
3254        mov diff, edi               // take start of row
3255        add diff, 0xf               // add 7 + 8 to incr past
3256                                        // alignment boundary
3257        xor ebx, ebx
3258        and diff, 0xfffffff8        // mask to alignment boundary
3259        sub diff, edi               // subtract from start ==> value
3260                                        //  ebx at alignment
3261        jz dsubgo
3262        // fix alignment
3263dsublp1:
3264        mov al, [esi+ebx]
3265        add [edi+ebx], al
3266        inc ebx
3267        cmp ebx, diff
3268        jb dsublp1
3269dsubgo:
3270        mov ecx, FullLength
3271        mov edx, ecx
3272        sub edx, ebx                  // subtract alignment fix
3273        and edx, 0x00000007           // calc bytes over mult of 8
3274        sub ecx, edx                  // drop over bytes from length
3275        mov MMXLength, ecx
3276   } // end _asm block
3277
3278   // Now do the math for the rest of the row
3279   switch ( bpp )
3280   {
3281        case 3:
3282        {
3283         ActiveMask.use  = 0x0000ffffff000000;
3284         ShiftBpp.use = 24;       // == 3 * 8
3285         ShiftRem.use  = 40;      // == 64 - 24
3286         _asm {
3287            mov edi, row
3288            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3289            mov esi, edi              // lp = row
3290            add edi, bpp          // rp = row + bpp
3291            movq mm6, mm7
3292            mov ebx, diff
3293            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
3294                                  // byte group
3295            // PRIME the pump (load the first Raw(x-bpp) data set
3296            movq mm1, [edi+ebx-8]
3297dsub3lp:
3298            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
3299                          // no need for mask; shift clears inactive bytes
3300            // Add 1st active group
3301            movq mm0, [edi+ebx]
3302            paddb mm0, mm1
3303            // Add 2nd active group
3304            movq mm1, mm0         // mov updated Raws to mm1
3305            psllq mm1, ShiftBpp   // shift data to position correctly
3306            pand mm1, mm7         // mask to use only 2nd active group
3307            paddb mm0, mm1
3308            // Add 3rd active group
3309            movq mm1, mm0         // mov updated Raws to mm1
3310            psllq mm1, ShiftBpp   // shift data to position correctly
3311            pand mm1, mm6         // mask to use only 3rd active group
3312            add ebx, 8
3313            paddb mm0, mm1
3314            cmp ebx, MMXLength
3315            movq [edi+ebx-8], mm0     // Write updated Raws back to array
3316            // Prep for doing 1st add at top of loop
3317            movq mm1, mm0
3318            jb dsub3lp
3319         } // end _asm block
3320      }
3321      break;
3322
3323      case 1:
3324      {
3325         // Placed here just in case this is a duplicate of the
3326         // non-MMX code for the SUB filter in png_read_filter_row above
3327         //
3328         //         png_bytep rp;
3329         //         png_bytep lp;
3330         //         png_uint_32 i;
3331         //         bpp = (row_info->pixel_depth + 7) >> 3;
3332         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3333         //            i < row_info->rowbytes; i++, rp++, lp++)
3334         //      {
3335         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3336         //      }
3337         _asm {
3338            mov ebx, diff
3339            mov edi, row
3340            cmp ebx, FullLength
3341            jnb dsub1end
3342            mov esi, edi          // lp = row
3343            xor eax, eax
3344            add edi, bpp      // rp = row + bpp
3345dsub1lp:
3346            mov al, [esi+ebx]
3347            add [edi+ebx], al
3348            inc ebx
3349            cmp ebx, FullLength
3350            jb dsub1lp
3351dsub1end:
3352         } // end _asm block
3353      }
3354      return;
3355
3356      case 6:
3357      case 7:
3358      case 4:
3359      case 5:
3360      {
3361         ShiftBpp.use = bpp << 3;
3362         ShiftRem.use = 64 - ShiftBpp.use;
3363         _asm {
3364            mov edi, row
3365            mov ebx, diff
3366            mov esi, edi               // lp = row
3367            add edi, bpp           // rp = row + bpp
3368            // PRIME the pump (load the first Raw(x-bpp) data set
3369            movq mm1, [edi+ebx-8]
3370dsub4lp:
3371            psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3372                          // no need for mask; shift clears inactive bytes
3373            movq mm0, [edi+ebx]
3374            paddb mm0, mm1
3375            // Add 2nd active group
3376            movq mm1, mm0          // mov updated Raws to mm1
3377            psllq mm1, ShiftBpp    // shift data to position correctly
3378                                   // there is no need for any mask
3379                                   // since shift clears inactive bits/bytes
3380            add ebx, 8
3381            paddb mm0, mm1
3382            cmp ebx, MMXLength
3383            movq [edi+ebx-8], mm0
3384            movq mm1, mm0          // Prep for doing 1st add at top of loop
3385            jb dsub4lp
3386         } // end _asm block
3387      }
3388      break;
3389
3390      case 2:
3391      {
3392         ActiveMask.use  = 0x00000000ffff0000;
3393         ShiftBpp.use = 16;       // == 2 * 8
3394         ShiftRem.use = 48;       // == 64 - 16
3395         _asm {
3396            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3397            mov ebx, diff
3398            movq mm6, mm7
3399            mov edi, row
3400            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
3401                                    //  byte group
3402            mov esi, edi            // lp = row
3403            movq mm5, mm6
3404            add edi, bpp            // rp = row + bpp
3405            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
3406                                    //  byte group
3407            // PRIME the pump (load the first Raw(x-bpp) data set
3408            movq mm1, [edi+ebx-8]
3409dsub2lp:
3410            // Add 1st active group
3411            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
3412                                    // no need for mask; shift clears inactive
3413                                    //  bytes
3414            movq mm0, [edi+ebx]
3415            paddb mm0, mm1
3416            // Add 2nd active group
3417            movq mm1, mm0           // mov updated Raws to mm1
3418            psllq mm1, ShiftBpp     // shift data to position correctly
3419            pand mm1, mm7           // mask to use only 2nd active group
3420            paddb mm0, mm1
3421            // Add 3rd active group
3422            movq mm1, mm0           // mov updated Raws to mm1
3423            psllq mm1, ShiftBpp     // shift data to position correctly
3424            pand mm1, mm6           // mask to use only 3rd active group
3425            paddb mm0, mm1
3426            // Add 4th active group
3427            movq mm1, mm0           // mov updated Raws to mm1
3428            psllq mm1, ShiftBpp     // shift data to position correctly
3429            pand mm1, mm5           // mask to use only 4th active group
3430            add ebx, 8
3431            paddb mm0, mm1
3432            cmp ebx, MMXLength
3433            movq [edi+ebx-8], mm0   // Write updated Raws back to array
3434            movq mm1, mm0           // Prep for doing 1st add at top of loop
3435            jb dsub2lp
3436         } // end _asm block
3437      }
3438      break;
3439      case 8:
3440      {
3441         _asm {
3442            mov edi, row
3443            mov ebx, diff
3444            mov esi, edi            // lp = row
3445            add edi, bpp            // rp = row + bpp
3446            mov ecx, MMXLength
3447            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
3448                                    // Raw(x-bpp) data set
3449            and ecx, 0x0000003f     // calc bytes over mult of 64
3450dsub8lp:
3451            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
3452            paddb mm0, mm7
3453            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
3454            movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
3455                                   // Now mm0 will be used as Raw(x-bpp) for
3456                                   // the 2nd group of 8 bytes.  This will be
3457                                   // repeated for each group of 8 bytes with
3458                                   // the 8th group being used as the Raw(x-bpp)
3459                                   // for the 1st group of the next loop.
3460            paddb mm1, mm0
3461            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
3462            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
3463            paddb mm2, mm1
3464            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
3465            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
3466            paddb mm3, mm2
3467            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
3468            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
3469            paddb mm4, mm3
3470            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
3471            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
3472            paddb mm5, mm4
3473            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
3474            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
3475            paddb mm6, mm5
3476            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
3477            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
3478            add ebx, 64
3479            paddb mm7, mm6
3480            cmp ebx, ecx
3481            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
3482            jb dsub8lp
3483            cmp ebx, MMXLength
3484            jnb dsub8lt8
3485dsub8lpA:
3486            movq mm0, [edi+ebx]
3487            add ebx, 8
3488            paddb mm0, mm7
3489            cmp ebx, MMXLength
3490            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
3491            movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
3492                                    // be the new Raw(x-bpp) for the next loop
3493            jb dsub8lpA
3494dsub8lt8:
3495         } // end _asm block
3496      }
3497      break;
3498
3499      default:                // bpp greater than 8 bytes
3500      {
3501         _asm {
3502            mov ebx, diff
3503            mov edi, row
3504            mov esi, edi           // lp = row
3505            add edi, bpp           // rp = row + bpp
3506dsubAlp:
3507            movq mm0, [edi+ebx]
3508            movq mm1, [esi+ebx]
3509            add ebx, 8
3510            paddb mm0, mm1
3511            cmp ebx, MMXLength
3512            movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
3513                                   //  add ebx
3514            jb dsubAlp
3515         } // end _asm block
3516      }
3517      break;
3518
3519   } // end switch ( bpp )
3520
3521   _asm {
3522        mov ebx, MMXLength
3523        mov edi, row
3524        cmp ebx, FullLength
3525        jnb dsubend
3526        mov esi, edi               // lp = row
3527        xor eax, eax
3528        add edi, bpp               // rp = row + bpp
3529dsublp2:
3530        mov al, [esi+ebx]
3531        add [edi+ebx], al
3532        inc ebx
3533        cmp ebx, FullLength
3534        jb dsublp2
3535dsubend:
3536        emms             // End MMX instructions; prep for possible FP instrs.
3537   } // end _asm block
3538}
3539
3540// Optimized code for PNG Up filter decoder
3541void /* PRIVATE */
3542png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3543   png_bytep prev_row)
3544{
3545   png_uint_32 len;
3546   len  = row_info->rowbytes;       // # of bytes to filter
3547   _asm {
3548      mov edi, row
3549      // get # of bytes to alignment
3550      mov ecx, edi
3551      xor ebx, ebx
3552      add ecx, 0x7
3553      xor eax, eax
3554      and ecx, 0xfffffff8
3555      mov esi, prev_row
3556      sub ecx, edi
3557      jz dupgo
3558      // fix alignment
3559duplp1:
3560      mov al, [edi+ebx]
3561      add al, [esi+ebx]
3562      inc ebx
3563      cmp ebx, ecx
3564      mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
3565      jb duplp1
3566dupgo:
3567      mov ecx, len
3568      mov edx, ecx
3569      sub edx, ebx                  // subtract alignment fix
3570      and edx, 0x0000003f           // calc bytes over mult of 64
3571      sub ecx, edx                  // drop over bytes from length
3572      // Unrolled loop - use all MMX registers and interleave to reduce
3573      // number of branch instructions (loops) and reduce partial stalls
3574duploop:
3575      movq mm1, [esi+ebx]
3576      movq mm0, [edi+ebx]
3577      movq mm3, [esi+ebx+8]
3578      paddb mm0, mm1
3579      movq mm2, [edi+ebx+8]
3580      movq [edi+ebx], mm0
3581      paddb mm2, mm3
3582      movq mm5, [esi+ebx+16]
3583      movq [edi+ebx+8], mm2
3584      movq mm4, [edi+ebx+16]
3585      movq mm7, [esi+ebx+24]
3586      paddb mm4, mm5
3587      movq mm6, [edi+ebx+24]
3588      movq [edi+ebx+16], mm4
3589      paddb mm6, mm7
3590      movq mm1, [esi+ebx+32]
3591      movq [edi+ebx+24], mm6
3592      movq mm0, [edi+ebx+32]
3593      movq mm3, [esi+ebx+40]
3594      paddb mm0, mm1
3595      movq mm2, [edi+ebx+40]
3596      movq [edi+ebx+32], mm0
3597      paddb mm2, mm3
3598      movq mm5, [esi+ebx+48]
3599      movq [edi+ebx+40], mm2
3600      movq mm4, [edi+ebx+48]
3601      movq mm7, [esi+ebx+56]
3602      paddb mm4, mm5
3603      movq mm6, [edi+ebx+56]
3604      movq [edi+ebx+48], mm4
3605      add ebx, 64
3606      paddb mm6, mm7
3607      cmp ebx, ecx
3608      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3609                                     // -8 to offset add ebx
3610      jb duploop
3611
3612      cmp edx, 0                     // Test for bytes over mult of 64
3613      jz dupend
3614
3615
3616      // 2 lines added by lcreeve@netins.net
3617      // (mail 11 Jul 98 in png-implement list)
3618      cmp edx, 8 //test for less than 8 bytes
3619      jb duplt8
3620
3621
3622      add ecx, edx
3623      and edx, 0x00000007           // calc bytes over mult of 8
3624      sub ecx, edx                  // drop over bytes from length
3625      jz duplt8
3626      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3627duplpA:
3628      movq mm1, [esi+ebx]
3629      movq mm0, [edi+ebx]
3630      add ebx, 8
3631      paddb mm0, mm1
3632      cmp ebx, ecx
3633      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3634      jb duplpA
3635      cmp edx, 0            // Test for bytes over mult of 8
3636      jz dupend
3637duplt8:
3638      xor eax, eax
3639      add ecx, edx          // move over byte count into counter
3640      // Loop using x86 registers to update remaining bytes
3641duplp2:
3642      mov al, [edi + ebx]
3643      add al, [esi + ebx]
3644      inc ebx
3645      cmp ebx, ecx
3646      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3647      jb duplp2
3648dupend:
3649      // Conversion of filtered row completed
3650      emms          // End MMX instructions; prep for possible FP instrs.
3651   } // end _asm block
3652}
3653
3654
3655// Optimized png_read_filter_row routines
3656void /* PRIVATE */
3657png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3658   row, png_bytep prev_row, int filter)
3659{
3660#ifdef PNG_DEBUG
3661   char filnm[6];
3662#endif
3663#define UseMMX 1
3664
3665   if (mmx_supported == 2)
3666       mmx_supported = mmxsupport();
3667
3668   if (!mmx_supported)
3669   {
3670       png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3671       return ;
3672   }
3673
3674#ifdef PNG_DEBUG
3675   png_debug(1, "in png_read_filter_row\n");
3676#  if (UseMMX == 1)
3677   png_debug1(0,"%s, ", "MMX");
3678#  else
3679   png_debug1(0,"%s, ", "x86");
3680#  endif
3681   switch (filter)
3682   {
3683      case 0: sprintf(filnm, "None ");
3684         break;
3685      case 1: sprintf(filnm, "Sub  ");
3686         break;
3687      case 2: sprintf(filnm, "Up   ");
3688         break;
3689      case 3: sprintf(filnm, "Avg  ");
3690         break;
3691      case 4: sprintf(filnm, "Paeth");
3692         break;
3693      default: sprintf(filnm, "Unknw");
3694         break;
3695   }
3696   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3697   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3698      (int)((row_info->pixel_depth + 7) >> 3));
3699   png_debug1(0,"len=%8d, ", row_info->rowbytes);
3700#endif
3701
3702   switch (filter)
3703   {
3704      case PNG_FILTER_VALUE_NONE:
3705         break;
3706      case PNG_FILTER_VALUE_SUB:
3707      {
3708#if (UseMMX == 1)
3709         if ((row_info->pixel_depth > 8) &&
3710            (row_info->rowbytes >= 128) )
3711         {
3712            png_read_filter_row_mmx_sub(row_info, row);
3713         }
3714         else
3715#endif
3716         {
3717            png_uint_32 i;
3718            png_uint_32 istop = row_info->rowbytes;
3719            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3720            png_bytep rp = row + bpp;
3721            png_bytep lp = row;
3722
3723            for (i = bpp; i < istop; i++)
3724            {
3725               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3726               rp++;
3727            }
3728         }  //end !UseMMX
3729         break;
3730      }
3731      case PNG_FILTER_VALUE_UP:
3732      {
3733#if (UseMMX == 1)
3734         if ((row_info->pixel_depth > 8) &&
3735             (row_info->rowbytes >= 128) )
3736         {
3737            png_read_filter_row_mmx_up(row_info, row, prev_row);
3738         }  //end if UseMMX
3739         else
3740#endif
3741         {
3742            png_bytep rp;
3743            png_bytep pp;
3744            png_uint_32 i;
3745            for (i = 0, rp = row, pp = prev_row;
3746               i < row_info->rowbytes; i++, rp++, pp++)
3747            {
3748                  *rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3749            }
3750         }  //end !UseMMX
3751         break;
3752      }
3753      case PNG_FILTER_VALUE_AVG:
3754      {
3755#if (UseMMX == 1)
3756         if ((row_info->pixel_depth > 8) &&
3757             (row_info->rowbytes >= 128) )
3758         {
3759            png_read_filter_row_mmx_avg(row_info, row, prev_row);
3760         }  //end if UseMMX
3761         else
3762#endif
3763         {
3764            png_uint_32 i;
3765            png_bytep rp = row;
3766            png_bytep pp = prev_row;
3767            png_bytep lp = row;
3768            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3769            png_uint_32 istop = row_info->rowbytes - bpp;
3770
3771            for (i = 0; i < bpp; i++)
3772            {
3773               *rp = (png_byte)(((int)(*rp) +
3774                  ((int)(*pp++) >> 1)) & 0xff);
3775               rp++;
3776            }
3777
3778            for (i = 0; i < istop; i++)
3779            {
3780               *rp = (png_byte)(((int)(*rp) +
3781                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3782               rp++;
3783            }
3784         }  //end !UseMMX
3785         break;
3786      }
3787      case PNG_FILTER_VALUE_PAETH:
3788      {
3789#if (UseMMX == 1)
3790         if ((row_info->pixel_depth > 8) &&
3791             (row_info->rowbytes >= 128) )
3792         {
3793            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3794         }  //end if UseMMX
3795         else
3796#endif
3797         {
3798            png_uint_32 i;
3799            png_bytep rp = row;
3800            png_bytep pp = prev_row;
3801            png_bytep lp = row;
3802            png_bytep cp = prev_row;
3803            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3804            png_uint_32 istop=row_info->rowbytes - bpp;
3805
3806            for (i = 0; i < bpp; i++)
3807            {
3808               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3809               rp++;
3810            }
3811
3812            for (i = 0; i < istop; i++)   // use leftover rp,pp
3813            {
3814               int a, b, c, pa, pb, pc, p;
3815
3816               a = *lp++;
3817               b = *pp++;
3818               c = *cp++;
3819
3820               p = b - c;
3821               pc = a - c;
3822
3823#ifdef PNG_USE_ABS
3824               pa = abs(p);
3825               pb = abs(pc);
3826               pc = abs(p + pc);
3827#else
3828               pa = p < 0 ? -p : p;
3829               pb = pc < 0 ? -pc : pc;
3830               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3831#endif
3832
3833               /*
3834                  if (pa <= pb && pa <= pc)
3835                     p = a;
3836                  else if (pb <= pc)
3837                     p = b;
3838                  else
3839                     p = c;
3840                */
3841
3842               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3843
3844               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3845               rp++;
3846            }
3847         }  //end !UseMMX
3848         break;
3849      }
3850      default:
3851         png_warning(png_ptr, "Ignoring bad adaptive filter type");
3852         *row=0;
3853         break;
3854   }
3855}
3856#endif
3857