/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng version 1.2.8 - December 3, 2004
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 *
 */
25*593dc095SDavid du Colombier
26*593dc095SDavid du Colombier #define PNG_INTERNAL
27*593dc095SDavid du Colombier #include "png.h"
28*593dc095SDavid du Colombier
29*593dc095SDavid du Colombier #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30*593dc095SDavid du Colombier
/* Cached result of the MMX probe performed by png_mmx_support():
 *   0 = MMX not available, 1 = MMX available,
 *   2 = not probed yet (initial value; png_combine_row() triggers the
 *       probe when it still sees this value).
 */
static int mmx_supported = 2;
32*593dc095SDavid du Colombier
33*593dc095SDavid du Colombier
34*593dc095SDavid du Colombier int PNGAPI
png_mmx_support(void)35*593dc095SDavid du Colombier png_mmx_support(void)
36*593dc095SDavid du Colombier {
37*593dc095SDavid du Colombier int mmx_supported_local = 0;
38*593dc095SDavid du Colombier _asm {
39*593dc095SDavid du Colombier push ebx //CPUID will trash these
40*593dc095SDavid du Colombier push ecx
41*593dc095SDavid du Colombier push edx
42*593dc095SDavid du Colombier
43*593dc095SDavid du Colombier pushfd //Save Eflag to stack
44*593dc095SDavid du Colombier pop eax //Get Eflag from stack into eax
45*593dc095SDavid du Colombier mov ecx, eax //Make another copy of Eflag in ecx
46*593dc095SDavid du Colombier xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
47*593dc095SDavid du Colombier push eax //Save modified Eflag back to stack
48*593dc095SDavid du Colombier
49*593dc095SDavid du Colombier popfd //Restored modified value back to Eflag reg
50*593dc095SDavid du Colombier pushfd //Save Eflag to stack
51*593dc095SDavid du Colombier pop eax //Get Eflag from stack
52*593dc095SDavid du Colombier push ecx // save original Eflag to stack
53*593dc095SDavid du Colombier popfd // restore original Eflag
54*593dc095SDavid du Colombier xor eax, ecx //Compare the new Eflag with the original Eflag
55*593dc095SDavid du Colombier jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
56*593dc095SDavid du Colombier //skip following instructions and jump to
57*593dc095SDavid du Colombier //NOT_SUPPORTED label
58*593dc095SDavid du Colombier
59*593dc095SDavid du Colombier xor eax, eax //Set eax to zero
60*593dc095SDavid du Colombier
61*593dc095SDavid du Colombier _asm _emit 0x0f //CPUID instruction (two bytes opcode)
62*593dc095SDavid du Colombier _asm _emit 0xa2
63*593dc095SDavid du Colombier
64*593dc095SDavid du Colombier cmp eax, 1 //make sure eax return non-zero value
65*593dc095SDavid du Colombier jl NOT_SUPPORTED //If eax is zero, mmx not supported
66*593dc095SDavid du Colombier
67*593dc095SDavid du Colombier xor eax, eax //set eax to zero
68*593dc095SDavid du Colombier inc eax //Now increment eax to 1. This instruction is
69*593dc095SDavid du Colombier //faster than the instruction "mov eax, 1"
70*593dc095SDavid du Colombier
71*593dc095SDavid du Colombier _asm _emit 0x0f //CPUID instruction
72*593dc095SDavid du Colombier _asm _emit 0xa2
73*593dc095SDavid du Colombier
74*593dc095SDavid du Colombier and edx, 0x00800000 //mask out all bits but mmx bit(24)
75*593dc095SDavid du Colombier cmp edx, 0 // 0 = mmx not supported
76*593dc095SDavid du Colombier jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
77*593dc095SDavid du Colombier
78*593dc095SDavid du Colombier mov mmx_supported_local, 1 //set return value to 1
79*593dc095SDavid du Colombier
80*593dc095SDavid du Colombier NOT_SUPPORTED:
81*593dc095SDavid du Colombier mov eax, mmx_supported_local //move return value to eax
82*593dc095SDavid du Colombier pop edx //CPUID trashed these
83*593dc095SDavid du Colombier pop ecx
84*593dc095SDavid du Colombier pop ebx
85*593dc095SDavid du Colombier }
86*593dc095SDavid du Colombier
87*593dc095SDavid du Colombier //mmx_supported_local=0; // test code for force don't support MMX
88*593dc095SDavid du Colombier //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
89*593dc095SDavid du Colombier
90*593dc095SDavid du Colombier mmx_supported = mmx_supported_local;
91*593dc095SDavid du Colombier return mmx_supported_local;
92*593dc095SDavid du Colombier }
93*593dc095SDavid du Colombier
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask.  */
104*593dc095SDavid du Colombier
/* Use this routine on x86 platforms; it uses the faster MMX code path
   if the machine supports MMX. */
107*593dc095SDavid du Colombier
108*593dc095SDavid du Colombier void /* PRIVATE */
png_combine_row(png_structp png_ptr,png_bytep row,int mask)109*593dc095SDavid du Colombier png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110*593dc095SDavid du Colombier {
111*593dc095SDavid du Colombier #ifdef PNG_USE_LOCAL_ARRAYS
112*593dc095SDavid du Colombier const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113*593dc095SDavid du Colombier #endif
114*593dc095SDavid du Colombier
115*593dc095SDavid du Colombier png_debug(1,"in png_combine_row_asm\n");
116*593dc095SDavid du Colombier
117*593dc095SDavid du Colombier if (mmx_supported == 2) {
118*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
119*593dc095SDavid du Colombier /* this should have happened in png_init_mmx_flags() already */
120*593dc095SDavid du Colombier png_warning(png_ptr, "asm_flags may not have been initialized");
121*593dc095SDavid du Colombier #endif
122*593dc095SDavid du Colombier png_mmx_support();
123*593dc095SDavid du Colombier }
124*593dc095SDavid du Colombier
125*593dc095SDavid du Colombier if (mask == 0xff)
126*593dc095SDavid du Colombier {
127*593dc095SDavid du Colombier png_memcpy(row, png_ptr->row_buf + 1,
128*593dc095SDavid du Colombier (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
129*593dc095SDavid du Colombier png_ptr->width));
130*593dc095SDavid du Colombier }
131*593dc095SDavid du Colombier /* GRR: add "else if (mask == 0)" case?
132*593dc095SDavid du Colombier * or does png_combine_row() not even get called in that case? */
133*593dc095SDavid du Colombier else
134*593dc095SDavid du Colombier {
135*593dc095SDavid du Colombier switch (png_ptr->row_info.pixel_depth)
136*593dc095SDavid du Colombier {
137*593dc095SDavid du Colombier case 1:
138*593dc095SDavid du Colombier {
139*593dc095SDavid du Colombier png_bytep sp;
140*593dc095SDavid du Colombier png_bytep dp;
141*593dc095SDavid du Colombier int s_inc, s_start, s_end;
142*593dc095SDavid du Colombier int m;
143*593dc095SDavid du Colombier int shift;
144*593dc095SDavid du Colombier png_uint_32 i;
145*593dc095SDavid du Colombier
146*593dc095SDavid du Colombier sp = png_ptr->row_buf + 1;
147*593dc095SDavid du Colombier dp = row;
148*593dc095SDavid du Colombier m = 0x80;
149*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
150*593dc095SDavid du Colombier if (png_ptr->transformations & PNG_PACKSWAP)
151*593dc095SDavid du Colombier {
152*593dc095SDavid du Colombier s_start = 0;
153*593dc095SDavid du Colombier s_end = 7;
154*593dc095SDavid du Colombier s_inc = 1;
155*593dc095SDavid du Colombier }
156*593dc095SDavid du Colombier else
157*593dc095SDavid du Colombier #endif
158*593dc095SDavid du Colombier {
159*593dc095SDavid du Colombier s_start = 7;
160*593dc095SDavid du Colombier s_end = 0;
161*593dc095SDavid du Colombier s_inc = -1;
162*593dc095SDavid du Colombier }
163*593dc095SDavid du Colombier
164*593dc095SDavid du Colombier shift = s_start;
165*593dc095SDavid du Colombier
166*593dc095SDavid du Colombier for (i = 0; i < png_ptr->width; i++)
167*593dc095SDavid du Colombier {
168*593dc095SDavid du Colombier if (m & mask)
169*593dc095SDavid du Colombier {
170*593dc095SDavid du Colombier int value;
171*593dc095SDavid du Colombier
172*593dc095SDavid du Colombier value = (*sp >> shift) & 0x1;
173*593dc095SDavid du Colombier *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
174*593dc095SDavid du Colombier *dp |= (png_byte)(value << shift);
175*593dc095SDavid du Colombier }
176*593dc095SDavid du Colombier
177*593dc095SDavid du Colombier if (shift == s_end)
178*593dc095SDavid du Colombier {
179*593dc095SDavid du Colombier shift = s_start;
180*593dc095SDavid du Colombier sp++;
181*593dc095SDavid du Colombier dp++;
182*593dc095SDavid du Colombier }
183*593dc095SDavid du Colombier else
184*593dc095SDavid du Colombier shift += s_inc;
185*593dc095SDavid du Colombier
186*593dc095SDavid du Colombier if (m == 1)
187*593dc095SDavid du Colombier m = 0x80;
188*593dc095SDavid du Colombier else
189*593dc095SDavid du Colombier m >>= 1;
190*593dc095SDavid du Colombier }
191*593dc095SDavid du Colombier break;
192*593dc095SDavid du Colombier }
193*593dc095SDavid du Colombier
194*593dc095SDavid du Colombier case 2:
195*593dc095SDavid du Colombier {
196*593dc095SDavid du Colombier png_bytep sp;
197*593dc095SDavid du Colombier png_bytep dp;
198*593dc095SDavid du Colombier int s_start, s_end, s_inc;
199*593dc095SDavid du Colombier int m;
200*593dc095SDavid du Colombier int shift;
201*593dc095SDavid du Colombier png_uint_32 i;
202*593dc095SDavid du Colombier int value;
203*593dc095SDavid du Colombier
204*593dc095SDavid du Colombier sp = png_ptr->row_buf + 1;
205*593dc095SDavid du Colombier dp = row;
206*593dc095SDavid du Colombier m = 0x80;
207*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
208*593dc095SDavid du Colombier if (png_ptr->transformations & PNG_PACKSWAP)
209*593dc095SDavid du Colombier {
210*593dc095SDavid du Colombier s_start = 0;
211*593dc095SDavid du Colombier s_end = 6;
212*593dc095SDavid du Colombier s_inc = 2;
213*593dc095SDavid du Colombier }
214*593dc095SDavid du Colombier else
215*593dc095SDavid du Colombier #endif
216*593dc095SDavid du Colombier {
217*593dc095SDavid du Colombier s_start = 6;
218*593dc095SDavid du Colombier s_end = 0;
219*593dc095SDavid du Colombier s_inc = -2;
220*593dc095SDavid du Colombier }
221*593dc095SDavid du Colombier
222*593dc095SDavid du Colombier shift = s_start;
223*593dc095SDavid du Colombier
224*593dc095SDavid du Colombier for (i = 0; i < png_ptr->width; i++)
225*593dc095SDavid du Colombier {
226*593dc095SDavid du Colombier if (m & mask)
227*593dc095SDavid du Colombier {
228*593dc095SDavid du Colombier value = (*sp >> shift) & 0x3;
229*593dc095SDavid du Colombier *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
230*593dc095SDavid du Colombier *dp |= (png_byte)(value << shift);
231*593dc095SDavid du Colombier }
232*593dc095SDavid du Colombier
233*593dc095SDavid du Colombier if (shift == s_end)
234*593dc095SDavid du Colombier {
235*593dc095SDavid du Colombier shift = s_start;
236*593dc095SDavid du Colombier sp++;
237*593dc095SDavid du Colombier dp++;
238*593dc095SDavid du Colombier }
239*593dc095SDavid du Colombier else
240*593dc095SDavid du Colombier shift += s_inc;
241*593dc095SDavid du Colombier if (m == 1)
242*593dc095SDavid du Colombier m = 0x80;
243*593dc095SDavid du Colombier else
244*593dc095SDavid du Colombier m >>= 1;
245*593dc095SDavid du Colombier }
246*593dc095SDavid du Colombier break;
247*593dc095SDavid du Colombier }
248*593dc095SDavid du Colombier
249*593dc095SDavid du Colombier case 4:
250*593dc095SDavid du Colombier {
251*593dc095SDavid du Colombier png_bytep sp;
252*593dc095SDavid du Colombier png_bytep dp;
253*593dc095SDavid du Colombier int s_start, s_end, s_inc;
254*593dc095SDavid du Colombier int m;
255*593dc095SDavid du Colombier int shift;
256*593dc095SDavid du Colombier png_uint_32 i;
257*593dc095SDavid du Colombier int value;
258*593dc095SDavid du Colombier
259*593dc095SDavid du Colombier sp = png_ptr->row_buf + 1;
260*593dc095SDavid du Colombier dp = row;
261*593dc095SDavid du Colombier m = 0x80;
262*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
263*593dc095SDavid du Colombier if (png_ptr->transformations & PNG_PACKSWAP)
264*593dc095SDavid du Colombier {
265*593dc095SDavid du Colombier s_start = 0;
266*593dc095SDavid du Colombier s_end = 4;
267*593dc095SDavid du Colombier s_inc = 4;
268*593dc095SDavid du Colombier }
269*593dc095SDavid du Colombier else
270*593dc095SDavid du Colombier #endif
271*593dc095SDavid du Colombier {
272*593dc095SDavid du Colombier s_start = 4;
273*593dc095SDavid du Colombier s_end = 0;
274*593dc095SDavid du Colombier s_inc = -4;
275*593dc095SDavid du Colombier }
276*593dc095SDavid du Colombier shift = s_start;
277*593dc095SDavid du Colombier
278*593dc095SDavid du Colombier for (i = 0; i < png_ptr->width; i++)
279*593dc095SDavid du Colombier {
280*593dc095SDavid du Colombier if (m & mask)
281*593dc095SDavid du Colombier {
282*593dc095SDavid du Colombier value = (*sp >> shift) & 0xf;
283*593dc095SDavid du Colombier *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
284*593dc095SDavid du Colombier *dp |= (png_byte)(value << shift);
285*593dc095SDavid du Colombier }
286*593dc095SDavid du Colombier
287*593dc095SDavid du Colombier if (shift == s_end)
288*593dc095SDavid du Colombier {
289*593dc095SDavid du Colombier shift = s_start;
290*593dc095SDavid du Colombier sp++;
291*593dc095SDavid du Colombier dp++;
292*593dc095SDavid du Colombier }
293*593dc095SDavid du Colombier else
294*593dc095SDavid du Colombier shift += s_inc;
295*593dc095SDavid du Colombier if (m == 1)
296*593dc095SDavid du Colombier m = 0x80;
297*593dc095SDavid du Colombier else
298*593dc095SDavid du Colombier m >>= 1;
299*593dc095SDavid du Colombier }
300*593dc095SDavid du Colombier break;
301*593dc095SDavid du Colombier }
302*593dc095SDavid du Colombier
303*593dc095SDavid du Colombier case 8:
304*593dc095SDavid du Colombier {
305*593dc095SDavid du Colombier png_bytep srcptr;
306*593dc095SDavid du Colombier png_bytep dstptr;
307*593dc095SDavid du Colombier png_uint_32 len;
308*593dc095SDavid du Colombier int m;
309*593dc095SDavid du Colombier int diff, unmask;
310*593dc095SDavid du Colombier
311*593dc095SDavid du Colombier __int64 mask0=0x0102040810204080;
312*593dc095SDavid du Colombier
313*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
314*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
315*593dc095SDavid du Colombier /* && mmx_supported */ )
316*593dc095SDavid du Colombier #else
317*593dc095SDavid du Colombier if (mmx_supported)
318*593dc095SDavid du Colombier #endif
319*593dc095SDavid du Colombier {
320*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1;
321*593dc095SDavid du Colombier dstptr = row;
322*593dc095SDavid du Colombier m = 0x80;
323*593dc095SDavid du Colombier unmask = ~mask;
324*593dc095SDavid du Colombier len = png_ptr->width &~7; //reduce to multiple of 8
325*593dc095SDavid du Colombier diff = png_ptr->width & 7; //amount lost
326*593dc095SDavid du Colombier
327*593dc095SDavid du Colombier _asm
328*593dc095SDavid du Colombier {
329*593dc095SDavid du Colombier movd mm7, unmask //load bit pattern
330*593dc095SDavid du Colombier psubb mm6,mm6 //zero mm6
331*593dc095SDavid du Colombier punpcklbw mm7,mm7
332*593dc095SDavid du Colombier punpcklwd mm7,mm7
333*593dc095SDavid du Colombier punpckldq mm7,mm7 //fill register with 8 masks
334*593dc095SDavid du Colombier
335*593dc095SDavid du Colombier movq mm0,mask0
336*593dc095SDavid du Colombier
337*593dc095SDavid du Colombier pand mm0,mm7 //nonzero if keep byte
338*593dc095SDavid du Colombier pcmpeqb mm0,mm6 //zeros->1s, v versa
339*593dc095SDavid du Colombier
340*593dc095SDavid du Colombier mov ecx,len //load length of line (pixels)
341*593dc095SDavid du Colombier mov esi,srcptr //load source
342*593dc095SDavid du Colombier mov ebx,dstptr //load dest
343*593dc095SDavid du Colombier cmp ecx,0 //lcr
344*593dc095SDavid du Colombier je mainloop8end
345*593dc095SDavid du Colombier
346*593dc095SDavid du Colombier mainloop8:
347*593dc095SDavid du Colombier movq mm4,[esi]
348*593dc095SDavid du Colombier pand mm4,mm0
349*593dc095SDavid du Colombier movq mm6,mm0
350*593dc095SDavid du Colombier pandn mm6,[ebx]
351*593dc095SDavid du Colombier por mm4,mm6
352*593dc095SDavid du Colombier movq [ebx],mm4
353*593dc095SDavid du Colombier
354*593dc095SDavid du Colombier add esi,8 //inc by 8 bytes processed
355*593dc095SDavid du Colombier add ebx,8
356*593dc095SDavid du Colombier sub ecx,8 //dec by 8 pixels processed
357*593dc095SDavid du Colombier
358*593dc095SDavid du Colombier ja mainloop8
359*593dc095SDavid du Colombier mainloop8end:
360*593dc095SDavid du Colombier
361*593dc095SDavid du Colombier mov ecx,diff
362*593dc095SDavid du Colombier cmp ecx,0
363*593dc095SDavid du Colombier jz end8
364*593dc095SDavid du Colombier
365*593dc095SDavid du Colombier mov edx,mask
366*593dc095SDavid du Colombier sal edx,24 //make low byte the high byte
367*593dc095SDavid du Colombier
368*593dc095SDavid du Colombier secondloop8:
369*593dc095SDavid du Colombier sal edx,1 //move high bit to CF
370*593dc095SDavid du Colombier jnc skip8 //if CF = 0
371*593dc095SDavid du Colombier mov al,[esi]
372*593dc095SDavid du Colombier mov [ebx],al
373*593dc095SDavid du Colombier skip8:
374*593dc095SDavid du Colombier inc esi
375*593dc095SDavid du Colombier inc ebx
376*593dc095SDavid du Colombier
377*593dc095SDavid du Colombier dec ecx
378*593dc095SDavid du Colombier jnz secondloop8
379*593dc095SDavid du Colombier end8:
380*593dc095SDavid du Colombier emms
381*593dc095SDavid du Colombier }
382*593dc095SDavid du Colombier }
383*593dc095SDavid du Colombier else /* mmx not supported - use modified C routine */
384*593dc095SDavid du Colombier {
385*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
386*593dc095SDavid du Colombier png_size_t pixel_bytes;
387*593dc095SDavid du Colombier png_uint_32 i;
388*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass];
389*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
390*593dc095SDavid du Colombier
391*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
392*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
393*593dc095SDavid du Colombier pixel_bytes;
394*593dc095SDavid du Colombier dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
395*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
396*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
397*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
398*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
399*593dc095SDavid du Colombier {
400*593dc095SDavid du Colombier png_memcpy(dstptr, srcptr, pixel_bytes);
401*593dc095SDavid du Colombier srcptr += incr1;
402*593dc095SDavid du Colombier dstptr += incr1;
403*593dc095SDavid du Colombier }
404*593dc095SDavid du Colombier } /* end of else */
405*593dc095SDavid du Colombier
406*593dc095SDavid du Colombier break;
407*593dc095SDavid du Colombier } // end 8 bpp
408*593dc095SDavid du Colombier
409*593dc095SDavid du Colombier case 16:
410*593dc095SDavid du Colombier {
411*593dc095SDavid du Colombier png_bytep srcptr;
412*593dc095SDavid du Colombier png_bytep dstptr;
413*593dc095SDavid du Colombier png_uint_32 len;
414*593dc095SDavid du Colombier int unmask, diff;
415*593dc095SDavid du Colombier __int64 mask1=0x0101020204040808,
416*593dc095SDavid du Colombier mask0=0x1010202040408080;
417*593dc095SDavid du Colombier
418*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
419*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
420*593dc095SDavid du Colombier /* && mmx_supported */ )
421*593dc095SDavid du Colombier #else
422*593dc095SDavid du Colombier if (mmx_supported)
423*593dc095SDavid du Colombier #endif
424*593dc095SDavid du Colombier {
425*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1;
426*593dc095SDavid du Colombier dstptr = row;
427*593dc095SDavid du Colombier
428*593dc095SDavid du Colombier unmask = ~mask;
429*593dc095SDavid du Colombier len = (png_ptr->width)&~7;
430*593dc095SDavid du Colombier diff = (png_ptr->width)&7;
431*593dc095SDavid du Colombier _asm
432*593dc095SDavid du Colombier {
433*593dc095SDavid du Colombier movd mm7, unmask //load bit pattern
434*593dc095SDavid du Colombier psubb mm6,mm6 //zero mm6
435*593dc095SDavid du Colombier punpcklbw mm7,mm7
436*593dc095SDavid du Colombier punpcklwd mm7,mm7
437*593dc095SDavid du Colombier punpckldq mm7,mm7 //fill register with 8 masks
438*593dc095SDavid du Colombier
439*593dc095SDavid du Colombier movq mm0,mask0
440*593dc095SDavid du Colombier movq mm1,mask1
441*593dc095SDavid du Colombier
442*593dc095SDavid du Colombier pand mm0,mm7
443*593dc095SDavid du Colombier pand mm1,mm7
444*593dc095SDavid du Colombier
445*593dc095SDavid du Colombier pcmpeqb mm0,mm6
446*593dc095SDavid du Colombier pcmpeqb mm1,mm6
447*593dc095SDavid du Colombier
448*593dc095SDavid du Colombier mov ecx,len //load length of line
449*593dc095SDavid du Colombier mov esi,srcptr //load source
450*593dc095SDavid du Colombier mov ebx,dstptr //load dest
451*593dc095SDavid du Colombier cmp ecx,0 //lcr
452*593dc095SDavid du Colombier jz mainloop16end
453*593dc095SDavid du Colombier
454*593dc095SDavid du Colombier mainloop16:
455*593dc095SDavid du Colombier movq mm4,[esi]
456*593dc095SDavid du Colombier pand mm4,mm0
457*593dc095SDavid du Colombier movq mm6,mm0
458*593dc095SDavid du Colombier movq mm7,[ebx]
459*593dc095SDavid du Colombier pandn mm6,mm7
460*593dc095SDavid du Colombier por mm4,mm6
461*593dc095SDavid du Colombier movq [ebx],mm4
462*593dc095SDavid du Colombier
463*593dc095SDavid du Colombier movq mm5,[esi+8]
464*593dc095SDavid du Colombier pand mm5,mm1
465*593dc095SDavid du Colombier movq mm7,mm1
466*593dc095SDavid du Colombier movq mm6,[ebx+8]
467*593dc095SDavid du Colombier pandn mm7,mm6
468*593dc095SDavid du Colombier por mm5,mm7
469*593dc095SDavid du Colombier movq [ebx+8],mm5
470*593dc095SDavid du Colombier
471*593dc095SDavid du Colombier add esi,16 //inc by 16 bytes processed
472*593dc095SDavid du Colombier add ebx,16
473*593dc095SDavid du Colombier sub ecx,8 //dec by 8 pixels processed
474*593dc095SDavid du Colombier
475*593dc095SDavid du Colombier ja mainloop16
476*593dc095SDavid du Colombier
477*593dc095SDavid du Colombier mainloop16end:
478*593dc095SDavid du Colombier mov ecx,diff
479*593dc095SDavid du Colombier cmp ecx,0
480*593dc095SDavid du Colombier jz end16
481*593dc095SDavid du Colombier
482*593dc095SDavid du Colombier mov edx,mask
483*593dc095SDavid du Colombier sal edx,24 //make low byte the high byte
484*593dc095SDavid du Colombier secondloop16:
485*593dc095SDavid du Colombier sal edx,1 //move high bit to CF
486*593dc095SDavid du Colombier jnc skip16 //if CF = 0
487*593dc095SDavid du Colombier mov ax,[esi]
488*593dc095SDavid du Colombier mov [ebx],ax
489*593dc095SDavid du Colombier skip16:
490*593dc095SDavid du Colombier add esi,2
491*593dc095SDavid du Colombier add ebx,2
492*593dc095SDavid du Colombier
493*593dc095SDavid du Colombier dec ecx
494*593dc095SDavid du Colombier jnz secondloop16
495*593dc095SDavid du Colombier end16:
496*593dc095SDavid du Colombier emms
497*593dc095SDavid du Colombier }
498*593dc095SDavid du Colombier }
499*593dc095SDavid du Colombier else /* mmx not supported - use modified C routine */
500*593dc095SDavid du Colombier {
501*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
502*593dc095SDavid du Colombier png_size_t pixel_bytes;
503*593dc095SDavid du Colombier png_uint_32 i;
504*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass];
505*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
506*593dc095SDavid du Colombier
507*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
508*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
509*593dc095SDavid du Colombier pixel_bytes;
510*593dc095SDavid du Colombier dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
511*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
512*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
513*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
514*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
515*593dc095SDavid du Colombier {
516*593dc095SDavid du Colombier png_memcpy(dstptr, srcptr, pixel_bytes);
517*593dc095SDavid du Colombier srcptr += incr1;
518*593dc095SDavid du Colombier dstptr += incr1;
519*593dc095SDavid du Colombier }
520*593dc095SDavid du Colombier } /* end of else */
521*593dc095SDavid du Colombier
522*593dc095SDavid du Colombier break;
523*593dc095SDavid du Colombier } // end 16 bpp
524*593dc095SDavid du Colombier
525*593dc095SDavid du Colombier case 24:
526*593dc095SDavid du Colombier {
527*593dc095SDavid du Colombier png_bytep srcptr;
528*593dc095SDavid du Colombier png_bytep dstptr;
529*593dc095SDavid du Colombier png_uint_32 len;
530*593dc095SDavid du Colombier int unmask, diff;
531*593dc095SDavid du Colombier
532*593dc095SDavid du Colombier __int64 mask2=0x0101010202020404, //24bpp
533*593dc095SDavid du Colombier mask1=0x0408080810101020,
534*593dc095SDavid du Colombier mask0=0x2020404040808080;
535*593dc095SDavid du Colombier
536*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1;
537*593dc095SDavid du Colombier dstptr = row;
538*593dc095SDavid du Colombier
539*593dc095SDavid du Colombier unmask = ~mask;
540*593dc095SDavid du Colombier len = (png_ptr->width)&~7;
541*593dc095SDavid du Colombier diff = (png_ptr->width)&7;
542*593dc095SDavid du Colombier
543*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
544*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
545*593dc095SDavid du Colombier /* && mmx_supported */ )
546*593dc095SDavid du Colombier #else
547*593dc095SDavid du Colombier if (mmx_supported)
548*593dc095SDavid du Colombier #endif
549*593dc095SDavid du Colombier {
550*593dc095SDavid du Colombier _asm
551*593dc095SDavid du Colombier {
552*593dc095SDavid du Colombier movd mm7, unmask //load bit pattern
553*593dc095SDavid du Colombier psubb mm6,mm6 //zero mm6
554*593dc095SDavid du Colombier punpcklbw mm7,mm7
555*593dc095SDavid du Colombier punpcklwd mm7,mm7
556*593dc095SDavid du Colombier punpckldq mm7,mm7 //fill register with 8 masks
557*593dc095SDavid du Colombier
558*593dc095SDavid du Colombier movq mm0,mask0
559*593dc095SDavid du Colombier movq mm1,mask1
560*593dc095SDavid du Colombier movq mm2,mask2
561*593dc095SDavid du Colombier
562*593dc095SDavid du Colombier pand mm0,mm7
563*593dc095SDavid du Colombier pand mm1,mm7
564*593dc095SDavid du Colombier pand mm2,mm7
565*593dc095SDavid du Colombier
566*593dc095SDavid du Colombier pcmpeqb mm0,mm6
567*593dc095SDavid du Colombier pcmpeqb mm1,mm6
568*593dc095SDavid du Colombier pcmpeqb mm2,mm6
569*593dc095SDavid du Colombier
570*593dc095SDavid du Colombier mov ecx,len //load length of line
571*593dc095SDavid du Colombier mov esi,srcptr //load source
572*593dc095SDavid du Colombier mov ebx,dstptr //load dest
573*593dc095SDavid du Colombier cmp ecx,0
574*593dc095SDavid du Colombier jz mainloop24end
575*593dc095SDavid du Colombier
576*593dc095SDavid du Colombier mainloop24:
577*593dc095SDavid du Colombier movq mm4,[esi]
578*593dc095SDavid du Colombier pand mm4,mm0
579*593dc095SDavid du Colombier movq mm6,mm0
580*593dc095SDavid du Colombier movq mm7,[ebx]
581*593dc095SDavid du Colombier pandn mm6,mm7
582*593dc095SDavid du Colombier por mm4,mm6
583*593dc095SDavid du Colombier movq [ebx],mm4
584*593dc095SDavid du Colombier
585*593dc095SDavid du Colombier
586*593dc095SDavid du Colombier movq mm5,[esi+8]
587*593dc095SDavid du Colombier pand mm5,mm1
588*593dc095SDavid du Colombier movq mm7,mm1
589*593dc095SDavid du Colombier movq mm6,[ebx+8]
590*593dc095SDavid du Colombier pandn mm7,mm6
591*593dc095SDavid du Colombier por mm5,mm7
592*593dc095SDavid du Colombier movq [ebx+8],mm5
593*593dc095SDavid du Colombier
594*593dc095SDavid du Colombier movq mm6,[esi+16]
595*593dc095SDavid du Colombier pand mm6,mm2
596*593dc095SDavid du Colombier movq mm4,mm2
597*593dc095SDavid du Colombier movq mm7,[ebx+16]
598*593dc095SDavid du Colombier pandn mm4,mm7
599*593dc095SDavid du Colombier por mm6,mm4
600*593dc095SDavid du Colombier movq [ebx+16],mm6
601*593dc095SDavid du Colombier
602*593dc095SDavid du Colombier add esi,24 //inc by 24 bytes processed
603*593dc095SDavid du Colombier add ebx,24
604*593dc095SDavid du Colombier sub ecx,8 //dec by 8 pixels processed
605*593dc095SDavid du Colombier
606*593dc095SDavid du Colombier ja mainloop24
607*593dc095SDavid du Colombier
608*593dc095SDavid du Colombier mainloop24end:
609*593dc095SDavid du Colombier mov ecx,diff
610*593dc095SDavid du Colombier cmp ecx,0
611*593dc095SDavid du Colombier jz end24
612*593dc095SDavid du Colombier
613*593dc095SDavid du Colombier mov edx,mask
614*593dc095SDavid du Colombier sal edx,24 //make low byte the high byte
615*593dc095SDavid du Colombier secondloop24:
616*593dc095SDavid du Colombier sal edx,1 //move high bit to CF
617*593dc095SDavid du Colombier jnc skip24 //if CF = 0
618*593dc095SDavid du Colombier mov ax,[esi]
619*593dc095SDavid du Colombier mov [ebx],ax
620*593dc095SDavid du Colombier xor eax,eax
621*593dc095SDavid du Colombier mov al,[esi+2]
622*593dc095SDavid du Colombier mov [ebx+2],al
623*593dc095SDavid du Colombier skip24:
624*593dc095SDavid du Colombier add esi,3
625*593dc095SDavid du Colombier add ebx,3
626*593dc095SDavid du Colombier
627*593dc095SDavid du Colombier dec ecx
628*593dc095SDavid du Colombier jnz secondloop24
629*593dc095SDavid du Colombier
630*593dc095SDavid du Colombier end24:
631*593dc095SDavid du Colombier emms
632*593dc095SDavid du Colombier }
633*593dc095SDavid du Colombier }
634*593dc095SDavid du Colombier else /* mmx not supported - use modified C routine */
635*593dc095SDavid du Colombier {
636*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
637*593dc095SDavid du Colombier png_size_t pixel_bytes;
638*593dc095SDavid du Colombier png_uint_32 i;
639*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass];
640*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
641*593dc095SDavid du Colombier
642*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
643*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
644*593dc095SDavid du Colombier pixel_bytes;
645*593dc095SDavid du Colombier dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
646*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
647*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
648*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
649*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
650*593dc095SDavid du Colombier {
651*593dc095SDavid du Colombier png_memcpy(dstptr, srcptr, pixel_bytes);
652*593dc095SDavid du Colombier srcptr += incr1;
653*593dc095SDavid du Colombier dstptr += incr1;
654*593dc095SDavid du Colombier }
655*593dc095SDavid du Colombier } /* end of else */
656*593dc095SDavid du Colombier
657*593dc095SDavid du Colombier break;
658*593dc095SDavid du Colombier } // end 24 bpp
659*593dc095SDavid du Colombier
660*593dc095SDavid du Colombier case 32:
661*593dc095SDavid du Colombier {
662*593dc095SDavid du Colombier png_bytep srcptr;
663*593dc095SDavid du Colombier png_bytep dstptr;
664*593dc095SDavid du Colombier png_uint_32 len;
665*593dc095SDavid du Colombier int unmask, diff;
666*593dc095SDavid du Colombier
667*593dc095SDavid du Colombier __int64 mask3=0x0101010102020202, //32bpp
668*593dc095SDavid du Colombier mask2=0x0404040408080808,
669*593dc095SDavid du Colombier mask1=0x1010101020202020,
670*593dc095SDavid du Colombier mask0=0x4040404080808080;
671*593dc095SDavid du Colombier
672*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1;
673*593dc095SDavid du Colombier dstptr = row;
674*593dc095SDavid du Colombier
675*593dc095SDavid du Colombier unmask = ~mask;
676*593dc095SDavid du Colombier len = (png_ptr->width)&~7;
677*593dc095SDavid du Colombier diff = (png_ptr->width)&7;
678*593dc095SDavid du Colombier
679*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
680*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
681*593dc095SDavid du Colombier /* && mmx_supported */ )
682*593dc095SDavid du Colombier #else
683*593dc095SDavid du Colombier if (mmx_supported)
684*593dc095SDavid du Colombier #endif
685*593dc095SDavid du Colombier {
686*593dc095SDavid du Colombier _asm
687*593dc095SDavid du Colombier {
688*593dc095SDavid du Colombier movd mm7, unmask //load bit pattern
689*593dc095SDavid du Colombier psubb mm6,mm6 //zero mm6
690*593dc095SDavid du Colombier punpcklbw mm7,mm7
691*593dc095SDavid du Colombier punpcklwd mm7,mm7
692*593dc095SDavid du Colombier punpckldq mm7,mm7 //fill register with 8 masks
693*593dc095SDavid du Colombier
694*593dc095SDavid du Colombier movq mm0,mask0
695*593dc095SDavid du Colombier movq mm1,mask1
696*593dc095SDavid du Colombier movq mm2,mask2
697*593dc095SDavid du Colombier movq mm3,mask3
698*593dc095SDavid du Colombier
699*593dc095SDavid du Colombier pand mm0,mm7
700*593dc095SDavid du Colombier pand mm1,mm7
701*593dc095SDavid du Colombier pand mm2,mm7
702*593dc095SDavid du Colombier pand mm3,mm7
703*593dc095SDavid du Colombier
704*593dc095SDavid du Colombier pcmpeqb mm0,mm6
705*593dc095SDavid du Colombier pcmpeqb mm1,mm6
706*593dc095SDavid du Colombier pcmpeqb mm2,mm6
707*593dc095SDavid du Colombier pcmpeqb mm3,mm6
708*593dc095SDavid du Colombier
709*593dc095SDavid du Colombier mov ecx,len //load length of line
710*593dc095SDavid du Colombier mov esi,srcptr //load source
711*593dc095SDavid du Colombier mov ebx,dstptr //load dest
712*593dc095SDavid du Colombier
713*593dc095SDavid du Colombier cmp ecx,0 //lcr
714*593dc095SDavid du Colombier jz mainloop32end
715*593dc095SDavid du Colombier
716*593dc095SDavid du Colombier mainloop32:
717*593dc095SDavid du Colombier movq mm4,[esi]
718*593dc095SDavid du Colombier pand mm4,mm0
719*593dc095SDavid du Colombier movq mm6,mm0
720*593dc095SDavid du Colombier movq mm7,[ebx]
721*593dc095SDavid du Colombier pandn mm6,mm7
722*593dc095SDavid du Colombier por mm4,mm6
723*593dc095SDavid du Colombier movq [ebx],mm4
724*593dc095SDavid du Colombier
725*593dc095SDavid du Colombier movq mm5,[esi+8]
726*593dc095SDavid du Colombier pand mm5,mm1
727*593dc095SDavid du Colombier movq mm7,mm1
728*593dc095SDavid du Colombier movq mm6,[ebx+8]
729*593dc095SDavid du Colombier pandn mm7,mm6
730*593dc095SDavid du Colombier por mm5,mm7
731*593dc095SDavid du Colombier movq [ebx+8],mm5
732*593dc095SDavid du Colombier
733*593dc095SDavid du Colombier movq mm6,[esi+16]
734*593dc095SDavid du Colombier pand mm6,mm2
735*593dc095SDavid du Colombier movq mm4,mm2
736*593dc095SDavid du Colombier movq mm7,[ebx+16]
737*593dc095SDavid du Colombier pandn mm4,mm7
738*593dc095SDavid du Colombier por mm6,mm4
739*593dc095SDavid du Colombier movq [ebx+16],mm6
740*593dc095SDavid du Colombier
741*593dc095SDavid du Colombier movq mm7,[esi+24]
742*593dc095SDavid du Colombier pand mm7,mm3
743*593dc095SDavid du Colombier movq mm5,mm3
744*593dc095SDavid du Colombier movq mm4,[ebx+24]
745*593dc095SDavid du Colombier pandn mm5,mm4
746*593dc095SDavid du Colombier por mm7,mm5
747*593dc095SDavid du Colombier movq [ebx+24],mm7
748*593dc095SDavid du Colombier
749*593dc095SDavid du Colombier add esi,32 //inc by 32 bytes processed
750*593dc095SDavid du Colombier add ebx,32
751*593dc095SDavid du Colombier sub ecx,8 //dec by 8 pixels processed
752*593dc095SDavid du Colombier
753*593dc095SDavid du Colombier ja mainloop32
754*593dc095SDavid du Colombier
755*593dc095SDavid du Colombier mainloop32end:
756*593dc095SDavid du Colombier mov ecx,diff
757*593dc095SDavid du Colombier cmp ecx,0
758*593dc095SDavid du Colombier jz end32
759*593dc095SDavid du Colombier
760*593dc095SDavid du Colombier mov edx,mask
761*593dc095SDavid du Colombier sal edx,24 //make low byte the high byte
762*593dc095SDavid du Colombier secondloop32:
763*593dc095SDavid du Colombier sal edx,1 //move high bit to CF
764*593dc095SDavid du Colombier jnc skip32 //if CF = 0
765*593dc095SDavid du Colombier mov eax,[esi]
766*593dc095SDavid du Colombier mov [ebx],eax
767*593dc095SDavid du Colombier skip32:
768*593dc095SDavid du Colombier add esi,4
769*593dc095SDavid du Colombier add ebx,4
770*593dc095SDavid du Colombier
771*593dc095SDavid du Colombier dec ecx
772*593dc095SDavid du Colombier jnz secondloop32
773*593dc095SDavid du Colombier
774*593dc095SDavid du Colombier end32:
775*593dc095SDavid du Colombier emms
776*593dc095SDavid du Colombier }
777*593dc095SDavid du Colombier }
778*593dc095SDavid du Colombier else /* mmx _not supported - Use modified C routine */
779*593dc095SDavid du Colombier {
780*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
781*593dc095SDavid du Colombier png_size_t pixel_bytes;
782*593dc095SDavid du Colombier png_uint_32 i;
783*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass];
784*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
785*593dc095SDavid du Colombier
786*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
787*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
788*593dc095SDavid du Colombier pixel_bytes;
789*593dc095SDavid du Colombier dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
790*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
791*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
792*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
793*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
794*593dc095SDavid du Colombier {
795*593dc095SDavid du Colombier png_memcpy(dstptr, srcptr, pixel_bytes);
796*593dc095SDavid du Colombier srcptr += incr1;
797*593dc095SDavid du Colombier dstptr += incr1;
798*593dc095SDavid du Colombier }
799*593dc095SDavid du Colombier } /* end of else */
800*593dc095SDavid du Colombier
801*593dc095SDavid du Colombier break;
802*593dc095SDavid du Colombier } // end 32 bpp
803*593dc095SDavid du Colombier
804*593dc095SDavid du Colombier case 48:
805*593dc095SDavid du Colombier {
806*593dc095SDavid du Colombier png_bytep srcptr;
807*593dc095SDavid du Colombier png_bytep dstptr;
808*593dc095SDavid du Colombier png_uint_32 len;
809*593dc095SDavid du Colombier int unmask, diff;
810*593dc095SDavid du Colombier
811*593dc095SDavid du Colombier __int64 mask5=0x0101010101010202,
812*593dc095SDavid du Colombier mask4=0x0202020204040404,
813*593dc095SDavid du Colombier mask3=0x0404080808080808,
814*593dc095SDavid du Colombier mask2=0x1010101010102020,
815*593dc095SDavid du Colombier mask1=0x2020202040404040,
816*593dc095SDavid du Colombier mask0=0x4040808080808080;
817*593dc095SDavid du Colombier
818*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
819*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
820*593dc095SDavid du Colombier /* && mmx_supported */ )
821*593dc095SDavid du Colombier #else
822*593dc095SDavid du Colombier if (mmx_supported)
823*593dc095SDavid du Colombier #endif
824*593dc095SDavid du Colombier {
825*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1;
826*593dc095SDavid du Colombier dstptr = row;
827*593dc095SDavid du Colombier
828*593dc095SDavid du Colombier unmask = ~mask;
829*593dc095SDavid du Colombier len = (png_ptr->width)&~7;
830*593dc095SDavid du Colombier diff = (png_ptr->width)&7;
831*593dc095SDavid du Colombier _asm
832*593dc095SDavid du Colombier {
833*593dc095SDavid du Colombier movd mm7, unmask //load bit pattern
834*593dc095SDavid du Colombier psubb mm6,mm6 //zero mm6
835*593dc095SDavid du Colombier punpcklbw mm7,mm7
836*593dc095SDavid du Colombier punpcklwd mm7,mm7
837*593dc095SDavid du Colombier punpckldq mm7,mm7 //fill register with 8 masks
838*593dc095SDavid du Colombier
839*593dc095SDavid du Colombier movq mm0,mask0
840*593dc095SDavid du Colombier movq mm1,mask1
841*593dc095SDavid du Colombier movq mm2,mask2
842*593dc095SDavid du Colombier movq mm3,mask3
843*593dc095SDavid du Colombier movq mm4,mask4
844*593dc095SDavid du Colombier movq mm5,mask5
845*593dc095SDavid du Colombier
846*593dc095SDavid du Colombier pand mm0,mm7
847*593dc095SDavid du Colombier pand mm1,mm7
848*593dc095SDavid du Colombier pand mm2,mm7
849*593dc095SDavid du Colombier pand mm3,mm7
850*593dc095SDavid du Colombier pand mm4,mm7
851*593dc095SDavid du Colombier pand mm5,mm7
852*593dc095SDavid du Colombier
853*593dc095SDavid du Colombier pcmpeqb mm0,mm6
854*593dc095SDavid du Colombier pcmpeqb mm1,mm6
855*593dc095SDavid du Colombier pcmpeqb mm2,mm6
856*593dc095SDavid du Colombier pcmpeqb mm3,mm6
857*593dc095SDavid du Colombier pcmpeqb mm4,mm6
858*593dc095SDavid du Colombier pcmpeqb mm5,mm6
859*593dc095SDavid du Colombier
860*593dc095SDavid du Colombier mov ecx,len //load length of line
861*593dc095SDavid du Colombier mov esi,srcptr //load source
862*593dc095SDavid du Colombier mov ebx,dstptr //load dest
863*593dc095SDavid du Colombier
864*593dc095SDavid du Colombier cmp ecx,0
865*593dc095SDavid du Colombier jz mainloop48end
866*593dc095SDavid du Colombier
867*593dc095SDavid du Colombier mainloop48:
868*593dc095SDavid du Colombier movq mm7,[esi]
869*593dc095SDavid du Colombier pand mm7,mm0
870*593dc095SDavid du Colombier movq mm6,mm0
871*593dc095SDavid du Colombier pandn mm6,[ebx]
872*593dc095SDavid du Colombier por mm7,mm6
873*593dc095SDavid du Colombier movq [ebx],mm7
874*593dc095SDavid du Colombier
875*593dc095SDavid du Colombier movq mm6,[esi+8]
876*593dc095SDavid du Colombier pand mm6,mm1
877*593dc095SDavid du Colombier movq mm7,mm1
878*593dc095SDavid du Colombier pandn mm7,[ebx+8]
879*593dc095SDavid du Colombier por mm6,mm7
880*593dc095SDavid du Colombier movq [ebx+8],mm6
881*593dc095SDavid du Colombier
882*593dc095SDavid du Colombier movq mm6,[esi+16]
883*593dc095SDavid du Colombier pand mm6,mm2
884*593dc095SDavid du Colombier movq mm7,mm2
885*593dc095SDavid du Colombier pandn mm7,[ebx+16]
886*593dc095SDavid du Colombier por mm6,mm7
887*593dc095SDavid du Colombier movq [ebx+16],mm6
888*593dc095SDavid du Colombier
889*593dc095SDavid du Colombier movq mm7,[esi+24]
890*593dc095SDavid du Colombier pand mm7,mm3
891*593dc095SDavid du Colombier movq mm6,mm3
892*593dc095SDavid du Colombier pandn mm6,[ebx+24]
893*593dc095SDavid du Colombier por mm7,mm6
894*593dc095SDavid du Colombier movq [ebx+24],mm7
895*593dc095SDavid du Colombier
896*593dc095SDavid du Colombier movq mm6,[esi+32]
897*593dc095SDavid du Colombier pand mm6,mm4
898*593dc095SDavid du Colombier movq mm7,mm4
899*593dc095SDavid du Colombier pandn mm7,[ebx+32]
900*593dc095SDavid du Colombier por mm6,mm7
901*593dc095SDavid du Colombier movq [ebx+32],mm6
902*593dc095SDavid du Colombier
903*593dc095SDavid du Colombier movq mm7,[esi+40]
904*593dc095SDavid du Colombier pand mm7,mm5
905*593dc095SDavid du Colombier movq mm6,mm5
906*593dc095SDavid du Colombier pandn mm6,[ebx+40]
907*593dc095SDavid du Colombier por mm7,mm6
908*593dc095SDavid du Colombier movq [ebx+40],mm7
909*593dc095SDavid du Colombier
910*593dc095SDavid du Colombier add esi,48 //inc by 48 bytes processed
911*593dc095SDavid du Colombier add ebx,48
912*593dc095SDavid du Colombier sub ecx,8 //dec by 8 pixels processed
913*593dc095SDavid du Colombier
914*593dc095SDavid du Colombier ja mainloop48
915*593dc095SDavid du Colombier mainloop48end:
916*593dc095SDavid du Colombier
917*593dc095SDavid du Colombier mov ecx,diff
918*593dc095SDavid du Colombier cmp ecx,0
919*593dc095SDavid du Colombier jz end48
920*593dc095SDavid du Colombier
921*593dc095SDavid du Colombier mov edx,mask
922*593dc095SDavid du Colombier sal edx,24 //make low byte the high byte
923*593dc095SDavid du Colombier
924*593dc095SDavid du Colombier secondloop48:
925*593dc095SDavid du Colombier sal edx,1 //move high bit to CF
926*593dc095SDavid du Colombier jnc skip48 //if CF = 0
927*593dc095SDavid du Colombier mov eax,[esi]
928*593dc095SDavid du Colombier mov [ebx],eax
929*593dc095SDavid du Colombier skip48:
930*593dc095SDavid du Colombier add esi,4
931*593dc095SDavid du Colombier add ebx,4
932*593dc095SDavid du Colombier
933*593dc095SDavid du Colombier dec ecx
934*593dc095SDavid du Colombier jnz secondloop48
935*593dc095SDavid du Colombier
936*593dc095SDavid du Colombier end48:
937*593dc095SDavid du Colombier emms
938*593dc095SDavid du Colombier }
939*593dc095SDavid du Colombier }
940*593dc095SDavid du Colombier else /* mmx _not supported - Use modified C routine */
941*593dc095SDavid du Colombier {
942*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
943*593dc095SDavid du Colombier png_size_t pixel_bytes;
944*593dc095SDavid du Colombier png_uint_32 i;
945*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass];
946*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947*593dc095SDavid du Colombier
948*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949*593dc095SDavid du Colombier srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950*593dc095SDavid du Colombier pixel_bytes;
951*593dc095SDavid du Colombier dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
952*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
954*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
955*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
956*593dc095SDavid du Colombier {
957*593dc095SDavid du Colombier png_memcpy(dstptr, srcptr, pixel_bytes);
958*593dc095SDavid du Colombier srcptr += incr1;
959*593dc095SDavid du Colombier dstptr += incr1;
960*593dc095SDavid du Colombier }
961*593dc095SDavid du Colombier } /* end of else */
962*593dc095SDavid du Colombier
963*593dc095SDavid du Colombier break;
964*593dc095SDavid du Colombier } // end 48 bpp
965*593dc095SDavid du Colombier
966*593dc095SDavid du Colombier default:
967*593dc095SDavid du Colombier {
968*593dc095SDavid du Colombier png_bytep sptr;
969*593dc095SDavid du Colombier png_bytep dp;
970*593dc095SDavid du Colombier png_size_t pixel_bytes;
971*593dc095SDavid du Colombier int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
972*593dc095SDavid du Colombier unsigned int i;
973*593dc095SDavid du Colombier register int disp = png_pass_inc[png_ptr->pass]; // get the offset
974*593dc095SDavid du Colombier register unsigned int incr1, initial_val, final_val;
975*593dc095SDavid du Colombier
976*593dc095SDavid du Colombier pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
977*593dc095SDavid du Colombier sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
978*593dc095SDavid du Colombier pixel_bytes;
979*593dc095SDavid du Colombier dp = row + offset_table[png_ptr->pass]*pixel_bytes;
980*593dc095SDavid du Colombier initial_val = offset_table[png_ptr->pass]*pixel_bytes;
981*593dc095SDavid du Colombier final_val = png_ptr->width*pixel_bytes;
982*593dc095SDavid du Colombier incr1 = (disp)*pixel_bytes;
983*593dc095SDavid du Colombier for (i = initial_val; i < final_val; i += incr1)
984*593dc095SDavid du Colombier {
985*593dc095SDavid du Colombier png_memcpy(dp, sptr, pixel_bytes);
986*593dc095SDavid du Colombier sptr += incr1;
987*593dc095SDavid du Colombier dp += incr1;
988*593dc095SDavid du Colombier }
989*593dc095SDavid du Colombier break;
990*593dc095SDavid du Colombier }
991*593dc095SDavid du Colombier } /* end switch (png_ptr->row_info.pixel_depth) */
992*593dc095SDavid du Colombier } /* end if (non-trivial mask) */
993*593dc095SDavid du Colombier
994*593dc095SDavid du Colombier } /* end png_combine_row() */
995*593dc095SDavid du Colombier
996*593dc095SDavid du Colombier
997*593dc095SDavid du Colombier #if defined(PNG_READ_INTERLACING_SUPPORTED)
998*593dc095SDavid du Colombier
999*593dc095SDavid du Colombier void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)1000*593dc095SDavid du Colombier png_do_read_interlace(png_structp png_ptr)
1001*593dc095SDavid du Colombier {
1002*593dc095SDavid du Colombier png_row_infop row_info = &(png_ptr->row_info);
1003*593dc095SDavid du Colombier png_bytep row = png_ptr->row_buf + 1;
1004*593dc095SDavid du Colombier int pass = png_ptr->pass;
1005*593dc095SDavid du Colombier png_uint_32 transformations = png_ptr->transformations;
1006*593dc095SDavid du Colombier #ifdef PNG_USE_LOCAL_ARRAYS
1007*593dc095SDavid du Colombier const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1008*593dc095SDavid du Colombier #endif
1009*593dc095SDavid du Colombier
1010*593dc095SDavid du Colombier png_debug(1,"in png_do_read_interlace\n");
1011*593dc095SDavid du Colombier
1012*593dc095SDavid du Colombier if (mmx_supported == 2) {
1013*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
1014*593dc095SDavid du Colombier /* this should have happened in png_init_mmx_flags() already */
1015*593dc095SDavid du Colombier png_warning(png_ptr, "asm_flags may not have been initialized");
1016*593dc095SDavid du Colombier #endif
1017*593dc095SDavid du Colombier png_mmx_support();
1018*593dc095SDavid du Colombier }
1019*593dc095SDavid du Colombier
1020*593dc095SDavid du Colombier if (row != NULL && row_info != NULL)
1021*593dc095SDavid du Colombier {
1022*593dc095SDavid du Colombier png_uint_32 final_width;
1023*593dc095SDavid du Colombier
1024*593dc095SDavid du Colombier final_width = row_info->width * png_pass_inc[pass];
1025*593dc095SDavid du Colombier
1026*593dc095SDavid du Colombier switch (row_info->pixel_depth)
1027*593dc095SDavid du Colombier {
1028*593dc095SDavid du Colombier case 1:
1029*593dc095SDavid du Colombier {
1030*593dc095SDavid du Colombier png_bytep sp, dp;
1031*593dc095SDavid du Colombier int sshift, dshift;
1032*593dc095SDavid du Colombier int s_start, s_end, s_inc;
1033*593dc095SDavid du Colombier png_byte v;
1034*593dc095SDavid du Colombier png_uint_32 i;
1035*593dc095SDavid du Colombier int j;
1036*593dc095SDavid du Colombier
1037*593dc095SDavid du Colombier sp = row + (png_size_t)((row_info->width - 1) >> 3);
1038*593dc095SDavid du Colombier dp = row + (png_size_t)((final_width - 1) >> 3);
1039*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040*593dc095SDavid du Colombier if (transformations & PNG_PACKSWAP)
1041*593dc095SDavid du Colombier {
1042*593dc095SDavid du Colombier sshift = (int)((row_info->width + 7) & 7);
1043*593dc095SDavid du Colombier dshift = (int)((final_width + 7) & 7);
1044*593dc095SDavid du Colombier s_start = 7;
1045*593dc095SDavid du Colombier s_end = 0;
1046*593dc095SDavid du Colombier s_inc = -1;
1047*593dc095SDavid du Colombier }
1048*593dc095SDavid du Colombier else
1049*593dc095SDavid du Colombier #endif
1050*593dc095SDavid du Colombier {
1051*593dc095SDavid du Colombier sshift = 7 - (int)((row_info->width + 7) & 7);
1052*593dc095SDavid du Colombier dshift = 7 - (int)((final_width + 7) & 7);
1053*593dc095SDavid du Colombier s_start = 0;
1054*593dc095SDavid du Colombier s_end = 7;
1055*593dc095SDavid du Colombier s_inc = 1;
1056*593dc095SDavid du Colombier }
1057*593dc095SDavid du Colombier
1058*593dc095SDavid du Colombier for (i = row_info->width; i; i--)
1059*593dc095SDavid du Colombier {
1060*593dc095SDavid du Colombier v = (png_byte)((*sp >> sshift) & 0x1);
1061*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1062*593dc095SDavid du Colombier {
1063*593dc095SDavid du Colombier *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1064*593dc095SDavid du Colombier *dp |= (png_byte)(v << dshift);
1065*593dc095SDavid du Colombier if (dshift == s_end)
1066*593dc095SDavid du Colombier {
1067*593dc095SDavid du Colombier dshift = s_start;
1068*593dc095SDavid du Colombier dp--;
1069*593dc095SDavid du Colombier }
1070*593dc095SDavid du Colombier else
1071*593dc095SDavid du Colombier dshift += s_inc;
1072*593dc095SDavid du Colombier }
1073*593dc095SDavid du Colombier if (sshift == s_end)
1074*593dc095SDavid du Colombier {
1075*593dc095SDavid du Colombier sshift = s_start;
1076*593dc095SDavid du Colombier sp--;
1077*593dc095SDavid du Colombier }
1078*593dc095SDavid du Colombier else
1079*593dc095SDavid du Colombier sshift += s_inc;
1080*593dc095SDavid du Colombier }
1081*593dc095SDavid du Colombier break;
1082*593dc095SDavid du Colombier }
1083*593dc095SDavid du Colombier
1084*593dc095SDavid du Colombier case 2:
1085*593dc095SDavid du Colombier {
1086*593dc095SDavid du Colombier png_bytep sp, dp;
1087*593dc095SDavid du Colombier int sshift, dshift;
1088*593dc095SDavid du Colombier int s_start, s_end, s_inc;
1089*593dc095SDavid du Colombier png_uint_32 i;
1090*593dc095SDavid du Colombier
1091*593dc095SDavid du Colombier sp = row + (png_size_t)((row_info->width - 1) >> 2);
1092*593dc095SDavid du Colombier dp = row + (png_size_t)((final_width - 1) >> 2);
1093*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094*593dc095SDavid du Colombier if (transformations & PNG_PACKSWAP)
1095*593dc095SDavid du Colombier {
1096*593dc095SDavid du Colombier sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1097*593dc095SDavid du Colombier dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1098*593dc095SDavid du Colombier s_start = 6;
1099*593dc095SDavid du Colombier s_end = 0;
1100*593dc095SDavid du Colombier s_inc = -2;
1101*593dc095SDavid du Colombier }
1102*593dc095SDavid du Colombier else
1103*593dc095SDavid du Colombier #endif
1104*593dc095SDavid du Colombier {
1105*593dc095SDavid du Colombier sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1106*593dc095SDavid du Colombier dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1107*593dc095SDavid du Colombier s_start = 0;
1108*593dc095SDavid du Colombier s_end = 6;
1109*593dc095SDavid du Colombier s_inc = 2;
1110*593dc095SDavid du Colombier }
1111*593dc095SDavid du Colombier
1112*593dc095SDavid du Colombier for (i = row_info->width; i; i--)
1113*593dc095SDavid du Colombier {
1114*593dc095SDavid du Colombier png_byte v;
1115*593dc095SDavid du Colombier int j;
1116*593dc095SDavid du Colombier
1117*593dc095SDavid du Colombier v = (png_byte)((*sp >> sshift) & 0x3);
1118*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1119*593dc095SDavid du Colombier {
1120*593dc095SDavid du Colombier *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1121*593dc095SDavid du Colombier *dp |= (png_byte)(v << dshift);
1122*593dc095SDavid du Colombier if (dshift == s_end)
1123*593dc095SDavid du Colombier {
1124*593dc095SDavid du Colombier dshift = s_start;
1125*593dc095SDavid du Colombier dp--;
1126*593dc095SDavid du Colombier }
1127*593dc095SDavid du Colombier else
1128*593dc095SDavid du Colombier dshift += s_inc;
1129*593dc095SDavid du Colombier }
1130*593dc095SDavid du Colombier if (sshift == s_end)
1131*593dc095SDavid du Colombier {
1132*593dc095SDavid du Colombier sshift = s_start;
1133*593dc095SDavid du Colombier sp--;
1134*593dc095SDavid du Colombier }
1135*593dc095SDavid du Colombier else
1136*593dc095SDavid du Colombier sshift += s_inc;
1137*593dc095SDavid du Colombier }
1138*593dc095SDavid du Colombier break;
1139*593dc095SDavid du Colombier }
1140*593dc095SDavid du Colombier
1141*593dc095SDavid du Colombier case 4:
1142*593dc095SDavid du Colombier {
1143*593dc095SDavid du Colombier png_bytep sp, dp;
1144*593dc095SDavid du Colombier int sshift, dshift;
1145*593dc095SDavid du Colombier int s_start, s_end, s_inc;
1146*593dc095SDavid du Colombier png_uint_32 i;
1147*593dc095SDavid du Colombier
1148*593dc095SDavid du Colombier sp = row + (png_size_t)((row_info->width - 1) >> 1);
1149*593dc095SDavid du Colombier dp = row + (png_size_t)((final_width - 1) >> 1);
1150*593dc095SDavid du Colombier #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151*593dc095SDavid du Colombier if (transformations & PNG_PACKSWAP)
1152*593dc095SDavid du Colombier {
1153*593dc095SDavid du Colombier sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1154*593dc095SDavid du Colombier dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1155*593dc095SDavid du Colombier s_start = 4;
1156*593dc095SDavid du Colombier s_end = 0;
1157*593dc095SDavid du Colombier s_inc = -4;
1158*593dc095SDavid du Colombier }
1159*593dc095SDavid du Colombier else
1160*593dc095SDavid du Colombier #endif
1161*593dc095SDavid du Colombier {
1162*593dc095SDavid du Colombier sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1163*593dc095SDavid du Colombier dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1164*593dc095SDavid du Colombier s_start = 0;
1165*593dc095SDavid du Colombier s_end = 4;
1166*593dc095SDavid du Colombier s_inc = 4;
1167*593dc095SDavid du Colombier }
1168*593dc095SDavid du Colombier
1169*593dc095SDavid du Colombier for (i = row_info->width; i; i--)
1170*593dc095SDavid du Colombier {
1171*593dc095SDavid du Colombier png_byte v;
1172*593dc095SDavid du Colombier int j;
1173*593dc095SDavid du Colombier
1174*593dc095SDavid du Colombier v = (png_byte)((*sp >> sshift) & 0xf);
1175*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1176*593dc095SDavid du Colombier {
1177*593dc095SDavid du Colombier *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1178*593dc095SDavid du Colombier *dp |= (png_byte)(v << dshift);
1179*593dc095SDavid du Colombier if (dshift == s_end)
1180*593dc095SDavid du Colombier {
1181*593dc095SDavid du Colombier dshift = s_start;
1182*593dc095SDavid du Colombier dp--;
1183*593dc095SDavid du Colombier }
1184*593dc095SDavid du Colombier else
1185*593dc095SDavid du Colombier dshift += s_inc;
1186*593dc095SDavid du Colombier }
1187*593dc095SDavid du Colombier if (sshift == s_end)
1188*593dc095SDavid du Colombier {
1189*593dc095SDavid du Colombier sshift = s_start;
1190*593dc095SDavid du Colombier sp--;
1191*593dc095SDavid du Colombier }
1192*593dc095SDavid du Colombier else
1193*593dc095SDavid du Colombier sshift += s_inc;
1194*593dc095SDavid du Colombier }
1195*593dc095SDavid du Colombier break;
1196*593dc095SDavid du Colombier }
1197*593dc095SDavid du Colombier
1198*593dc095SDavid du Colombier default: // This is the place where the routine is modified
1199*593dc095SDavid du Colombier {
1200*593dc095SDavid du Colombier __int64 const4 = 0x0000000000FFFFFF;
1201*593dc095SDavid du Colombier // __int64 const5 = 0x000000FFFFFF0000; // unused...
1202*593dc095SDavid du Colombier __int64 const6 = 0x00000000000000FF;
1203*593dc095SDavid du Colombier png_bytep sptr, dp;
1204*593dc095SDavid du Colombier png_uint_32 i;
1205*593dc095SDavid du Colombier png_size_t pixel_bytes;
1206*593dc095SDavid du Colombier int width = row_info->width;
1207*593dc095SDavid du Colombier
1208*593dc095SDavid du Colombier pixel_bytes = (row_info->pixel_depth >> 3);
1209*593dc095SDavid du Colombier
1210*593dc095SDavid du Colombier sptr = row + (width - 1) * pixel_bytes;
1211*593dc095SDavid du Colombier dp = row + (final_width - 1) * pixel_bytes;
1212*593dc095SDavid du Colombier // New code by Nirav Chhatrapati - Intel Corporation
1213*593dc095SDavid du Colombier // sign fix by GRR
1214*593dc095SDavid du Colombier // NOTE: there is NO MMX code for 48-bit and 64-bit images
1215*593dc095SDavid du Colombier
1216*593dc095SDavid du Colombier // use MMX routine if machine supports it
1217*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
1218*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1219*593dc095SDavid du Colombier /* && mmx_supported */ )
1220*593dc095SDavid du Colombier #else
1221*593dc095SDavid du Colombier if (mmx_supported)
1222*593dc095SDavid du Colombier #endif
1223*593dc095SDavid du Colombier {
1224*593dc095SDavid du Colombier if (pixel_bytes == 3)
1225*593dc095SDavid du Colombier {
1226*593dc095SDavid du Colombier if (((pass == 0) || (pass == 1)) && width)
1227*593dc095SDavid du Colombier {
1228*593dc095SDavid du Colombier _asm
1229*593dc095SDavid du Colombier {
1230*593dc095SDavid du Colombier mov esi, sptr
1231*593dc095SDavid du Colombier mov edi, dp
1232*593dc095SDavid du Colombier mov ecx, width
1233*593dc095SDavid du Colombier sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1234*593dc095SDavid du Colombier loop_pass0:
1235*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X X v2 v1 v0
1236*593dc095SDavid du Colombier pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1237*593dc095SDavid du Colombier movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1238*593dc095SDavid du Colombier psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1239*593dc095SDavid du Colombier movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1240*593dc095SDavid du Colombier psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1241*593dc095SDavid du Colombier psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1242*593dc095SDavid du Colombier por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1243*593dc095SDavid du Colombier por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1244*593dc095SDavid du Colombier movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1245*593dc095SDavid du Colombier psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1246*593dc095SDavid du Colombier movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1247*593dc095SDavid du Colombier punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1248*593dc095SDavid du Colombier movq [edi+16] , mm4
1249*593dc095SDavid du Colombier psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1250*593dc095SDavid du Colombier movq [edi+8] , mm3
1251*593dc095SDavid du Colombier punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1252*593dc095SDavid du Colombier sub esi, 3
1253*593dc095SDavid du Colombier movq [edi], mm0
1254*593dc095SDavid du Colombier sub edi, 24
1255*593dc095SDavid du Colombier //sub esi, 3
1256*593dc095SDavid du Colombier dec ecx
1257*593dc095SDavid du Colombier jnz loop_pass0
1258*593dc095SDavid du Colombier EMMS
1259*593dc095SDavid du Colombier }
1260*593dc095SDavid du Colombier }
1261*593dc095SDavid du Colombier else if (((pass == 2) || (pass == 3)) && width)
1262*593dc095SDavid du Colombier {
1263*593dc095SDavid du Colombier _asm
1264*593dc095SDavid du Colombier {
1265*593dc095SDavid du Colombier mov esi, sptr
1266*593dc095SDavid du Colombier mov edi, dp
1267*593dc095SDavid du Colombier mov ecx, width
1268*593dc095SDavid du Colombier sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1269*593dc095SDavid du Colombier loop_pass2:
1270*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X X v2 v1 v0
1271*593dc095SDavid du Colombier pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1272*593dc095SDavid du Colombier movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1273*593dc095SDavid du Colombier psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1274*593dc095SDavid du Colombier movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1275*593dc095SDavid du Colombier psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1276*593dc095SDavid du Colombier psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1277*593dc095SDavid du Colombier por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1278*593dc095SDavid du Colombier por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1279*593dc095SDavid du Colombier movq [edi+4], mm0 ; move to memory
1280*593dc095SDavid du Colombier psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1281*593dc095SDavid du Colombier movd [edi], mm0 ; move to memory
1282*593dc095SDavid du Colombier sub esi, 3
1283*593dc095SDavid du Colombier sub edi, 12
1284*593dc095SDavid du Colombier dec ecx
1285*593dc095SDavid du Colombier jnz loop_pass2
1286*593dc095SDavid du Colombier EMMS
1287*593dc095SDavid du Colombier }
1288*593dc095SDavid du Colombier }
1289*593dc095SDavid du Colombier else if (width) /* && ((pass == 4) || (pass == 5)) */
1290*593dc095SDavid du Colombier {
1291*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) - 8;
1292*593dc095SDavid du Colombier if (width_mmx < 0)
1293*593dc095SDavid du Colombier width_mmx = 0;
1294*593dc095SDavid du Colombier width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1295*593dc095SDavid du Colombier if (width_mmx)
1296*593dc095SDavid du Colombier {
1297*593dc095SDavid du Colombier _asm
1298*593dc095SDavid du Colombier {
1299*593dc095SDavid du Colombier mov esi, sptr
1300*593dc095SDavid du Colombier mov edi, dp
1301*593dc095SDavid du Colombier mov ecx, width_mmx
1302*593dc095SDavid du Colombier sub esi, 3
1303*593dc095SDavid du Colombier sub edi, 9
1304*593dc095SDavid du Colombier loop_pass4:
1305*593dc095SDavid du Colombier movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1306*593dc095SDavid du Colombier movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1307*593dc095SDavid du Colombier movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1308*593dc095SDavid du Colombier psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1309*593dc095SDavid du Colombier pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1310*593dc095SDavid du Colombier psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1311*593dc095SDavid du Colombier por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1312*593dc095SDavid du Colombier movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1313*593dc095SDavid du Colombier psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1314*593dc095SDavid du Colombier movq [edi], mm0 ; move quad to memory
1315*593dc095SDavid du Colombier psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1316*593dc095SDavid du Colombier pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1317*593dc095SDavid du Colombier por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1318*593dc095SDavid du Colombier movd [edi+8], mm6 ; move double to memory
1319*593dc095SDavid du Colombier sub esi, 6
1320*593dc095SDavid du Colombier sub edi, 12
1321*593dc095SDavid du Colombier sub ecx, 2
1322*593dc095SDavid du Colombier jnz loop_pass4
1323*593dc095SDavid du Colombier EMMS
1324*593dc095SDavid du Colombier }
1325*593dc095SDavid du Colombier }
1326*593dc095SDavid du Colombier
1327*593dc095SDavid du Colombier sptr -= width_mmx*3;
1328*593dc095SDavid du Colombier dp -= width_mmx*6;
1329*593dc095SDavid du Colombier for (i = width; i; i--)
1330*593dc095SDavid du Colombier {
1331*593dc095SDavid du Colombier png_byte v[8];
1332*593dc095SDavid du Colombier int j;
1333*593dc095SDavid du Colombier
1334*593dc095SDavid du Colombier png_memcpy(v, sptr, 3);
1335*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1336*593dc095SDavid du Colombier {
1337*593dc095SDavid du Colombier png_memcpy(dp, v, 3);
1338*593dc095SDavid du Colombier dp -= 3;
1339*593dc095SDavid du Colombier }
1340*593dc095SDavid du Colombier sptr -= 3;
1341*593dc095SDavid du Colombier }
1342*593dc095SDavid du Colombier }
1343*593dc095SDavid du Colombier } /* end of pixel_bytes == 3 */
1344*593dc095SDavid du Colombier
1345*593dc095SDavid du Colombier else if (pixel_bytes == 1)
1346*593dc095SDavid du Colombier {
1347*593dc095SDavid du Colombier if (((pass == 0) || (pass == 1)) && width)
1348*593dc095SDavid du Colombier {
1349*593dc095SDavid du Colombier int width_mmx = ((width >> 2) << 2);
1350*593dc095SDavid du Colombier width -= width_mmx;
1351*593dc095SDavid du Colombier if (width_mmx)
1352*593dc095SDavid du Colombier {
1353*593dc095SDavid du Colombier _asm
1354*593dc095SDavid du Colombier {
1355*593dc095SDavid du Colombier mov esi, sptr
1356*593dc095SDavid du Colombier mov edi, dp
1357*593dc095SDavid du Colombier mov ecx, width_mmx
1358*593dc095SDavid du Colombier sub edi, 31
1359*593dc095SDavid du Colombier sub esi, 3
1360*593dc095SDavid du Colombier loop1_pass0:
1361*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X v0 v1 v2 v3
1362*593dc095SDavid du Colombier movq mm1, mm0 ; X X X X v0 v1 v2 v3
1363*593dc095SDavid du Colombier punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1364*593dc095SDavid du Colombier movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1365*593dc095SDavid du Colombier punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1366*593dc095SDavid du Colombier movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1367*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1368*593dc095SDavid du Colombier punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1369*593dc095SDavid du Colombier movq [edi], mm0 ; move to memory v3
1370*593dc095SDavid du Colombier punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1371*593dc095SDavid du Colombier movq [edi+8], mm3 ; move to memory v2
1372*593dc095SDavid du Colombier movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1373*593dc095SDavid du Colombier punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1374*593dc095SDavid du Colombier punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1375*593dc095SDavid du Colombier movq [edi+16], mm2 ; move to memory v1
1376*593dc095SDavid du Colombier movq [edi+24], mm4 ; move to memory v0
1377*593dc095SDavid du Colombier sub esi, 4
1378*593dc095SDavid du Colombier sub edi, 32
1379*593dc095SDavid du Colombier sub ecx, 4
1380*593dc095SDavid du Colombier jnz loop1_pass0
1381*593dc095SDavid du Colombier EMMS
1382*593dc095SDavid du Colombier }
1383*593dc095SDavid du Colombier }
1384*593dc095SDavid du Colombier
1385*593dc095SDavid du Colombier sptr -= width_mmx;
1386*593dc095SDavid du Colombier dp -= width_mmx*8;
1387*593dc095SDavid du Colombier for (i = width; i; i--)
1388*593dc095SDavid du Colombier {
1389*593dc095SDavid du Colombier int j;
1390*593dc095SDavid du Colombier
1391*593dc095SDavid du Colombier /* I simplified this part in version 1.0.4e
1392*593dc095SDavid du Colombier * here and in several other instances where
1393*593dc095SDavid du Colombier * pixel_bytes == 1 -- GR-P
1394*593dc095SDavid du Colombier *
1395*593dc095SDavid du Colombier * Original code:
1396*593dc095SDavid du Colombier *
1397*593dc095SDavid du Colombier * png_byte v[8];
1398*593dc095SDavid du Colombier * png_memcpy(v, sptr, pixel_bytes);
1399*593dc095SDavid du Colombier * for (j = 0; j < png_pass_inc[pass]; j++)
1400*593dc095SDavid du Colombier * {
1401*593dc095SDavid du Colombier * png_memcpy(dp, v, pixel_bytes);
1402*593dc095SDavid du Colombier * dp -= pixel_bytes;
1403*593dc095SDavid du Colombier * }
1404*593dc095SDavid du Colombier * sptr -= pixel_bytes;
1405*593dc095SDavid du Colombier *
1406*593dc095SDavid du Colombier * Replacement code is in the next three lines:
1407*593dc095SDavid du Colombier */
1408*593dc095SDavid du Colombier
1409*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1410*593dc095SDavid du Colombier *dp-- = *sptr;
1411*593dc095SDavid du Colombier sptr--;
1412*593dc095SDavid du Colombier }
1413*593dc095SDavid du Colombier }
1414*593dc095SDavid du Colombier else if (((pass == 2) || (pass == 3)) && width)
1415*593dc095SDavid du Colombier {
1416*593dc095SDavid du Colombier int width_mmx = ((width >> 2) << 2);
1417*593dc095SDavid du Colombier width -= width_mmx;
1418*593dc095SDavid du Colombier if (width_mmx)
1419*593dc095SDavid du Colombier {
1420*593dc095SDavid du Colombier _asm
1421*593dc095SDavid du Colombier {
1422*593dc095SDavid du Colombier mov esi, sptr
1423*593dc095SDavid du Colombier mov edi, dp
1424*593dc095SDavid du Colombier mov ecx, width_mmx
1425*593dc095SDavid du Colombier sub edi, 15
1426*593dc095SDavid du Colombier sub esi, 3
1427*593dc095SDavid du Colombier loop1_pass2:
1428*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X v0 v1 v2 v3
1429*593dc095SDavid du Colombier punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1430*593dc095SDavid du Colombier movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1431*593dc095SDavid du Colombier punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1432*593dc095SDavid du Colombier punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1433*593dc095SDavid du Colombier movq [edi], mm0 ; move to memory v2 and v3
1434*593dc095SDavid du Colombier sub esi, 4
1435*593dc095SDavid du Colombier movq [edi+8], mm1 ; move to memory v1 and v0
1436*593dc095SDavid du Colombier sub edi, 16
1437*593dc095SDavid du Colombier sub ecx, 4
1438*593dc095SDavid du Colombier jnz loop1_pass2
1439*593dc095SDavid du Colombier EMMS
1440*593dc095SDavid du Colombier }
1441*593dc095SDavid du Colombier }
1442*593dc095SDavid du Colombier
1443*593dc095SDavid du Colombier sptr -= width_mmx;
1444*593dc095SDavid du Colombier dp -= width_mmx*4;
1445*593dc095SDavid du Colombier for (i = width; i; i--)
1446*593dc095SDavid du Colombier {
1447*593dc095SDavid du Colombier int j;
1448*593dc095SDavid du Colombier
1449*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1450*593dc095SDavid du Colombier {
1451*593dc095SDavid du Colombier *dp-- = *sptr;
1452*593dc095SDavid du Colombier }
1453*593dc095SDavid du Colombier sptr --;
1454*593dc095SDavid du Colombier }
1455*593dc095SDavid du Colombier }
1456*593dc095SDavid du Colombier else if (width) /* && ((pass == 4) || (pass == 5))) */
1457*593dc095SDavid du Colombier {
1458*593dc095SDavid du Colombier int width_mmx = ((width >> 3) << 3);
1459*593dc095SDavid du Colombier width -= width_mmx;
1460*593dc095SDavid du Colombier if (width_mmx)
1461*593dc095SDavid du Colombier {
1462*593dc095SDavid du Colombier _asm
1463*593dc095SDavid du Colombier {
1464*593dc095SDavid du Colombier mov esi, sptr
1465*593dc095SDavid du Colombier mov edi, dp
1466*593dc095SDavid du Colombier mov ecx, width_mmx
1467*593dc095SDavid du Colombier sub edi, 15
1468*593dc095SDavid du Colombier sub esi, 7
1469*593dc095SDavid du Colombier loop1_pass4:
1470*593dc095SDavid du Colombier movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1471*593dc095SDavid du Colombier movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1472*593dc095SDavid du Colombier punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1473*593dc095SDavid du Colombier //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1474*593dc095SDavid du Colombier punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1475*593dc095SDavid du Colombier movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1476*593dc095SDavid du Colombier sub esi, 8
1477*593dc095SDavid du Colombier movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1478*593dc095SDavid du Colombier //sub esi, 4
1479*593dc095SDavid du Colombier sub edi, 16
1480*593dc095SDavid du Colombier sub ecx, 8
1481*593dc095SDavid du Colombier jnz loop1_pass4
1482*593dc095SDavid du Colombier EMMS
1483*593dc095SDavid du Colombier }
1484*593dc095SDavid du Colombier }
1485*593dc095SDavid du Colombier
1486*593dc095SDavid du Colombier sptr -= width_mmx;
1487*593dc095SDavid du Colombier dp -= width_mmx*2;
1488*593dc095SDavid du Colombier for (i = width; i; i--)
1489*593dc095SDavid du Colombier {
1490*593dc095SDavid du Colombier int j;
1491*593dc095SDavid du Colombier
1492*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1493*593dc095SDavid du Colombier {
1494*593dc095SDavid du Colombier *dp-- = *sptr;
1495*593dc095SDavid du Colombier }
1496*593dc095SDavid du Colombier sptr --;
1497*593dc095SDavid du Colombier }
1498*593dc095SDavid du Colombier }
1499*593dc095SDavid du Colombier } /* end of pixel_bytes == 1 */
1500*593dc095SDavid du Colombier
1501*593dc095SDavid du Colombier else if (pixel_bytes == 2)
1502*593dc095SDavid du Colombier {
1503*593dc095SDavid du Colombier if (((pass == 0) || (pass == 1)) && width)
1504*593dc095SDavid du Colombier {
1505*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1);
1506*593dc095SDavid du Colombier width -= width_mmx;
1507*593dc095SDavid du Colombier if (width_mmx)
1508*593dc095SDavid du Colombier {
1509*593dc095SDavid du Colombier _asm
1510*593dc095SDavid du Colombier {
1511*593dc095SDavid du Colombier mov esi, sptr
1512*593dc095SDavid du Colombier mov edi, dp
1513*593dc095SDavid du Colombier mov ecx, width_mmx
1514*593dc095SDavid du Colombier sub esi, 2
1515*593dc095SDavid du Colombier sub edi, 30
1516*593dc095SDavid du Colombier loop2_pass0:
1517*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X v1 v0 v3 v2
1518*593dc095SDavid du Colombier punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1519*593dc095SDavid du Colombier movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1520*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1521*593dc095SDavid du Colombier punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1522*593dc095SDavid du Colombier movq [edi], mm0
1523*593dc095SDavid du Colombier movq [edi + 8], mm0
1524*593dc095SDavid du Colombier movq [edi + 16], mm1
1525*593dc095SDavid du Colombier movq [edi + 24], mm1
1526*593dc095SDavid du Colombier sub esi, 4
1527*593dc095SDavid du Colombier sub edi, 32
1528*593dc095SDavid du Colombier sub ecx, 2
1529*593dc095SDavid du Colombier jnz loop2_pass0
1530*593dc095SDavid du Colombier EMMS
1531*593dc095SDavid du Colombier }
1532*593dc095SDavid du Colombier }
1533*593dc095SDavid du Colombier
1534*593dc095SDavid du Colombier sptr -= (width_mmx*2 - 2); // sign fixed
1535*593dc095SDavid du Colombier dp -= (width_mmx*16 - 2); // sign fixed
1536*593dc095SDavid du Colombier for (i = width; i; i--)
1537*593dc095SDavid du Colombier {
1538*593dc095SDavid du Colombier png_byte v[8];
1539*593dc095SDavid du Colombier int j;
1540*593dc095SDavid du Colombier sptr -= 2;
1541*593dc095SDavid du Colombier png_memcpy(v, sptr, 2);
1542*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1543*593dc095SDavid du Colombier {
1544*593dc095SDavid du Colombier dp -= 2;
1545*593dc095SDavid du Colombier png_memcpy(dp, v, 2);
1546*593dc095SDavid du Colombier }
1547*593dc095SDavid du Colombier }
1548*593dc095SDavid du Colombier }
1549*593dc095SDavid du Colombier else if (((pass == 2) || (pass == 3)) && width)
1550*593dc095SDavid du Colombier {
1551*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) ;
1552*593dc095SDavid du Colombier width -= width_mmx;
1553*593dc095SDavid du Colombier if (width_mmx)
1554*593dc095SDavid du Colombier {
1555*593dc095SDavid du Colombier _asm
1556*593dc095SDavid du Colombier {
1557*593dc095SDavid du Colombier mov esi, sptr
1558*593dc095SDavid du Colombier mov edi, dp
1559*593dc095SDavid du Colombier mov ecx, width_mmx
1560*593dc095SDavid du Colombier sub esi, 2
1561*593dc095SDavid du Colombier sub edi, 14
1562*593dc095SDavid du Colombier loop2_pass2:
1563*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X v1 v0 v3 v2
1564*593dc095SDavid du Colombier punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1565*593dc095SDavid du Colombier movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1566*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1567*593dc095SDavid du Colombier punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1568*593dc095SDavid du Colombier movq [edi], mm0
1569*593dc095SDavid du Colombier sub esi, 4
1570*593dc095SDavid du Colombier movq [edi + 8], mm1
1571*593dc095SDavid du Colombier //sub esi, 4
1572*593dc095SDavid du Colombier sub edi, 16
1573*593dc095SDavid du Colombier sub ecx, 2
1574*593dc095SDavid du Colombier jnz loop2_pass2
1575*593dc095SDavid du Colombier EMMS
1576*593dc095SDavid du Colombier }
1577*593dc095SDavid du Colombier }
1578*593dc095SDavid du Colombier
1579*593dc095SDavid du Colombier sptr -= (width_mmx*2 - 2); // sign fixed
1580*593dc095SDavid du Colombier dp -= (width_mmx*8 - 2); // sign fixed
1581*593dc095SDavid du Colombier for (i = width; i; i--)
1582*593dc095SDavid du Colombier {
1583*593dc095SDavid du Colombier png_byte v[8];
1584*593dc095SDavid du Colombier int j;
1585*593dc095SDavid du Colombier sptr -= 2;
1586*593dc095SDavid du Colombier png_memcpy(v, sptr, 2);
1587*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1588*593dc095SDavid du Colombier {
1589*593dc095SDavid du Colombier dp -= 2;
1590*593dc095SDavid du Colombier png_memcpy(dp, v, 2);
1591*593dc095SDavid du Colombier }
1592*593dc095SDavid du Colombier }
1593*593dc095SDavid du Colombier }
1594*593dc095SDavid du Colombier else if (width) // pass == 4 or 5
1595*593dc095SDavid du Colombier {
1596*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) ;
1597*593dc095SDavid du Colombier width -= width_mmx;
1598*593dc095SDavid du Colombier if (width_mmx)
1599*593dc095SDavid du Colombier {
1600*593dc095SDavid du Colombier _asm
1601*593dc095SDavid du Colombier {
1602*593dc095SDavid du Colombier mov esi, sptr
1603*593dc095SDavid du Colombier mov edi, dp
1604*593dc095SDavid du Colombier mov ecx, width_mmx
1605*593dc095SDavid du Colombier sub esi, 2
1606*593dc095SDavid du Colombier sub edi, 6
1607*593dc095SDavid du Colombier loop2_pass4:
1608*593dc095SDavid du Colombier movd mm0, [esi] ; X X X X v1 v0 v3 v2
1609*593dc095SDavid du Colombier punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1610*593dc095SDavid du Colombier sub esi, 4
1611*593dc095SDavid du Colombier movq [edi], mm0
1612*593dc095SDavid du Colombier sub edi, 8
1613*593dc095SDavid du Colombier sub ecx, 2
1614*593dc095SDavid du Colombier jnz loop2_pass4
1615*593dc095SDavid du Colombier EMMS
1616*593dc095SDavid du Colombier }
1617*593dc095SDavid du Colombier }
1618*593dc095SDavid du Colombier
1619*593dc095SDavid du Colombier sptr -= (width_mmx*2 - 2); // sign fixed
1620*593dc095SDavid du Colombier dp -= (width_mmx*4 - 2); // sign fixed
1621*593dc095SDavid du Colombier for (i = width; i; i--)
1622*593dc095SDavid du Colombier {
1623*593dc095SDavid du Colombier png_byte v[8];
1624*593dc095SDavid du Colombier int j;
1625*593dc095SDavid du Colombier sptr -= 2;
1626*593dc095SDavid du Colombier png_memcpy(v, sptr, 2);
1627*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1628*593dc095SDavid du Colombier {
1629*593dc095SDavid du Colombier dp -= 2;
1630*593dc095SDavid du Colombier png_memcpy(dp, v, 2);
1631*593dc095SDavid du Colombier }
1632*593dc095SDavid du Colombier }
1633*593dc095SDavid du Colombier }
1634*593dc095SDavid du Colombier } /* end of pixel_bytes == 2 */
1635*593dc095SDavid du Colombier
1636*593dc095SDavid du Colombier else if (pixel_bytes == 4)
1637*593dc095SDavid du Colombier {
1638*593dc095SDavid du Colombier if (((pass == 0) || (pass == 1)) && width)
1639*593dc095SDavid du Colombier {
1640*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) ;
1641*593dc095SDavid du Colombier width -= width_mmx;
1642*593dc095SDavid du Colombier if (width_mmx)
1643*593dc095SDavid du Colombier {
1644*593dc095SDavid du Colombier _asm
1645*593dc095SDavid du Colombier {
1646*593dc095SDavid du Colombier mov esi, sptr
1647*593dc095SDavid du Colombier mov edi, dp
1648*593dc095SDavid du Colombier mov ecx, width_mmx
1649*593dc095SDavid du Colombier sub esi, 4
1650*593dc095SDavid du Colombier sub edi, 60
1651*593dc095SDavid du Colombier loop4_pass0:
1652*593dc095SDavid du Colombier movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1653*593dc095SDavid du Colombier movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1654*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1655*593dc095SDavid du Colombier punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1656*593dc095SDavid du Colombier movq [edi], mm0
1657*593dc095SDavid du Colombier movq [edi + 8], mm0
1658*593dc095SDavid du Colombier movq [edi + 16], mm0
1659*593dc095SDavid du Colombier movq [edi + 24], mm0
1660*593dc095SDavid du Colombier movq [edi+32], mm1
1661*593dc095SDavid du Colombier movq [edi + 40], mm1
1662*593dc095SDavid du Colombier movq [edi+ 48], mm1
1663*593dc095SDavid du Colombier sub esi, 8
1664*593dc095SDavid du Colombier movq [edi + 56], mm1
1665*593dc095SDavid du Colombier sub edi, 64
1666*593dc095SDavid du Colombier sub ecx, 2
1667*593dc095SDavid du Colombier jnz loop4_pass0
1668*593dc095SDavid du Colombier EMMS
1669*593dc095SDavid du Colombier }
1670*593dc095SDavid du Colombier }
1671*593dc095SDavid du Colombier
1672*593dc095SDavid du Colombier sptr -= (width_mmx*4 - 4); // sign fixed
1673*593dc095SDavid du Colombier dp -= (width_mmx*32 - 4); // sign fixed
1674*593dc095SDavid du Colombier for (i = width; i; i--)
1675*593dc095SDavid du Colombier {
1676*593dc095SDavid du Colombier png_byte v[8];
1677*593dc095SDavid du Colombier int j;
1678*593dc095SDavid du Colombier sptr -= 4;
1679*593dc095SDavid du Colombier png_memcpy(v, sptr, 4);
1680*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1681*593dc095SDavid du Colombier {
1682*593dc095SDavid du Colombier dp -= 4;
1683*593dc095SDavid du Colombier png_memcpy(dp, v, 4);
1684*593dc095SDavid du Colombier }
1685*593dc095SDavid du Colombier }
1686*593dc095SDavid du Colombier }
1687*593dc095SDavid du Colombier else if (((pass == 2) || (pass == 3)) && width)
1688*593dc095SDavid du Colombier {
1689*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) ;
1690*593dc095SDavid du Colombier width -= width_mmx;
1691*593dc095SDavid du Colombier if (width_mmx)
1692*593dc095SDavid du Colombier {
1693*593dc095SDavid du Colombier _asm
1694*593dc095SDavid du Colombier {
1695*593dc095SDavid du Colombier mov esi, sptr
1696*593dc095SDavid du Colombier mov edi, dp
1697*593dc095SDavid du Colombier mov ecx, width_mmx
1698*593dc095SDavid du Colombier sub esi, 4
1699*593dc095SDavid du Colombier sub edi, 28
1700*593dc095SDavid du Colombier loop4_pass2:
1701*593dc095SDavid du Colombier movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1702*593dc095SDavid du Colombier movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1703*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1704*593dc095SDavid du Colombier punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1705*593dc095SDavid du Colombier movq [edi], mm0
1706*593dc095SDavid du Colombier movq [edi + 8], mm0
1707*593dc095SDavid du Colombier movq [edi+16], mm1
1708*593dc095SDavid du Colombier movq [edi + 24], mm1
1709*593dc095SDavid du Colombier sub esi, 8
1710*593dc095SDavid du Colombier sub edi, 32
1711*593dc095SDavid du Colombier sub ecx, 2
1712*593dc095SDavid du Colombier jnz loop4_pass2
1713*593dc095SDavid du Colombier EMMS
1714*593dc095SDavid du Colombier }
1715*593dc095SDavid du Colombier }
1716*593dc095SDavid du Colombier
1717*593dc095SDavid du Colombier sptr -= (width_mmx*4 - 4); // sign fixed
1718*593dc095SDavid du Colombier dp -= (width_mmx*16 - 4); // sign fixed
1719*593dc095SDavid du Colombier for (i = width; i; i--)
1720*593dc095SDavid du Colombier {
1721*593dc095SDavid du Colombier png_byte v[8];
1722*593dc095SDavid du Colombier int j;
1723*593dc095SDavid du Colombier sptr -= 4;
1724*593dc095SDavid du Colombier png_memcpy(v, sptr, 4);
1725*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1726*593dc095SDavid du Colombier {
1727*593dc095SDavid du Colombier dp -= 4;
1728*593dc095SDavid du Colombier png_memcpy(dp, v, 4);
1729*593dc095SDavid du Colombier }
1730*593dc095SDavid du Colombier }
1731*593dc095SDavid du Colombier }
1732*593dc095SDavid du Colombier else if (width) // pass == 4 or 5
1733*593dc095SDavid du Colombier {
1734*593dc095SDavid du Colombier int width_mmx = ((width >> 1) << 1) ;
1735*593dc095SDavid du Colombier width -= width_mmx;
1736*593dc095SDavid du Colombier if (width_mmx)
1737*593dc095SDavid du Colombier {
1738*593dc095SDavid du Colombier _asm
1739*593dc095SDavid du Colombier {
1740*593dc095SDavid du Colombier mov esi, sptr
1741*593dc095SDavid du Colombier mov edi, dp
1742*593dc095SDavid du Colombier mov ecx, width_mmx
1743*593dc095SDavid du Colombier sub esi, 4
1744*593dc095SDavid du Colombier sub edi, 12
1745*593dc095SDavid du Colombier loop4_pass4:
1746*593dc095SDavid du Colombier movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1747*593dc095SDavid du Colombier movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1748*593dc095SDavid du Colombier punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1749*593dc095SDavid du Colombier punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1750*593dc095SDavid du Colombier movq [edi], mm0
1751*593dc095SDavid du Colombier sub esi, 8
1752*593dc095SDavid du Colombier movq [edi + 8], mm1
1753*593dc095SDavid du Colombier sub edi, 16
1754*593dc095SDavid du Colombier sub ecx, 2
1755*593dc095SDavid du Colombier jnz loop4_pass4
1756*593dc095SDavid du Colombier EMMS
1757*593dc095SDavid du Colombier }
1758*593dc095SDavid du Colombier }
1759*593dc095SDavid du Colombier
1760*593dc095SDavid du Colombier sptr -= (width_mmx*4 - 4); // sign fixed
1761*593dc095SDavid du Colombier dp -= (width_mmx*8 - 4); // sign fixed
1762*593dc095SDavid du Colombier for (i = width; i; i--)
1763*593dc095SDavid du Colombier {
1764*593dc095SDavid du Colombier png_byte v[8];
1765*593dc095SDavid du Colombier int j;
1766*593dc095SDavid du Colombier sptr -= 4;
1767*593dc095SDavid du Colombier png_memcpy(v, sptr, 4);
1768*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1769*593dc095SDavid du Colombier {
1770*593dc095SDavid du Colombier dp -= 4;
1771*593dc095SDavid du Colombier png_memcpy(dp, v, 4);
1772*593dc095SDavid du Colombier }
1773*593dc095SDavid du Colombier }
1774*593dc095SDavid du Colombier }
1775*593dc095SDavid du Colombier
1776*593dc095SDavid du Colombier } /* end of pixel_bytes == 4 */
1777*593dc095SDavid du Colombier
1778*593dc095SDavid du Colombier else if (pixel_bytes == 6)
1779*593dc095SDavid du Colombier {
1780*593dc095SDavid du Colombier for (i = width; i; i--)
1781*593dc095SDavid du Colombier {
1782*593dc095SDavid du Colombier png_byte v[8];
1783*593dc095SDavid du Colombier int j;
1784*593dc095SDavid du Colombier png_memcpy(v, sptr, 6);
1785*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1786*593dc095SDavid du Colombier {
1787*593dc095SDavid du Colombier png_memcpy(dp, v, 6);
1788*593dc095SDavid du Colombier dp -= 6;
1789*593dc095SDavid du Colombier }
1790*593dc095SDavid du Colombier sptr -= 6;
1791*593dc095SDavid du Colombier }
1792*593dc095SDavid du Colombier } /* end of pixel_bytes == 6 */
1793*593dc095SDavid du Colombier
1794*593dc095SDavid du Colombier else
1795*593dc095SDavid du Colombier {
1796*593dc095SDavid du Colombier for (i = width; i; i--)
1797*593dc095SDavid du Colombier {
1798*593dc095SDavid du Colombier png_byte v[8];
1799*593dc095SDavid du Colombier int j;
1800*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1801*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1802*593dc095SDavid du Colombier {
1803*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1804*593dc095SDavid du Colombier dp -= pixel_bytes;
1805*593dc095SDavid du Colombier }
1806*593dc095SDavid du Colombier sptr-= pixel_bytes;
1807*593dc095SDavid du Colombier }
1808*593dc095SDavid du Colombier }
1809*593dc095SDavid du Colombier } /* end of mmx_supported */
1810*593dc095SDavid du Colombier
1811*593dc095SDavid du Colombier else /* MMX not supported: use modified C code - takes advantage
1812*593dc095SDavid du Colombier * of inlining of memcpy for a constant */
1813*593dc095SDavid du Colombier {
1814*593dc095SDavid du Colombier if (pixel_bytes == 1)
1815*593dc095SDavid du Colombier {
1816*593dc095SDavid du Colombier for (i = width; i; i--)
1817*593dc095SDavid du Colombier {
1818*593dc095SDavid du Colombier int j;
1819*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1820*593dc095SDavid du Colombier *dp-- = *sptr;
1821*593dc095SDavid du Colombier sptr--;
1822*593dc095SDavid du Colombier }
1823*593dc095SDavid du Colombier }
1824*593dc095SDavid du Colombier else if (pixel_bytes == 3)
1825*593dc095SDavid du Colombier {
1826*593dc095SDavid du Colombier for (i = width; i; i--)
1827*593dc095SDavid du Colombier {
1828*593dc095SDavid du Colombier png_byte v[8];
1829*593dc095SDavid du Colombier int j;
1830*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1831*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1832*593dc095SDavid du Colombier {
1833*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1834*593dc095SDavid du Colombier dp -= pixel_bytes;
1835*593dc095SDavid du Colombier }
1836*593dc095SDavid du Colombier sptr -= pixel_bytes;
1837*593dc095SDavid du Colombier }
1838*593dc095SDavid du Colombier }
1839*593dc095SDavid du Colombier else if (pixel_bytes == 2)
1840*593dc095SDavid du Colombier {
1841*593dc095SDavid du Colombier for (i = width; i; i--)
1842*593dc095SDavid du Colombier {
1843*593dc095SDavid du Colombier png_byte v[8];
1844*593dc095SDavid du Colombier int j;
1845*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1846*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1847*593dc095SDavid du Colombier {
1848*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1849*593dc095SDavid du Colombier dp -= pixel_bytes;
1850*593dc095SDavid du Colombier }
1851*593dc095SDavid du Colombier sptr -= pixel_bytes;
1852*593dc095SDavid du Colombier }
1853*593dc095SDavid du Colombier }
1854*593dc095SDavid du Colombier else if (pixel_bytes == 4)
1855*593dc095SDavid du Colombier {
1856*593dc095SDavid du Colombier for (i = width; i; i--)
1857*593dc095SDavid du Colombier {
1858*593dc095SDavid du Colombier png_byte v[8];
1859*593dc095SDavid du Colombier int j;
1860*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1861*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1862*593dc095SDavid du Colombier {
1863*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1864*593dc095SDavid du Colombier dp -= pixel_bytes;
1865*593dc095SDavid du Colombier }
1866*593dc095SDavid du Colombier sptr -= pixel_bytes;
1867*593dc095SDavid du Colombier }
1868*593dc095SDavid du Colombier }
1869*593dc095SDavid du Colombier else if (pixel_bytes == 6)
1870*593dc095SDavid du Colombier {
1871*593dc095SDavid du Colombier for (i = width; i; i--)
1872*593dc095SDavid du Colombier {
1873*593dc095SDavid du Colombier png_byte v[8];
1874*593dc095SDavid du Colombier int j;
1875*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1876*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1877*593dc095SDavid du Colombier {
1878*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1879*593dc095SDavid du Colombier dp -= pixel_bytes;
1880*593dc095SDavid du Colombier }
1881*593dc095SDavid du Colombier sptr -= pixel_bytes;
1882*593dc095SDavid du Colombier }
1883*593dc095SDavid du Colombier }
1884*593dc095SDavid du Colombier else
1885*593dc095SDavid du Colombier {
1886*593dc095SDavid du Colombier for (i = width; i; i--)
1887*593dc095SDavid du Colombier {
1888*593dc095SDavid du Colombier png_byte v[8];
1889*593dc095SDavid du Colombier int j;
1890*593dc095SDavid du Colombier png_memcpy(v, sptr, pixel_bytes);
1891*593dc095SDavid du Colombier for (j = 0; j < png_pass_inc[pass]; j++)
1892*593dc095SDavid du Colombier {
1893*593dc095SDavid du Colombier png_memcpy(dp, v, pixel_bytes);
1894*593dc095SDavid du Colombier dp -= pixel_bytes;
1895*593dc095SDavid du Colombier }
1896*593dc095SDavid du Colombier sptr -= pixel_bytes;
1897*593dc095SDavid du Colombier }
1898*593dc095SDavid du Colombier }
1899*593dc095SDavid du Colombier
1900*593dc095SDavid du Colombier } /* end of MMX not supported */
1901*593dc095SDavid du Colombier break;
1902*593dc095SDavid du Colombier }
1903*593dc095SDavid du Colombier } /* end switch (row_info->pixel_depth) */
1904*593dc095SDavid du Colombier
1905*593dc095SDavid du Colombier row_info->width = final_width;
1906*593dc095SDavid du Colombier
1907*593dc095SDavid du Colombier row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1908*593dc095SDavid du Colombier }
1909*593dc095SDavid du Colombier
1910*593dc095SDavid du Colombier }
1911*593dc095SDavid du Colombier
1912*593dc095SDavid du Colombier #endif /* PNG_READ_INTERLACING_SUPPORTED */
1913*593dc095SDavid du Colombier
1914*593dc095SDavid du Colombier
1915*593dc095SDavid du Colombier // These variables are utilized in the functions below. They are declared
1916*593dc095SDavid du Colombier // globally here to ensure alignment on 8-byte boundaries.
//
// Each object is a 64-bit quantity that the inline assembly loads with movq
// or uses as a psrlq/psllq shift count.  Only the first member (`use`) is
// initialized or assigned in the code visible here.
//
//   LBCarryMask   0x01 in every byte: isolates the low bit of each byte.
//   HBClearMask   0x7f in every byte: clears bit 7 of each byte after a
//                 64-bit right shift by one.
//   ActiveMask, ShiftBpp, ShiftRem
//                 assigned by the filter routine for the current bpp before
//                 its MMX loop runs.
//
// NOTE(review): these are writable file-scope objects that a filter call
// assigns on entry to some bpp cases, so two calls running at the same time
// would presumably interfere with each other -- confirm against the callers.
1917*593dc095SDavid du Colombier
1918*593dc095SDavid du Colombier union uAll {
1919*593dc095SDavid du Colombier __int64 use; // the 64-bit value actually read by the assembly
1920*593dc095SDavid du Colombier double align; // never accessed by name here; presumably present only to force alignment
1921*593dc095SDavid du Colombier } LBCarryMask = {0x0101010101010101},
1922*593dc095SDavid du Colombier HBClearMask = {0x7f7f7f7f7f7f7f7f},
1923*593dc095SDavid du Colombier ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1924*593dc095SDavid du Colombier
1925*593dc095SDavid du Colombier
1926*593dc095SDavid du Colombier // Optimized code for PNG Average filter decoder
//
// Reverses the "Average" row filter in place.  Every byte of `row` is
// replaced by
//     Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
// where Prior(x) is the byte at the same offset in `prev_row` and the
// Raw(x-bpp) term is omitted for the first bpp bytes.
//
//   row_info   supplies pixel_depth (rounded up to whole bytes to get bpp)
//              and rowbytes (the number of bytes to decode).
//   row        filtered bytes on entry, decoded bytes on return.
//   prev_row   bytes of the previous row; only read.
//
// The work is done in four phases:
//   1. scalar loop over the first bpp bytes;
//   2. scalar loop up to offset `diff`, where row + diff is 8-byte aligned;
//   3. an MMX loop, selected by bpp, that decodes 8 bytes per iteration
//      up to offset MMXLength;
//   4. scalar loop over the bytes from MMXLength to rowbytes.
//
// NOTE(review): phase 2 stops only when the offset reaches `diff`, and the
// phase 3 loops test their bound at the bottom, so neither checks rowbytes
// before writing.  Presumably the caller only selects this routine for rows
// that are long enough -- confirm.
1927*593dc095SDavid du Colombier void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info,png_bytep row,png_bytep prev_row)1928*593dc095SDavid du Colombier png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1929*593dc095SDavid du Colombier , png_bytep prev_row)
1930*593dc095SDavid du Colombier {
1931*593dc095SDavid du Colombier int bpp; // bytes per pixel, rounded up
1932*593dc095SDavid du Colombier png_uint_32 FullLength; // total bytes in the row
1933*593dc095SDavid du Colombier png_uint_32 MMXLength; // offset at which the MMX loop stops
1934*593dc095SDavid du Colombier //png_uint_32 len;
1935*593dc095SDavid du Colombier int diff; // offset from row at which the MMX loop starts
1936*593dc095SDavid du Colombier
1937*593dc095SDavid du Colombier bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1938*593dc095SDavid du Colombier FullLength = row_info->rowbytes; // # of bytes to filter
1939*593dc095SDavid du Colombier _asm {
1940*593dc095SDavid du Colombier // Init address pointers and offset
1941*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
1942*593dc095SDavid du Colombier xor ebx, ebx // ebx ==> x
1943*593dc095SDavid du Colombier mov edx, edi
1944*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
1945*593dc095SDavid du Colombier sub edx, bpp // edx ==> Raw(x-bpp)
1946*593dc095SDavid du Colombier
1947*593dc095SDavid du Colombier xor eax, eax
1948*593dc095SDavid du Colombier // Compute the Raw value for the first bpp bytes
1949*593dc095SDavid du Colombier // Raw(x) = Avg(x) + (Prior(x)/2)
1950*593dc095SDavid du Colombier davgrlp:
1951*593dc095SDavid du Colombier mov al, [esi + ebx] // Load al with Prior(x)
1952*593dc095SDavid du Colombier inc ebx
1953*593dc095SDavid du Colombier shr al, 1 // divide by 2
1954*593dc095SDavid du Colombier add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1955*593dc095SDavid du Colombier cmp ebx, bpp
1956*593dc095SDavid du Colombier mov [edi+ebx-1], al // Write back Raw(x);
1957*593dc095SDavid du Colombier // mov does not affect flags; -1 to offset inc ebx
1958*593dc095SDavid du Colombier jb davgrlp
1959*593dc095SDavid du Colombier // get # of bytes to alignment
// diff = ((row + ebx + 0xf) & ~7) - row, so row + diff is the first
// 8-byte aligned address at or beyond row + ebx + 8.  The 8 bytes
// just below that address therefore start at or after row + ebx.
1960*593dc095SDavid du Colombier mov diff, edi // take start of row
1961*593dc095SDavid du Colombier add diff, ebx // add bpp
1962*593dc095SDavid du Colombier add diff, 0xf // add 7 + 8 to incr past alignment boundary
1963*593dc095SDavid du Colombier and diff, 0xfffffff8 // mask to alignment boundary
1964*593dc095SDavid du Colombier sub diff, edi // subtract from start ==> value ebx at alignment
// NOTE(review): diff is at least ebx + 8 by the arithmetic above, so
// this branch appears never to be taken -- confirm.
1965*593dc095SDavid du Colombier jz davggo
1966*593dc095SDavid du Colombier // fix alignment
1967*593dc095SDavid du Colombier // Compute the Raw value for the bytes up to the alignment boundary
1968*593dc095SDavid du Colombier // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1969*593dc095SDavid du Colombier xor ecx, ecx
1970*593dc095SDavid du Colombier davglp1:
1971*593dc095SDavid du Colombier xor eax, eax
1972*593dc095SDavid du Colombier mov cl, [esi + ebx] // load cl with Prior(x)
1973*593dc095SDavid du Colombier mov al, [edx + ebx] // load al with Raw(x-bpp)
1974*593dc095SDavid du Colombier add ax, cx
1975*593dc095SDavid du Colombier inc ebx
1976*593dc095SDavid du Colombier shr ax, 1 // divide by 2
1977*593dc095SDavid du Colombier add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1978*593dc095SDavid du Colombier cmp ebx, diff // Check if at alignment boundary
1979*593dc095SDavid du Colombier mov [edi+ebx-1], al // Write back Raw(x);
1980*593dc095SDavid du Colombier // mov does not affect flags; -1 to offset inc ebx
1981*593dc095SDavid du Colombier jb davglp1 // Repeat until at alignment boundary
1982*593dc095SDavid du Colombier davggo:
// MMXLength = FullLength - ((FullLength - ebx) & 7): the row length
// less the bytes that do not fill a whole 8-byte group after offset ebx.
1983*593dc095SDavid du Colombier mov eax, FullLength
1984*593dc095SDavid du Colombier mov ecx, eax
1985*593dc095SDavid du Colombier sub eax, ebx // subtract alignment fix
1986*593dc095SDavid du Colombier and eax, 0x00000007 // calc bytes over mult of 8
1987*593dc095SDavid du Colombier sub ecx, eax // drop over bytes from original length
1988*593dc095SDavid du Colombier mov MMXLength, ecx
1989*593dc095SDavid du Colombier } // end _asm block
1990*593dc095SDavid du Colombier // Now do the math for the rest of the row
// The MMX cases build each per-byte average without widening:
// (a >> 1) + (b >> 1), plus 1 where the low bits of both a and b are set
// (picked out with LBCarryMask).  HBClearMask removes the bit that the
// 64-bit psrlq moves into bit 7 of each byte from its neighbour.
1991*593dc095SDavid du Colombier switch ( bpp )
1992*593dc095SDavid du Colombier {
1993*593dc095SDavid du Colombier case 3:
1994*593dc095SDavid du Colombier {
// Three active groups per 8 bytes: bytes 0-2, bytes 3-5, bytes 6-7.
1995*593dc095SDavid du Colombier ActiveMask.use = 0x0000000000ffffff;
1996*593dc095SDavid du Colombier ShiftBpp.use = 24; // == 3 * 8
1997*593dc095SDavid du Colombier ShiftRem.use = 40; // == 64 - 24
1998*593dc095SDavid du Colombier _asm {
1999*593dc095SDavid du Colombier // Re-init address pointers and offset
2000*593dc095SDavid du Colombier movq mm7, ActiveMask
2001*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2002*593dc095SDavid du Colombier movq mm5, LBCarryMask
2003*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2004*593dc095SDavid du Colombier movq mm4, HBClearMask
2005*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2006*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2007*593dc095SDavid du Colombier movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2008*593dc095SDavid du Colombier // (we correct position in loop below)
2009*593dc095SDavid du Colombier davg3lp:
2010*593dc095SDavid du Colombier movq mm0, [edi + ebx] // Load mm0 with Avg(x)
2011*593dc095SDavid du Colombier // Add (Prev_row/2) to Average
2012*593dc095SDavid du Colombier movq mm3, mm5
2013*593dc095SDavid du Colombier psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
2014*593dc095SDavid du Colombier movq mm1, [esi + ebx] // Load mm1 with Prior(x)
2015*593dc095SDavid du Colombier movq mm6, mm7
2016*593dc095SDavid du Colombier pand mm3, mm1 // get lsb for each prev_row byte
2017*593dc095SDavid du Colombier psrlq mm1, 1 // divide prev_row bytes by 2
2018*593dc095SDavid du Colombier pand mm1, mm4 // clear invalid bit 7 of each byte
2019*593dc095SDavid du Colombier paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2020*593dc095SDavid du Colombier // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2021*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2022*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2023*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2024*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2025*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2026*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2027*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2028*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2029*593dc095SDavid du Colombier // byte
2030*593dc095SDavid du Colombier // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2031*593dc095SDavid du Colombier psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2032*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2033*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2034*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2035*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2036*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2037*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2038*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2039*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2040*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2041*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2042*593dc095SDavid du Colombier // byte
2043*593dc095SDavid du Colombier
2044*593dc095SDavid du Colombier // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2045*593dc095SDavid du Colombier psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2046*593dc095SDavid du Colombier // bytes
2047*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2048*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2049*593dc095SDavid du Colombier // Data only needs to be shifted once here to
2050*593dc095SDavid du Colombier // get the correct x-bpp offset.
2051*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2052*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2053*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2054*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2055*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2056*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2057*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2058*593dc095SDavid du Colombier add ebx, 8
2059*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2060*593dc095SDavid du Colombier // byte
2061*593dc095SDavid du Colombier
2062*593dc095SDavid du Colombier // Now ready to write back to memory
2063*593dc095SDavid du Colombier movq [edi + ebx - 8], mm0
2064*593dc095SDavid du Colombier // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2065*593dc095SDavid du Colombier cmp ebx, MMXLength
2066*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raw(x) to mm2
2067*593dc095SDavid du Colombier jb davg3lp
2068*593dc095SDavid du Colombier } // end _asm block
2069*593dc095SDavid du Colombier }
2070*593dc095SDavid du Colombier break;
2071*593dc095SDavid du Colombier
2072*593dc095SDavid du Colombier case 6:
2073*593dc095SDavid du Colombier case 4:
2074*593dc095SDavid du Colombier case 7:
2075*593dc095SDavid du Colombier case 5:
2076*593dc095SDavid du Colombier {
// Two active groups per 8 bytes: the low bpp bytes, then the
// remaining 8 - bpp bytes.  Both masks are derived from the all-ones
// ActiveMask by shifting, so one code path serves all four widths.
2077*593dc095SDavid du Colombier ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2078*593dc095SDavid du Colombier // appropriate inactive bytes
2079*593dc095SDavid du Colombier ShiftBpp.use = bpp << 3;
2080*593dc095SDavid du Colombier ShiftRem.use = 64 - ShiftBpp.use;
2081*593dc095SDavid du Colombier _asm {
2082*593dc095SDavid du Colombier movq mm4, HBClearMask
2083*593dc095SDavid du Colombier // Re-init address pointers and offset
2084*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2085*593dc095SDavid du Colombier // Load ActiveMask and clear all bytes except for 1st active group
2086*593dc095SDavid du Colombier movq mm7, ActiveMask
2087*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2088*593dc095SDavid du Colombier psrlq mm7, ShiftRem
2089*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2090*593dc095SDavid du Colombier movq mm6, mm7
2091*593dc095SDavid du Colombier movq mm5, LBCarryMask
2092*593dc095SDavid du Colombier psllq mm6, ShiftBpp // Create mask for 2nd active group
2093*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2094*593dc095SDavid du Colombier movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2095*593dc095SDavid du Colombier // (we correct position in loop below)
2096*593dc095SDavid du Colombier davg4lp:
2097*593dc095SDavid du Colombier movq mm0, [edi + ebx]
2098*593dc095SDavid du Colombier psrlq mm2, ShiftRem // shift data to position correctly
2099*593dc095SDavid du Colombier movq mm1, [esi + ebx]
2100*593dc095SDavid du Colombier // Add (Prev_row/2) to Average
2101*593dc095SDavid du Colombier movq mm3, mm5
2102*593dc095SDavid du Colombier pand mm3, mm1 // get lsb for each prev_row byte
2103*593dc095SDavid du Colombier psrlq mm1, 1 // divide prev_row bytes by 2
2104*593dc095SDavid du Colombier pand mm1, mm4 // clear invalid bit 7 of each byte
2105*593dc095SDavid du Colombier paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2106*593dc095SDavid du Colombier // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2107*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2108*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2109*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2110*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2111*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2112*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2113*593dc095SDavid du Colombier pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2114*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2115*593dc095SDavid du Colombier // byte
2116*593dc095SDavid du Colombier // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2117*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2118*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2119*593dc095SDavid du Colombier add ebx, 8
2120*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2121*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2122*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2123*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2124*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2125*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2126*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2127*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2128*593dc095SDavid du Colombier // byte
2129*593dc095SDavid du Colombier cmp ebx, MMXLength
2130*593dc095SDavid du Colombier // Now ready to write back to memory
2131*593dc095SDavid du Colombier movq [edi + ebx - 8], mm0
2132*593dc095SDavid du Colombier // Prep Raw(x-bpp) for next loop
2133*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2134*593dc095SDavid du Colombier jb davg4lp
2135*593dc095SDavid du Colombier } // end _asm block
2136*593dc095SDavid du Colombier }
2137*593dc095SDavid du Colombier break;
2138*593dc095SDavid du Colombier case 2:
2139*593dc095SDavid du Colombier {
// Four active groups of two bytes each per 8 bytes.
2140*593dc095SDavid du Colombier ActiveMask.use = 0x000000000000ffff;
2141*593dc095SDavid du Colombier ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2142*593dc095SDavid du Colombier ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2143*593dc095SDavid du Colombier _asm {
2144*593dc095SDavid du Colombier // Load ActiveMask
2145*593dc095SDavid du Colombier movq mm7, ActiveMask
2146*593dc095SDavid du Colombier // Re-init address pointers and offset
2147*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2148*593dc095SDavid du Colombier movq mm5, LBCarryMask
2149*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2150*593dc095SDavid du Colombier movq mm4, HBClearMask
2151*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2152*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2153*593dc095SDavid du Colombier movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2154*593dc095SDavid du Colombier // (we correct position in loop below)
2155*593dc095SDavid du Colombier davg2lp:
2156*593dc095SDavid du Colombier movq mm0, [edi + ebx]
2157*593dc095SDavid du Colombier psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2158*593dc095SDavid du Colombier movq mm1, [esi + ebx]
2159*593dc095SDavid du Colombier // Add (Prev_row/2) to Average
2160*593dc095SDavid du Colombier movq mm3, mm5
2161*593dc095SDavid du Colombier pand mm3, mm1 // get lsb for each prev_row byte
2162*593dc095SDavid du Colombier psrlq mm1, 1 // divide prev_row bytes by 2
2163*593dc095SDavid du Colombier pand mm1, mm4 // clear invalid bit 7 of each byte
2164*593dc095SDavid du Colombier movq mm6, mm7
2165*593dc095SDavid du Colombier paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2166*593dc095SDavid du Colombier // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2167*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2168*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2169*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2170*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2171*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2172*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2173*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2174*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2175*593dc095SDavid du Colombier // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2176*593dc095SDavid du Colombier psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2177*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2178*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2179*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2180*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2181*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2182*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2183*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2184*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2185*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2186*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2187*593dc095SDavid du Colombier
2188*593dc095SDavid du Colombier // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2189*593dc095SDavid du Colombier psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2190*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2191*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2192*593dc095SDavid du Colombier // Data only needs to be shifted once here to
2193*593dc095SDavid du Colombier // get the correct x-bpp offset.
2194*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2195*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2196*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2197*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2198*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2199*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2200*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2201*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2202*593dc095SDavid du Colombier
2203*593dc095SDavid du Colombier // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2204*593dc095SDavid du Colombier psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2205*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2206*593dc095SDavid du Colombier psllq mm2, ShiftBpp // shift data to position correctly
2207*593dc095SDavid du Colombier // Data only needs to be shifted once here to
2208*593dc095SDavid du Colombier // get the correct x-bpp offset.
2209*593dc095SDavid du Colombier add ebx, 8
2210*593dc095SDavid du Colombier movq mm1, mm3 // now use mm1 for getting LBCarrys
2211*593dc095SDavid du Colombier pand mm1, mm2 // get LBCarrys for each byte where both
2212*593dc095SDavid du Colombier // lsb's were == 1 (Only valid for active group)
2213*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2214*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2215*593dc095SDavid du Colombier paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2216*593dc095SDavid du Colombier pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2217*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2218*593dc095SDavid du Colombier
2219*593dc095SDavid du Colombier cmp ebx, MMXLength
2220*593dc095SDavid du Colombier // Now ready to write back to memory
2221*593dc095SDavid du Colombier movq [edi + ebx - 8], mm0
2222*593dc095SDavid du Colombier // Prep Raw(x-bpp) for next loop
2223*593dc095SDavid du Colombier movq mm2, mm0 // mov updated Raws to mm2
2224*593dc095SDavid du Colombier jb davg2lp
2225*593dc095SDavid du Colombier } // end _asm block
2226*593dc095SDavid du Colombier }
2227*593dc095SDavid du Colombier break;
2228*593dc095SDavid du Colombier
2229*593dc095SDavid du Colombier case 1: // bpp == 1
2230*593dc095SDavid du Colombier {
// No MMX here: every byte from `diff` to the end of the row is decoded
// by the scalar loop below.
2231*593dc095SDavid du Colombier _asm {
2232*593dc095SDavid du Colombier // Re-init address pointers and offset
2233*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2234*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2235*593dc095SDavid du Colombier cmp ebx, FullLength // Test if offset at end of array
2236*593dc095SDavid du Colombier jnb davg1end
2237*593dc095SDavid du Colombier // Do Avg decode for remaining bytes
2238*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2239*593dc095SDavid du Colombier mov edx, edi
2240*593dc095SDavid du Colombier xor ecx, ecx // zero ecx before using cl & cx in loop below
2241*593dc095SDavid du Colombier sub edx, bpp // edx ==> Raw(x-bpp)
2242*593dc095SDavid du Colombier davg1lp:
2243*593dc095SDavid du Colombier // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2244*593dc095SDavid du Colombier xor eax, eax
2245*593dc095SDavid du Colombier mov cl, [esi + ebx] // load cl with Prior(x)
2246*593dc095SDavid du Colombier mov al, [edx + ebx] // load al with Raw(x-bpp)
2247*593dc095SDavid du Colombier add ax, cx
2248*593dc095SDavid du Colombier inc ebx
2249*593dc095SDavid du Colombier shr ax, 1 // divide by 2
2250*593dc095SDavid du Colombier add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2251*593dc095SDavid du Colombier cmp ebx, FullLength // Check if at end of array
2252*593dc095SDavid du Colombier mov [edi+ebx-1], al // Write back Raw(x);
2253*593dc095SDavid du Colombier // mov does not affect flags; -1 to offset inc ebx
2254*593dc095SDavid du Colombier jb davg1lp
2255*593dc095SDavid du Colombier davg1end:
2256*593dc095SDavid du Colombier } // end _asm block
2257*593dc095SDavid du Colombier }
// return rather than break: the row is already complete, so the shared
// clean-up loop is skipped.  The emms there is skipped too; no MMX
// instruction has been executed on this path.
2258*593dc095SDavid du Colombier return;
2259*593dc095SDavid du Colombier
2260*593dc095SDavid du Colombier case 8: // bpp == 8
2261*593dc095SDavid du Colombier {
// Raw(x-bpp) for a whole 8-byte group is exactly the previous 8-byte
// group, so no masking or shifting of mm2 is needed.
2262*593dc095SDavid du Colombier _asm {
2263*593dc095SDavid du Colombier // Re-init address pointers and offset
2264*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2265*593dc095SDavid du Colombier movq mm5, LBCarryMask
2266*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2267*593dc095SDavid du Colombier movq mm4, HBClearMask
2268*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2269*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2270*593dc095SDavid du Colombier movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2271*593dc095SDavid du Colombier // (NO NEED to correct position in loop below)
2272*593dc095SDavid du Colombier davg8lp:
2273*593dc095SDavid du Colombier movq mm0, [edi + ebx]
2274*593dc095SDavid du Colombier movq mm3, mm5
2275*593dc095SDavid du Colombier movq mm1, [esi + ebx]
2276*593dc095SDavid du Colombier add ebx, 8
2277*593dc095SDavid du Colombier pand mm3, mm1 // get lsb for each prev_row byte
2278*593dc095SDavid du Colombier psrlq mm1, 1 // divide prev_row bytes by 2
2279*593dc095SDavid du Colombier pand mm3, mm2 // get LBCarrys for each byte where both
2280*593dc095SDavid du Colombier // lsb's were == 1
2281*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2282*593dc095SDavid du Colombier pand mm1, mm4 // clear invalid bit 7 of each byte
2283*593dc095SDavid du Colombier paddb mm0, mm3 // add LBCarrys to Avg for each byte
2284*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2285*593dc095SDavid du Colombier paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2286*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2287*593dc095SDavid du Colombier cmp ebx, MMXLength
2288*593dc095SDavid du Colombier movq [edi + ebx - 8], mm0
2289*593dc095SDavid du Colombier movq mm2, mm0 // reuse as Raw(x-bpp)
2290*593dc095SDavid du Colombier jb davg8lp
2291*593dc095SDavid du Colombier } // end _asm block
2292*593dc095SDavid du Colombier }
2293*593dc095SDavid du Colombier break;
2294*593dc095SDavid du Colombier default: // bpp greater than 8
2295*593dc095SDavid du Colombier {
// Raw(x-bpp) is reloaded from memory through edx on every iteration
// instead of being carried over in mm2 from the previous one.
2296*593dc095SDavid du Colombier _asm {
2297*593dc095SDavid du Colombier movq mm5, LBCarryMask
2298*593dc095SDavid du Colombier // Re-init address pointers and offset
2299*593dc095SDavid du Colombier mov ebx, diff // ebx ==> x = offset to alignment boundary
2300*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2301*593dc095SDavid du Colombier movq mm4, HBClearMask
2302*593dc095SDavid du Colombier mov edx, edi
2303*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2304*593dc095SDavid du Colombier sub edx, bpp // edx ==> Raw(x-bpp)
2305*593dc095SDavid du Colombier davgAlp:
2306*593dc095SDavid du Colombier movq mm0, [edi + ebx]
2307*593dc095SDavid du Colombier movq mm3, mm5
2308*593dc095SDavid du Colombier movq mm1, [esi + ebx]
2309*593dc095SDavid du Colombier pand mm3, mm1 // get lsb for each prev_row byte
2310*593dc095SDavid du Colombier movq mm2, [edx + ebx]
2311*593dc095SDavid du Colombier psrlq mm1, 1 // divide prev_row bytes by 2
2312*593dc095SDavid du Colombier pand mm3, mm2 // get LBCarrys for each byte where both
2313*593dc095SDavid du Colombier // lsb's were == 1
2314*593dc095SDavid du Colombier psrlq mm2, 1 // divide raw bytes by 2
2315*593dc095SDavid du Colombier pand mm1, mm4 // clear invalid bit 7 of each byte
2316*593dc095SDavid du Colombier paddb mm0, mm3 // add LBCarrys to Avg for each byte
2317*593dc095SDavid du Colombier pand mm2, mm4 // clear invalid bit 7 of each byte
2318*593dc095SDavid du Colombier paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2319*593dc095SDavid du Colombier add ebx, 8
2320*593dc095SDavid du Colombier paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2321*593dc095SDavid du Colombier cmp ebx, MMXLength
2322*593dc095SDavid du Colombier movq [edi + ebx - 8], mm0
2323*593dc095SDavid du Colombier jb davgAlp
2324*593dc095SDavid du Colombier } // end _asm block
2325*593dc095SDavid du Colombier }
2326*593dc095SDavid du Colombier break;
2327*593dc095SDavid du Colombier } // end switch ( bpp )
2328*593dc095SDavid du Colombier
2329*593dc095SDavid du Colombier _asm {
2330*593dc095SDavid du Colombier // MMX acceleration complete now do clean-up
2331*593dc095SDavid du Colombier // Check if any remaining bytes left to decode
2332*593dc095SDavid du Colombier mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2333*593dc095SDavid du Colombier mov edi, row // edi ==> Avg(x)
2334*593dc095SDavid du Colombier cmp ebx, FullLength // Test if offset at end of array
2335*593dc095SDavid du Colombier jnb davgend
2336*593dc095SDavid du Colombier // Do Avg decode for remaining bytes
2337*593dc095SDavid du Colombier mov esi, prev_row // esi ==> Prior(x)
2338*593dc095SDavid du Colombier mov edx, edi
2339*593dc095SDavid du Colombier xor ecx, ecx // zero ecx before using cl & cx in loop below
2340*593dc095SDavid du Colombier sub edx, bpp // edx ==> Raw(x-bpp)
2341*593dc095SDavid du Colombier davglp2:
2342*593dc095SDavid du Colombier // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2343*593dc095SDavid du Colombier xor eax, eax
2344*593dc095SDavid du Colombier mov cl, [esi + ebx] // load cl with Prior(x)
2345*593dc095SDavid du Colombier mov al, [edx + ebx] // load al with Raw(x-bpp)
2346*593dc095SDavid du Colombier add ax, cx
2347*593dc095SDavid du Colombier inc ebx
2348*593dc095SDavid du Colombier shr ax, 1 // divide by 2
2349*593dc095SDavid du Colombier add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2350*593dc095SDavid du Colombier cmp ebx, FullLength // Check if at end of array
2351*593dc095SDavid du Colombier mov [edi+ebx-1], al // Write back Raw(x);
2352*593dc095SDavid du Colombier // mov does not affect flags; -1 to offset inc ebx
2353*593dc095SDavid du Colombier jb davglp2
2354*593dc095SDavid du Colombier davgend:
2355*593dc095SDavid du Colombier emms // End MMX instructions; prep for possible FP instrs.
2356*593dc095SDavid du Colombier } // end _asm block
2357*593dc095SDavid du Colombier }
2358*593dc095SDavid du Colombier
2359*593dc095SDavid du Colombier // Optimized code for PNG Paeth filter decoder
2360*593dc095SDavid du Colombier void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info,png_bytep row,png_bytep prev_row)2361*593dc095SDavid du Colombier png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2362*593dc095SDavid du Colombier png_bytep prev_row)
2363*593dc095SDavid du Colombier {
2364*593dc095SDavid du Colombier png_uint_32 FullLength;
2365*593dc095SDavid du Colombier png_uint_32 MMXLength;
2366*593dc095SDavid du Colombier //png_uint_32 len;
2367*593dc095SDavid du Colombier int bpp;
2368*593dc095SDavid du Colombier int diff;
2369*593dc095SDavid du Colombier //int ptemp;
2370*593dc095SDavid du Colombier int patemp, pbtemp, pctemp;
2371*593dc095SDavid du Colombier
2372*593dc095SDavid du Colombier bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2373*593dc095SDavid du Colombier FullLength = row_info->rowbytes; // # of bytes to filter
2374*593dc095SDavid du Colombier _asm
2375*593dc095SDavid du Colombier {
2376*593dc095SDavid du Colombier xor ebx, ebx // ebx ==> x offset
2377*593dc095SDavid du Colombier mov edi, row
2378*593dc095SDavid du Colombier xor edx, edx // edx ==> x-bpp offset
2379*593dc095SDavid du Colombier mov esi, prev_row
2380*593dc095SDavid du Colombier xor eax, eax
2381*593dc095SDavid du Colombier
2382*593dc095SDavid du Colombier // Compute the Raw value for the first bpp bytes
2383*593dc095SDavid du Colombier // Note: for x < bpp the formula always reduces to
2384*593dc095SDavid du Colombier // Raw(x) = Paeth(x) + Prior(x)
2385*593dc095SDavid du Colombier dpthrlp:
2386*593dc095SDavid du Colombier mov al, [edi + ebx]
2387*593dc095SDavid du Colombier add al, [esi + ebx]
2388*593dc095SDavid du Colombier inc ebx
2389*593dc095SDavid du Colombier cmp ebx, bpp
2390*593dc095SDavid du Colombier mov [edi + ebx - 1], al
2391*593dc095SDavid du Colombier jb dpthrlp
2392*593dc095SDavid du Colombier // get # of bytes to alignment
2393*593dc095SDavid du Colombier mov diff, edi // take start of row
2394*593dc095SDavid du Colombier add diff, ebx // add bpp
2395*593dc095SDavid du Colombier xor ecx, ecx
2396*593dc095SDavid du Colombier add diff, 0xf // add 7 + 8 to incr past alignment boundary
2397*593dc095SDavid du Colombier and diff, 0xfffffff8 // mask to alignment boundary
2398*593dc095SDavid du Colombier sub diff, edi // subtract from start ==> value ebx at alignment
2399*593dc095SDavid du Colombier jz dpthgo
2400*593dc095SDavid du Colombier // fix alignment
2401*593dc095SDavid du Colombier dpthlp1:
2402*593dc095SDavid du Colombier xor eax, eax
2403*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2404*593dc095SDavid du Colombier mov al, [esi + ebx] // load Prior(x) into al
2405*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
2406*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
2407*593dc095SDavid du Colombier mov patemp, eax // Save pav for later use
2408*593dc095SDavid du Colombier xor eax, eax
2409*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2410*593dc095SDavid du Colombier mov al, [edi + edx] // load Raw(x-bpp) into al
2411*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
2412*593dc095SDavid du Colombier mov ecx, eax
2413*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2414*593dc095SDavid du Colombier add eax, patemp // pcv = pav + pbv
2415*593dc095SDavid du Colombier // pc = abs(pcv)
2416*593dc095SDavid du Colombier test eax, 0x80000000
2417*593dc095SDavid du Colombier jz dpthpca
2418*593dc095SDavid du Colombier neg eax // reverse sign of neg values
2419*593dc095SDavid du Colombier dpthpca:
2420*593dc095SDavid du Colombier mov pctemp, eax // save pc for later use
2421*593dc095SDavid du Colombier // pb = abs(pbv)
2422*593dc095SDavid du Colombier test ecx, 0x80000000
2423*593dc095SDavid du Colombier jz dpthpba
2424*593dc095SDavid du Colombier neg ecx // reverse sign of neg values
2425*593dc095SDavid du Colombier dpthpba:
2426*593dc095SDavid du Colombier mov pbtemp, ecx // save pb for later use
2427*593dc095SDavid du Colombier // pa = abs(pav)
2428*593dc095SDavid du Colombier mov eax, patemp
2429*593dc095SDavid du Colombier test eax, 0x80000000
2430*593dc095SDavid du Colombier jz dpthpaa
2431*593dc095SDavid du Colombier neg eax // reverse sign of neg values
2432*593dc095SDavid du Colombier dpthpaa:
2433*593dc095SDavid du Colombier mov patemp, eax // save pa for later use
2434*593dc095SDavid du Colombier // test if pa <= pb
2435*593dc095SDavid du Colombier cmp eax, ecx
2436*593dc095SDavid du Colombier jna dpthabb
2437*593dc095SDavid du Colombier // pa > pb; now test if pb <= pc
2438*593dc095SDavid du Colombier cmp ecx, pctemp
2439*593dc095SDavid du Colombier jna dpthbbc
2440*593dc095SDavid du Colombier // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2441*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
2442*593dc095SDavid du Colombier jmp dpthpaeth
2443*593dc095SDavid du Colombier dpthbbc:
2444*593dc095SDavid du Colombier // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2445*593dc095SDavid du Colombier mov cl, [esi + ebx] // load Prior(x) into cl
2446*593dc095SDavid du Colombier jmp dpthpaeth
2447*593dc095SDavid du Colombier dpthabb:
2448*593dc095SDavid du Colombier // pa <= pb; now test if pa <= pc
2449*593dc095SDavid du Colombier cmp eax, pctemp
2450*593dc095SDavid du Colombier jna dpthabc
2451*593dc095SDavid du Colombier // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2452*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
2453*593dc095SDavid du Colombier jmp dpthpaeth
2454*593dc095SDavid du Colombier dpthabc:
2455*593dc095SDavid du Colombier // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2456*593dc095SDavid du Colombier mov cl, [edi + edx] // load Raw(x-bpp) into cl
2457*593dc095SDavid du Colombier dpthpaeth:
2458*593dc095SDavid du Colombier inc ebx
2459*593dc095SDavid du Colombier inc edx
2460*593dc095SDavid du Colombier // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2461*593dc095SDavid du Colombier add [edi + ebx - 1], cl
2462*593dc095SDavid du Colombier cmp ebx, diff
2463*593dc095SDavid du Colombier jb dpthlp1
2464*593dc095SDavid du Colombier dpthgo:
2465*593dc095SDavid du Colombier mov ecx, FullLength
2466*593dc095SDavid du Colombier mov eax, ecx
2467*593dc095SDavid du Colombier sub eax, ebx // subtract alignment fix
2468*593dc095SDavid du Colombier and eax, 0x00000007 // calc bytes over mult of 8
2469*593dc095SDavid du Colombier sub ecx, eax // drop over bytes from original length
2470*593dc095SDavid du Colombier mov MMXLength, ecx
2471*593dc095SDavid du Colombier } // end _asm block
2472*593dc095SDavid du Colombier // Now do the math for the rest of the row
2473*593dc095SDavid du Colombier switch ( bpp )
2474*593dc095SDavid du Colombier {
2475*593dc095SDavid du Colombier case 3:
2476*593dc095SDavid du Colombier {
2477*593dc095SDavid du Colombier ActiveMask.use = 0x0000000000ffffff;
2478*593dc095SDavid du Colombier ActiveMaskEnd.use = 0xffff000000000000;
2479*593dc095SDavid du Colombier ShiftBpp.use = 24; // == bpp(3) * 8
2480*593dc095SDavid du Colombier ShiftRem.use = 40; // == 64 - 24
2481*593dc095SDavid du Colombier _asm
2482*593dc095SDavid du Colombier {
2483*593dc095SDavid du Colombier mov ebx, diff
2484*593dc095SDavid du Colombier mov edi, row
2485*593dc095SDavid du Colombier mov esi, prev_row
2486*593dc095SDavid du Colombier pxor mm0, mm0
2487*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2488*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
2489*593dc095SDavid du Colombier dpth3lp:
2490*593dc095SDavid du Colombier psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2491*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
2492*593dc095SDavid du Colombier punpcklbw mm1, mm0 // Unpack High bytes of a
2493*593dc095SDavid du Colombier movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2494*593dc095SDavid du Colombier punpcklbw mm2, mm0 // Unpack High bytes of b
2495*593dc095SDavid du Colombier psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2496*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2497*593dc095SDavid du Colombier movq mm4, mm2
2498*593dc095SDavid du Colombier punpcklbw mm3, mm0 // Unpack High bytes of c
2499*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2500*593dc095SDavid du Colombier movq mm5, mm1
2501*593dc095SDavid du Colombier psubw mm4, mm3
2502*593dc095SDavid du Colombier pxor mm7, mm7
2503*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2504*593dc095SDavid du Colombier movq mm6, mm4
2505*593dc095SDavid du Colombier psubw mm5, mm3
2506*593dc095SDavid du Colombier
2507*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2508*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2509*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2510*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2511*593dc095SDavid du Colombier paddw mm6, mm5
2512*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2513*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2514*593dc095SDavid du Colombier psubw mm4, mm0
2515*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2516*593dc095SDavid du Colombier psubw mm4, mm0
2517*593dc095SDavid du Colombier psubw mm5, mm7
2518*593dc095SDavid du Colombier pxor mm0, mm0
2519*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2520*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2521*593dc095SDavid du Colombier psubw mm5, mm7
2522*593dc095SDavid du Colombier psubw mm6, mm0
2523*593dc095SDavid du Colombier // test pa <= pb
2524*593dc095SDavid du Colombier movq mm7, mm4
2525*593dc095SDavid du Colombier psubw mm6, mm0
2526*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2527*593dc095SDavid du Colombier movq mm0, mm7
2528*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2529*593dc095SDavid du Colombier pand mm5, mm7
2530*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2531*593dc095SDavid du Colombier pand mm2, mm0
2532*593dc095SDavid du Colombier pandn mm7, mm4
2533*593dc095SDavid du Colombier pandn mm0, mm1
2534*593dc095SDavid du Colombier paddw mm7, mm5
2535*593dc095SDavid du Colombier paddw mm0, mm2
2536*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2537*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2538*593dc095SDavid du Colombier pxor mm1, mm1
2539*593dc095SDavid du Colombier pand mm3, mm7
2540*593dc095SDavid du Colombier pandn mm7, mm0
2541*593dc095SDavid du Colombier paddw mm7, mm3
2542*593dc095SDavid du Colombier pxor mm0, mm0
2543*593dc095SDavid du Colombier packuswb mm7, mm1
2544*593dc095SDavid du Colombier movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2545*593dc095SDavid du Colombier pand mm7, ActiveMask
2546*593dc095SDavid du Colombier movq mm2, mm3 // load b=Prior(x) step 1
2547*593dc095SDavid du Colombier paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2548*593dc095SDavid du Colombier punpcklbw mm3, mm0 // Unpack High bytes of c
2549*593dc095SDavid du Colombier movq [edi + ebx], mm7 // write back updated value
2550*593dc095SDavid du Colombier movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2551*593dc095SDavid du Colombier // Now do Paeth for 2nd set of bytes (3-5)
2552*593dc095SDavid du Colombier psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2553*593dc095SDavid du Colombier punpcklbw mm1, mm0 // Unpack High bytes of a
2554*593dc095SDavid du Colombier pxor mm7, mm7
2555*593dc095SDavid du Colombier punpcklbw mm2, mm0 // Unpack High bytes of b
2556*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2557*593dc095SDavid du Colombier movq mm5, mm1
2558*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2559*593dc095SDavid du Colombier movq mm4, mm2
2560*593dc095SDavid du Colombier psubw mm5, mm3
2561*593dc095SDavid du Colombier psubw mm4, mm3
2562*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2563*593dc095SDavid du Colombier // pav + pbv = pbv + pav
2564*593dc095SDavid du Colombier movq mm6, mm5
2565*593dc095SDavid du Colombier paddw mm6, mm4
2566*593dc095SDavid du Colombier
2567*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2568*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2569*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2570*593dc095SDavid du Colombier pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2571*593dc095SDavid du Colombier pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2572*593dc095SDavid du Colombier pand mm0, mm5 // Only pbv bytes < 0 in mm0
2573*593dc095SDavid du Colombier pand mm7, mm4 // Only pav bytes < 0 in mm7
2574*593dc095SDavid du Colombier psubw mm5, mm0
2575*593dc095SDavid du Colombier psubw mm4, mm7
2576*593dc095SDavid du Colombier psubw mm5, mm0
2577*593dc095SDavid du Colombier psubw mm4, mm7
2578*593dc095SDavid du Colombier pxor mm0, mm0
2579*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2580*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2581*593dc095SDavid du Colombier psubw mm6, mm0
2582*593dc095SDavid du Colombier // test pa <= pb
2583*593dc095SDavid du Colombier movq mm7, mm4
2584*593dc095SDavid du Colombier psubw mm6, mm0
2585*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2586*593dc095SDavid du Colombier movq mm0, mm7
2587*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2588*593dc095SDavid du Colombier pand mm5, mm7
2589*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2590*593dc095SDavid du Colombier pand mm2, mm0
2591*593dc095SDavid du Colombier pandn mm7, mm4
2592*593dc095SDavid du Colombier pandn mm0, mm1
2593*593dc095SDavid du Colombier paddw mm7, mm5
2594*593dc095SDavid du Colombier paddw mm0, mm2
2595*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2596*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2597*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
2598*593dc095SDavid du Colombier pand mm3, mm7
2599*593dc095SDavid du Colombier pandn mm7, mm0
2600*593dc095SDavid du Colombier pxor mm1, mm1
2601*593dc095SDavid du Colombier paddw mm7, mm3
2602*593dc095SDavid du Colombier pxor mm0, mm0
2603*593dc095SDavid du Colombier packuswb mm7, mm1
2604*593dc095SDavid du Colombier movq mm3, mm2 // load c=Prior(x-bpp) step 1
2605*593dc095SDavid du Colombier pand mm7, ActiveMask
2606*593dc095SDavid du Colombier punpckhbw mm2, mm0 // Unpack High bytes of b
2607*593dc095SDavid du Colombier psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2608*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2609*593dc095SDavid du Colombier movq mm4, mm2
2610*593dc095SDavid du Colombier paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2611*593dc095SDavid du Colombier psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2612*593dc095SDavid du Colombier movq [edi + ebx], mm7 // write back updated value
2613*593dc095SDavid du Colombier movq mm1, mm7
2614*593dc095SDavid du Colombier punpckhbw mm3, mm0 // Unpack High bytes of c
2615*593dc095SDavid du Colombier psllq mm1, ShiftBpp // Shift bytes
2616*593dc095SDavid du Colombier // Now mm1 will be used as Raw(x-bpp)
2617*593dc095SDavid du Colombier // Now do Paeth for 3rd, and final, set of bytes (6-7)
2618*593dc095SDavid du Colombier pxor mm7, mm7
2619*593dc095SDavid du Colombier punpckhbw mm1, mm0 // Unpack High bytes of a
2620*593dc095SDavid du Colombier psubw mm4, mm3
2621*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2622*593dc095SDavid du Colombier movq mm5, mm1
2623*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2624*593dc095SDavid du Colombier movq mm6, mm4
2625*593dc095SDavid du Colombier psubw mm5, mm3
2626*593dc095SDavid du Colombier pxor mm0, mm0
2627*593dc095SDavid du Colombier paddw mm6, mm5
2628*593dc095SDavid du Colombier
2629*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2630*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2631*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2632*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2633*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2634*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2635*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2636*593dc095SDavid du Colombier psubw mm4, mm0
2637*593dc095SDavid du Colombier psubw mm5, mm7
2638*593dc095SDavid du Colombier psubw mm4, mm0
2639*593dc095SDavid du Colombier psubw mm5, mm7
2640*593dc095SDavid du Colombier pxor mm0, mm0
2641*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2642*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2643*593dc095SDavid du Colombier psubw mm6, mm0
2644*593dc095SDavid du Colombier // test pa <= pb
2645*593dc095SDavid du Colombier movq mm7, mm4
2646*593dc095SDavid du Colombier psubw mm6, mm0
2647*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2648*593dc095SDavid du Colombier movq mm0, mm7
2649*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2650*593dc095SDavid du Colombier pand mm2, mm0
2651*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2652*593dc095SDavid du Colombier pand mm5, mm7
2653*593dc095SDavid du Colombier pandn mm0, mm1
2654*593dc095SDavid du Colombier pandn mm7, mm4
2655*593dc095SDavid du Colombier paddw mm0, mm2
2656*593dc095SDavid du Colombier paddw mm7, mm5
2657*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2658*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2659*593dc095SDavid du Colombier pand mm3, mm7
2660*593dc095SDavid du Colombier pandn mm7, mm0
2661*593dc095SDavid du Colombier paddw mm7, mm3
2662*593dc095SDavid du Colombier pxor mm1, mm1
2663*593dc095SDavid du Colombier packuswb mm1, mm7
2664*593dc095SDavid du Colombier // Step ebx to next set of 8 bytes and repeat loop til done
2665*593dc095SDavid du Colombier add ebx, 8
2666*593dc095SDavid du Colombier pand mm1, ActiveMaskEnd
2667*593dc095SDavid du Colombier paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2668*593dc095SDavid du Colombier
2669*593dc095SDavid du Colombier cmp ebx, MMXLength
2670*593dc095SDavid du Colombier pxor mm0, mm0 // pxor does not affect flags
2671*593dc095SDavid du Colombier movq [edi + ebx - 8], mm1 // write back updated value
2672*593dc095SDavid du Colombier // mm1 will be used as Raw(x-bpp) next loop
2673*593dc095SDavid du Colombier // mm3 ready to be used as Prior(x-bpp) next loop
2674*593dc095SDavid du Colombier jb dpth3lp
2675*593dc095SDavid du Colombier } // end _asm block
2676*593dc095SDavid du Colombier }
2677*593dc095SDavid du Colombier break;
2678*593dc095SDavid du Colombier
2679*593dc095SDavid du Colombier case 6:
2680*593dc095SDavid du Colombier case 7:
2681*593dc095SDavid du Colombier case 5:
2682*593dc095SDavid du Colombier {
2683*593dc095SDavid du Colombier ActiveMask.use = 0x00000000ffffffff;
2684*593dc095SDavid du Colombier ActiveMask2.use = 0xffffffff00000000;
2685*593dc095SDavid du Colombier ShiftBpp.use = bpp << 3; // == bpp * 8
2686*593dc095SDavid du Colombier ShiftRem.use = 64 - ShiftBpp.use;
2687*593dc095SDavid du Colombier _asm
2688*593dc095SDavid du Colombier {
2689*593dc095SDavid du Colombier mov ebx, diff
2690*593dc095SDavid du Colombier mov edi, row
2691*593dc095SDavid du Colombier mov esi, prev_row
2692*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2693*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
2694*593dc095SDavid du Colombier pxor mm0, mm0
2695*593dc095SDavid du Colombier dpth6lp:
2696*593dc095SDavid du Colombier // Must shift to position Raw(x-bpp) data
2697*593dc095SDavid du Colombier psrlq mm1, ShiftRem
2698*593dc095SDavid du Colombier // Do first set of 4 bytes
2699*593dc095SDavid du Colombier movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2700*593dc095SDavid du Colombier punpcklbw mm1, mm0 // Unpack Low bytes of a
2701*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
2702*593dc095SDavid du Colombier punpcklbw mm2, mm0 // Unpack Low bytes of b
2703*593dc095SDavid du Colombier // Must shift to position Prior(x-bpp) data
2704*593dc095SDavid du Colombier psrlq mm3, ShiftRem
2705*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2706*593dc095SDavid du Colombier movq mm4, mm2
2707*593dc095SDavid du Colombier punpcklbw mm3, mm0 // Unpack Low bytes of c
2708*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2709*593dc095SDavid du Colombier movq mm5, mm1
2710*593dc095SDavid du Colombier psubw mm4, mm3
2711*593dc095SDavid du Colombier pxor mm7, mm7
2712*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2713*593dc095SDavid du Colombier movq mm6, mm4
2714*593dc095SDavid du Colombier psubw mm5, mm3
2715*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2716*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2717*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2718*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2719*593dc095SDavid du Colombier paddw mm6, mm5
2720*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2721*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2722*593dc095SDavid du Colombier psubw mm4, mm0
2723*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2724*593dc095SDavid du Colombier psubw mm4, mm0
2725*593dc095SDavid du Colombier psubw mm5, mm7
2726*593dc095SDavid du Colombier pxor mm0, mm0
2727*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2728*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2729*593dc095SDavid du Colombier psubw mm5, mm7
2730*593dc095SDavid du Colombier psubw mm6, mm0
2731*593dc095SDavid du Colombier // test pa <= pb
2732*593dc095SDavid du Colombier movq mm7, mm4
2733*593dc095SDavid du Colombier psubw mm6, mm0
2734*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2735*593dc095SDavid du Colombier movq mm0, mm7
2736*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2737*593dc095SDavid du Colombier pand mm5, mm7
2738*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2739*593dc095SDavid du Colombier pand mm2, mm0
2740*593dc095SDavid du Colombier pandn mm7, mm4
2741*593dc095SDavid du Colombier pandn mm0, mm1
2742*593dc095SDavid du Colombier paddw mm7, mm5
2743*593dc095SDavid du Colombier paddw mm0, mm2
2744*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2745*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2746*593dc095SDavid du Colombier pxor mm1, mm1
2747*593dc095SDavid du Colombier pand mm3, mm7
2748*593dc095SDavid du Colombier pandn mm7, mm0
2749*593dc095SDavid du Colombier paddw mm7, mm3
2750*593dc095SDavid du Colombier pxor mm0, mm0
2751*593dc095SDavid du Colombier packuswb mm7, mm1
2752*593dc095SDavid du Colombier movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2753*593dc095SDavid du Colombier pand mm7, ActiveMask
2754*593dc095SDavid du Colombier psrlq mm3, ShiftRem
2755*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x) step 1
2756*593dc095SDavid du Colombier paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2757*593dc095SDavid du Colombier movq mm6, mm2
2758*593dc095SDavid du Colombier movq [edi + ebx], mm7 // write back updated value
2759*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
2760*593dc095SDavid du Colombier psllq mm6, ShiftBpp
2761*593dc095SDavid du Colombier movq mm5, mm7
2762*593dc095SDavid du Colombier psrlq mm1, ShiftRem
2763*593dc095SDavid du Colombier por mm3, mm6
2764*593dc095SDavid du Colombier psllq mm5, ShiftBpp
2765*593dc095SDavid du Colombier punpckhbw mm3, mm0 // Unpack High bytes of c
2766*593dc095SDavid du Colombier por mm1, mm5
2767*593dc095SDavid du Colombier // Do second set of 4 bytes
2768*593dc095SDavid du Colombier punpckhbw mm2, mm0 // Unpack High bytes of b
2769*593dc095SDavid du Colombier punpckhbw mm1, mm0 // Unpack High bytes of a
2770*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2771*593dc095SDavid du Colombier movq mm4, mm2
2772*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2773*593dc095SDavid du Colombier movq mm5, mm1
2774*593dc095SDavid du Colombier psubw mm4, mm3
2775*593dc095SDavid du Colombier pxor mm7, mm7
2776*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2777*593dc095SDavid du Colombier movq mm6, mm4
2778*593dc095SDavid du Colombier psubw mm5, mm3
2779*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2780*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2781*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2782*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2783*593dc095SDavid du Colombier paddw mm6, mm5
2784*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2785*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2786*593dc095SDavid du Colombier psubw mm4, mm0
2787*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2788*593dc095SDavid du Colombier psubw mm4, mm0
2789*593dc095SDavid du Colombier psubw mm5, mm7
2790*593dc095SDavid du Colombier pxor mm0, mm0
2791*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2792*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2793*593dc095SDavid du Colombier psubw mm5, mm7
2794*593dc095SDavid du Colombier psubw mm6, mm0
2795*593dc095SDavid du Colombier // test pa <= pb
2796*593dc095SDavid du Colombier movq mm7, mm4
2797*593dc095SDavid du Colombier psubw mm6, mm0
2798*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2799*593dc095SDavid du Colombier movq mm0, mm7
2800*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2801*593dc095SDavid du Colombier pand mm5, mm7
2802*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2803*593dc095SDavid du Colombier pand mm2, mm0
2804*593dc095SDavid du Colombier pandn mm7, mm4
2805*593dc095SDavid du Colombier pandn mm0, mm1
2806*593dc095SDavid du Colombier paddw mm7, mm5
2807*593dc095SDavid du Colombier paddw mm0, mm2
2808*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2809*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2810*593dc095SDavid du Colombier pxor mm1, mm1
2811*593dc095SDavid du Colombier pand mm3, mm7
2812*593dc095SDavid du Colombier pandn mm7, mm0
2813*593dc095SDavid du Colombier pxor mm1, mm1
2814*593dc095SDavid du Colombier paddw mm7, mm3
2815*593dc095SDavid du Colombier pxor mm0, mm0
2816*593dc095SDavid du Colombier // Step ebx to next set of 8 bytes and repeat loop til done
2817*593dc095SDavid du Colombier add ebx, 8
2818*593dc095SDavid du Colombier packuswb mm1, mm7
2819*593dc095SDavid du Colombier paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2820*593dc095SDavid du Colombier cmp ebx, MMXLength
2821*593dc095SDavid du Colombier movq [edi + ebx - 8], mm1 // write back updated value
2822*593dc095SDavid du Colombier // mm1 will be used as Raw(x-bpp) next loop
2823*593dc095SDavid du Colombier jb dpth6lp
2824*593dc095SDavid du Colombier } // end _asm block
2825*593dc095SDavid du Colombier }
2826*593dc095SDavid du Colombier break;
2827*593dc095SDavid du Colombier
2828*593dc095SDavid du Colombier case 4:
2829*593dc095SDavid du Colombier {
2830*593dc095SDavid du Colombier ActiveMask.use = 0x00000000ffffffff;
2831*593dc095SDavid du Colombier _asm {
2832*593dc095SDavid du Colombier mov ebx, diff
2833*593dc095SDavid du Colombier mov edi, row
2834*593dc095SDavid du Colombier mov esi, prev_row
2835*593dc095SDavid du Colombier pxor mm0, mm0
2836*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2837*593dc095SDavid du Colombier movq mm1, [edi+ebx-8] // Only time should need to read
2838*593dc095SDavid du Colombier // a=Raw(x-bpp) bytes
2839*593dc095SDavid du Colombier dpth4lp:
2840*593dc095SDavid du Colombier // Do first set of 4 bytes
2841*593dc095SDavid du Colombier movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2842*593dc095SDavid du Colombier punpckhbw mm1, mm0 // Unpack Low bytes of a
2843*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
2844*593dc095SDavid du Colombier punpcklbw mm2, mm0 // Unpack High bytes of b
2845*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2846*593dc095SDavid du Colombier movq mm4, mm2
2847*593dc095SDavid du Colombier punpckhbw mm3, mm0 // Unpack High bytes of c
2848*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2849*593dc095SDavid du Colombier movq mm5, mm1
2850*593dc095SDavid du Colombier psubw mm4, mm3
2851*593dc095SDavid du Colombier pxor mm7, mm7
2852*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2853*593dc095SDavid du Colombier movq mm6, mm4
2854*593dc095SDavid du Colombier psubw mm5, mm3
2855*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2856*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2857*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2858*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2859*593dc095SDavid du Colombier paddw mm6, mm5
2860*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2861*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2862*593dc095SDavid du Colombier psubw mm4, mm0
2863*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2864*593dc095SDavid du Colombier psubw mm4, mm0
2865*593dc095SDavid du Colombier psubw mm5, mm7
2866*593dc095SDavid du Colombier pxor mm0, mm0
2867*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2868*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2869*593dc095SDavid du Colombier psubw mm5, mm7
2870*593dc095SDavid du Colombier psubw mm6, mm0
2871*593dc095SDavid du Colombier // test pa <= pb
2872*593dc095SDavid du Colombier movq mm7, mm4
2873*593dc095SDavid du Colombier psubw mm6, mm0
2874*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2875*593dc095SDavid du Colombier movq mm0, mm7
2876*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2877*593dc095SDavid du Colombier pand mm5, mm7
2878*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2879*593dc095SDavid du Colombier pand mm2, mm0
2880*593dc095SDavid du Colombier pandn mm7, mm4
2881*593dc095SDavid du Colombier pandn mm0, mm1
2882*593dc095SDavid du Colombier paddw mm7, mm5
2883*593dc095SDavid du Colombier paddw mm0, mm2
2884*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2885*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2886*593dc095SDavid du Colombier pxor mm1, mm1
2887*593dc095SDavid du Colombier pand mm3, mm7
2888*593dc095SDavid du Colombier pandn mm7, mm0
2889*593dc095SDavid du Colombier paddw mm7, mm3
2890*593dc095SDavid du Colombier pxor mm0, mm0
2891*593dc095SDavid du Colombier packuswb mm7, mm1
2892*593dc095SDavid du Colombier movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2893*593dc095SDavid du Colombier pand mm7, ActiveMask
2894*593dc095SDavid du Colombier movq mm2, mm3 // load b=Prior(x) step 1
2895*593dc095SDavid du Colombier paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2896*593dc095SDavid du Colombier punpcklbw mm3, mm0 // Unpack High bytes of c
2897*593dc095SDavid du Colombier movq [edi + ebx], mm7 // write back updated value
2898*593dc095SDavid du Colombier movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2899*593dc095SDavid du Colombier // Do second set of 4 bytes
2900*593dc095SDavid du Colombier punpckhbw mm2, mm0 // Unpack Low bytes of b
2901*593dc095SDavid du Colombier punpcklbw mm1, mm0 // Unpack Low bytes of a
2902*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2903*593dc095SDavid du Colombier movq mm4, mm2
2904*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2905*593dc095SDavid du Colombier movq mm5, mm1
2906*593dc095SDavid du Colombier psubw mm4, mm3
2907*593dc095SDavid du Colombier pxor mm7, mm7
2908*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2909*593dc095SDavid du Colombier movq mm6, mm4
2910*593dc095SDavid du Colombier psubw mm5, mm3
2911*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2912*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2913*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2914*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2915*593dc095SDavid du Colombier paddw mm6, mm5
2916*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2917*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2918*593dc095SDavid du Colombier psubw mm4, mm0
2919*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2920*593dc095SDavid du Colombier psubw mm4, mm0
2921*593dc095SDavid du Colombier psubw mm5, mm7
2922*593dc095SDavid du Colombier pxor mm0, mm0
2923*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2924*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
2925*593dc095SDavid du Colombier psubw mm5, mm7
2926*593dc095SDavid du Colombier psubw mm6, mm0
2927*593dc095SDavid du Colombier // test pa <= pb
2928*593dc095SDavid du Colombier movq mm7, mm4
2929*593dc095SDavid du Colombier psubw mm6, mm0
2930*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
2931*593dc095SDavid du Colombier movq mm0, mm7
2932*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
2933*593dc095SDavid du Colombier pand mm5, mm7
2934*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
2935*593dc095SDavid du Colombier pand mm2, mm0
2936*593dc095SDavid du Colombier pandn mm7, mm4
2937*593dc095SDavid du Colombier pandn mm0, mm1
2938*593dc095SDavid du Colombier paddw mm7, mm5
2939*593dc095SDavid du Colombier paddw mm0, mm2
2940*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
2941*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
2942*593dc095SDavid du Colombier pxor mm1, mm1
2943*593dc095SDavid du Colombier pand mm3, mm7
2944*593dc095SDavid du Colombier pandn mm7, mm0
2945*593dc095SDavid du Colombier pxor mm1, mm1
2946*593dc095SDavid du Colombier paddw mm7, mm3
2947*593dc095SDavid du Colombier pxor mm0, mm0
2948*593dc095SDavid du Colombier // Step ebx to next set of 8 bytes and repeat loop until done
2949*593dc095SDavid du Colombier add ebx, 8
2950*593dc095SDavid du Colombier packuswb mm1, mm7
2951*593dc095SDavid du Colombier paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2952*593dc095SDavid du Colombier cmp ebx, MMXLength
2953*593dc095SDavid du Colombier movq [edi + ebx - 8], mm1 // write back updated value
2954*593dc095SDavid du Colombier // mm1 will be used as Raw(x-bpp) next loop
2955*593dc095SDavid du Colombier jb dpth4lp
2956*593dc095SDavid du Colombier } // end _asm block
2957*593dc095SDavid du Colombier }
2958*593dc095SDavid du Colombier break;
2959*593dc095SDavid du Colombier case 8: // bpp == 8
2960*593dc095SDavid du Colombier {
2961*593dc095SDavid du Colombier ActiveMask.use = 0x00000000ffffffff;
2962*593dc095SDavid du Colombier _asm {
2963*593dc095SDavid du Colombier mov ebx, diff
2964*593dc095SDavid du Colombier mov edi, row
2965*593dc095SDavid du Colombier mov esi, prev_row
2966*593dc095SDavid du Colombier pxor mm0, mm0
2967*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
2968*593dc095SDavid du Colombier movq mm1, [edi+ebx-8] // Only time should need to read
2969*593dc095SDavid du Colombier // a=Raw(x-bpp) bytes
2970*593dc095SDavid du Colombier dpth8lp:
2971*593dc095SDavid du Colombier // Do first set of 4 bytes
2972*593dc095SDavid du Colombier movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2973*593dc095SDavid du Colombier punpcklbw mm1, mm0 // Unpack Low bytes of a
2974*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
2975*593dc095SDavid du Colombier punpcklbw mm2, mm0 // Unpack Low bytes of b
2976*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
2977*593dc095SDavid du Colombier movq mm4, mm2
2978*593dc095SDavid du Colombier punpcklbw mm3, mm0 // Unpack Low bytes of c
2979*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
2980*593dc095SDavid du Colombier movq mm5, mm1
2981*593dc095SDavid du Colombier psubw mm4, mm3
2982*593dc095SDavid du Colombier pxor mm7, mm7
2983*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2984*593dc095SDavid du Colombier movq mm6, mm4
2985*593dc095SDavid du Colombier psubw mm5, mm3
2986*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
2987*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
2988*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
2989*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2990*593dc095SDavid du Colombier paddw mm6, mm5
2991*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
2992*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2993*593dc095SDavid du Colombier psubw mm4, mm0
2994*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
2995*593dc095SDavid du Colombier psubw mm4, mm0
2996*593dc095SDavid du Colombier psubw mm5, mm7
2997*593dc095SDavid du Colombier pxor mm0, mm0
2998*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2999*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
3000*593dc095SDavid du Colombier psubw mm5, mm7
3001*593dc095SDavid du Colombier psubw mm6, mm0
3002*593dc095SDavid du Colombier // test pa <= pb
3003*593dc095SDavid du Colombier movq mm7, mm4
3004*593dc095SDavid du Colombier psubw mm6, mm0
3005*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
3006*593dc095SDavid du Colombier movq mm0, mm7
3007*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
3008*593dc095SDavid du Colombier pand mm5, mm7
3009*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
3010*593dc095SDavid du Colombier pand mm2, mm0
3011*593dc095SDavid du Colombier pandn mm7, mm4
3012*593dc095SDavid du Colombier pandn mm0, mm1
3013*593dc095SDavid du Colombier paddw mm7, mm5
3014*593dc095SDavid du Colombier paddw mm0, mm2
3015*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
3016*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
3017*593dc095SDavid du Colombier pxor mm1, mm1
3018*593dc095SDavid du Colombier pand mm3, mm7
3019*593dc095SDavid du Colombier pandn mm7, mm0
3020*593dc095SDavid du Colombier paddw mm7, mm3
3021*593dc095SDavid du Colombier pxor mm0, mm0
3022*593dc095SDavid du Colombier packuswb mm7, mm1
3023*593dc095SDavid du Colombier movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3024*593dc095SDavid du Colombier pand mm7, ActiveMask
3025*593dc095SDavid du Colombier movq mm2, [esi + ebx] // load b=Prior(x)
3026*593dc095SDavid du Colombier paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3027*593dc095SDavid du Colombier punpckhbw mm3, mm0 // Unpack High bytes of c
3028*593dc095SDavid du Colombier movq [edi + ebx], mm7 // write back updated value
3029*593dc095SDavid du Colombier movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3030*593dc095SDavid du Colombier
3031*593dc095SDavid du Colombier // Do second set of 4 bytes
3032*593dc095SDavid du Colombier punpckhbw mm2, mm0 // Unpack High bytes of b
3033*593dc095SDavid du Colombier punpckhbw mm1, mm0 // Unpack High bytes of a
3034*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
3035*593dc095SDavid du Colombier movq mm4, mm2
3036*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
3037*593dc095SDavid du Colombier movq mm5, mm1
3038*593dc095SDavid du Colombier psubw mm4, mm3
3039*593dc095SDavid du Colombier pxor mm7, mm7
3040*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3041*593dc095SDavid du Colombier movq mm6, mm4
3042*593dc095SDavid du Colombier psubw mm5, mm3
3043*593dc095SDavid du Colombier // pa = abs(p-a) = abs(pav)
3044*593dc095SDavid du Colombier // pb = abs(p-b) = abs(pbv)
3045*593dc095SDavid du Colombier // pc = abs(p-c) = abs(pcv)
3046*593dc095SDavid du Colombier pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3047*593dc095SDavid du Colombier paddw mm6, mm5
3048*593dc095SDavid du Colombier pand mm0, mm4 // Only pav bytes < 0 in mm7
3049*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3050*593dc095SDavid du Colombier psubw mm4, mm0
3051*593dc095SDavid du Colombier pand mm7, mm5 // Only pbv bytes < 0 in mm0
3052*593dc095SDavid du Colombier psubw mm4, mm0
3053*593dc095SDavid du Colombier psubw mm5, mm7
3054*593dc095SDavid du Colombier pxor mm0, mm0
3055*593dc095SDavid du Colombier pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3056*593dc095SDavid du Colombier pand mm0, mm6 // Only pav bytes < 0 in mm7
3057*593dc095SDavid du Colombier psubw mm5, mm7
3058*593dc095SDavid du Colombier psubw mm6, mm0
3059*593dc095SDavid du Colombier // test pa <= pb
3060*593dc095SDavid du Colombier movq mm7, mm4
3061*593dc095SDavid du Colombier psubw mm6, mm0
3062*593dc095SDavid du Colombier pcmpgtw mm7, mm5 // pa > pb?
3063*593dc095SDavid du Colombier movq mm0, mm7
3064*593dc095SDavid du Colombier // use mm7 mask to merge pa & pb
3065*593dc095SDavid du Colombier pand mm5, mm7
3066*593dc095SDavid du Colombier // use mm0 mask copy to merge a & b
3067*593dc095SDavid du Colombier pand mm2, mm0
3068*593dc095SDavid du Colombier pandn mm7, mm4
3069*593dc095SDavid du Colombier pandn mm0, mm1
3070*593dc095SDavid du Colombier paddw mm7, mm5
3071*593dc095SDavid du Colombier paddw mm0, mm2
3072*593dc095SDavid du Colombier // test ((pa <= pb)? pa:pb) <= pc
3073*593dc095SDavid du Colombier pcmpgtw mm7, mm6 // pab > pc?
3074*593dc095SDavid du Colombier pxor mm1, mm1
3075*593dc095SDavid du Colombier pand mm3, mm7
3076*593dc095SDavid du Colombier pandn mm7, mm0
3077*593dc095SDavid du Colombier pxor mm1, mm1
3078*593dc095SDavid du Colombier paddw mm7, mm3
3079*593dc095SDavid du Colombier pxor mm0, mm0
3080*593dc095SDavid du Colombier // Step ebx to next set of 8 bytes and repeat loop until done
3081*593dc095SDavid du Colombier add ebx, 8
3082*593dc095SDavid du Colombier packuswb mm1, mm7
3083*593dc095SDavid du Colombier paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3084*593dc095SDavid du Colombier cmp ebx, MMXLength
3085*593dc095SDavid du Colombier movq [edi + ebx - 8], mm1 // write back updated value
3086*593dc095SDavid du Colombier // mm1 will be used as Raw(x-bpp) next loop
3087*593dc095SDavid du Colombier jb dpth8lp
3088*593dc095SDavid du Colombier } // end _asm block
3089*593dc095SDavid du Colombier }
3090*593dc095SDavid du Colombier break;
3091*593dc095SDavid du Colombier
3092*593dc095SDavid du Colombier case 1: // bpp = 1
3093*593dc095SDavid du Colombier case 2: // bpp = 2
3094*593dc095SDavid du Colombier default: // bpp > 8
3095*593dc095SDavid du Colombier {
3096*593dc095SDavid du Colombier _asm {
3097*593dc095SDavid du Colombier mov ebx, diff
3098*593dc095SDavid du Colombier cmp ebx, FullLength
3099*593dc095SDavid du Colombier jnb dpthdend
3100*593dc095SDavid du Colombier mov edi, row
3101*593dc095SDavid du Colombier mov esi, prev_row
3102*593dc095SDavid du Colombier // Do Paeth decode for remaining bytes
3103*593dc095SDavid du Colombier mov edx, ebx
3104*593dc095SDavid du Colombier xor ecx, ecx // zero ecx before using cl & cx in loop below
3105*593dc095SDavid du Colombier sub edx, bpp // Set edx = ebx - bpp
3106*593dc095SDavid du Colombier dpthdlp:
3107*593dc095SDavid du Colombier xor eax, eax
3108*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
3109*593dc095SDavid du Colombier mov al, [esi + ebx] // load Prior(x) into al
3110*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3111*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
3112*593dc095SDavid du Colombier mov patemp, eax // Save pav for later use
3113*593dc095SDavid du Colombier xor eax, eax
3114*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
3115*593dc095SDavid du Colombier mov al, [edi + edx] // load Raw(x-bpp) into al
3116*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
3117*593dc095SDavid du Colombier mov ecx, eax
3118*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3119*593dc095SDavid du Colombier add eax, patemp // pcv = pav + pbv
3120*593dc095SDavid du Colombier // pc = abs(pcv)
3121*593dc095SDavid du Colombier test eax, 0x80000000
3122*593dc095SDavid du Colombier jz dpthdpca
3123*593dc095SDavid du Colombier neg eax // reverse sign of neg values
3124*593dc095SDavid du Colombier dpthdpca:
3125*593dc095SDavid du Colombier mov pctemp, eax // save pc for later use
3126*593dc095SDavid du Colombier // pb = abs(pbv)
3127*593dc095SDavid du Colombier test ecx, 0x80000000
3128*593dc095SDavid du Colombier jz dpthdpba
3129*593dc095SDavid du Colombier neg ecx // reverse sign of neg values
3130*593dc095SDavid du Colombier dpthdpba:
3131*593dc095SDavid du Colombier mov pbtemp, ecx // save pb for later use
3132*593dc095SDavid du Colombier // pa = abs(pav)
3133*593dc095SDavid du Colombier mov eax, patemp
3134*593dc095SDavid du Colombier test eax, 0x80000000
3135*593dc095SDavid du Colombier jz dpthdpaa
3136*593dc095SDavid du Colombier neg eax // reverse sign of neg values
3137*593dc095SDavid du Colombier dpthdpaa:
3138*593dc095SDavid du Colombier mov patemp, eax // save pa for later use
3139*593dc095SDavid du Colombier // test if pa <= pb
3140*593dc095SDavid du Colombier cmp eax, ecx
3141*593dc095SDavid du Colombier jna dpthdabb
3142*593dc095SDavid du Colombier // pa > pb; now test if pb <= pc
3143*593dc095SDavid du Colombier cmp ecx, pctemp
3144*593dc095SDavid du Colombier jna dpthdbbc
3145*593dc095SDavid du Colombier // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3146*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3147*593dc095SDavid du Colombier jmp dpthdpaeth
3148*593dc095SDavid du Colombier dpthdbbc:
3149*593dc095SDavid du Colombier // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3150*593dc095SDavid du Colombier mov cl, [esi + ebx] // load Prior(x) into cl
3151*593dc095SDavid du Colombier jmp dpthdpaeth
3152*593dc095SDavid du Colombier dpthdabb:
3153*593dc095SDavid du Colombier // pa <= pb; now test if pa <= pc
3154*593dc095SDavid du Colombier cmp eax, pctemp
3155*593dc095SDavid du Colombier jna dpthdabc
3156*593dc095SDavid du Colombier // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3157*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3158*593dc095SDavid du Colombier jmp dpthdpaeth
3159*593dc095SDavid du Colombier dpthdabc:
3160*593dc095SDavid du Colombier // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3161*593dc095SDavid du Colombier mov cl, [edi + edx] // load Raw(x-bpp) into cl
3162*593dc095SDavid du Colombier dpthdpaeth:
3163*593dc095SDavid du Colombier inc ebx
3164*593dc095SDavid du Colombier inc edx
3165*593dc095SDavid du Colombier // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3166*593dc095SDavid du Colombier add [edi + ebx - 1], cl
3167*593dc095SDavid du Colombier cmp ebx, FullLength
3168*593dc095SDavid du Colombier jb dpthdlp
3169*593dc095SDavid du Colombier dpthdend:
3170*593dc095SDavid du Colombier } // end _asm block
3171*593dc095SDavid du Colombier }
3172*593dc095SDavid du Colombier return; // No need to go further with this one
3173*593dc095SDavid du Colombier } // end switch ( bpp )
3174*593dc095SDavid du Colombier _asm
3175*593dc095SDavid du Colombier {
3176*593dc095SDavid du Colombier // MMX acceleration complete now do clean-up
3177*593dc095SDavid du Colombier // Check if any remaining bytes left to decode
3178*593dc095SDavid du Colombier mov ebx, MMXLength
3179*593dc095SDavid du Colombier cmp ebx, FullLength
3180*593dc095SDavid du Colombier jnb dpthend
3181*593dc095SDavid du Colombier mov edi, row
3182*593dc095SDavid du Colombier mov esi, prev_row
3183*593dc095SDavid du Colombier // Do Paeth decode for remaining bytes
3184*593dc095SDavid du Colombier mov edx, ebx
3185*593dc095SDavid du Colombier xor ecx, ecx // zero ecx before using cl & cx in loop below
3186*593dc095SDavid du Colombier sub edx, bpp // Set edx = ebx - bpp
3187*593dc095SDavid du Colombier dpthlp2:
3188*593dc095SDavid du Colombier xor eax, eax
3189*593dc095SDavid du Colombier // pav = p - a = (a + b - c) - a = b - c
3190*593dc095SDavid du Colombier mov al, [esi + ebx] // load Prior(x) into al
3191*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3192*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
3193*593dc095SDavid du Colombier mov patemp, eax // Save pav for later use
3194*593dc095SDavid du Colombier xor eax, eax
3195*593dc095SDavid du Colombier // pbv = p - b = (a + b - c) - b = a - c
3196*593dc095SDavid du Colombier mov al, [edi + edx] // load Raw(x-bpp) into al
3197*593dc095SDavid du Colombier sub eax, ecx // subtract Prior(x-bpp)
3198*593dc095SDavid du Colombier mov ecx, eax
3199*593dc095SDavid du Colombier // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3200*593dc095SDavid du Colombier add eax, patemp // pcv = pav + pbv
3201*593dc095SDavid du Colombier // pc = abs(pcv)
3202*593dc095SDavid du Colombier test eax, 0x80000000
3203*593dc095SDavid du Colombier jz dpthpca2
3204*593dc095SDavid du Colombier neg eax // reverse sign of neg values
3205*593dc095SDavid du Colombier dpthpca2:
3206*593dc095SDavid du Colombier mov pctemp, eax // save pc for later use
3207*593dc095SDavid du Colombier // pb = abs(pbv)
3208*593dc095SDavid du Colombier test ecx, 0x80000000
3209*593dc095SDavid du Colombier jz dpthpba2
3210*593dc095SDavid du Colombier neg ecx // reverse sign of neg values
3211*593dc095SDavid du Colombier dpthpba2:
3212*593dc095SDavid du Colombier mov pbtemp, ecx // save pb for later use
3213*593dc095SDavid du Colombier // pa = abs(pav)
3214*593dc095SDavid du Colombier mov eax, patemp
3215*593dc095SDavid du Colombier test eax, 0x80000000
3216*593dc095SDavid du Colombier jz dpthpaa2
3217*593dc095SDavid du Colombier neg eax // reverse sign of neg values
3218*593dc095SDavid du Colombier dpthpaa2:
3219*593dc095SDavid du Colombier mov patemp, eax // save pa for later use
3220*593dc095SDavid du Colombier // test if pa <= pb
3221*593dc095SDavid du Colombier cmp eax, ecx
3222*593dc095SDavid du Colombier jna dpthabb2
3223*593dc095SDavid du Colombier // pa > pb; now test if pb <= pc
3224*593dc095SDavid du Colombier cmp ecx, pctemp
3225*593dc095SDavid du Colombier jna dpthbbc2
3226*593dc095SDavid du Colombier // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3227*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3228*593dc095SDavid du Colombier jmp dpthpaeth2
3229*593dc095SDavid du Colombier dpthbbc2:
3230*593dc095SDavid du Colombier // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3231*593dc095SDavid du Colombier mov cl, [esi + ebx] // load Prior(x) into cl
3232*593dc095SDavid du Colombier jmp dpthpaeth2
3233*593dc095SDavid du Colombier dpthabb2:
3234*593dc095SDavid du Colombier // pa <= pb; now test if pa <= pc
3235*593dc095SDavid du Colombier cmp eax, pctemp
3236*593dc095SDavid du Colombier jna dpthabc2
3237*593dc095SDavid du Colombier // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3238*593dc095SDavid du Colombier mov cl, [esi + edx] // load Prior(x-bpp) into cl
3239*593dc095SDavid du Colombier jmp dpthpaeth2
3240*593dc095SDavid du Colombier dpthabc2:
3241*593dc095SDavid du Colombier // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3242*593dc095SDavid du Colombier mov cl, [edi + edx] // load Raw(x-bpp) into cl
3243*593dc095SDavid du Colombier dpthpaeth2:
3244*593dc095SDavid du Colombier inc ebx
3245*593dc095SDavid du Colombier inc edx
3246*593dc095SDavid du Colombier // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3247*593dc095SDavid du Colombier add [edi + ebx - 1], cl
3248*593dc095SDavid du Colombier cmp ebx, FullLength
3249*593dc095SDavid du Colombier jb dpthlp2
3250*593dc095SDavid du Colombier dpthend:
3251*593dc095SDavid du Colombier emms // End MMX instructions; prep for possible FP instrs.
3252*593dc095SDavid du Colombier } // end _asm block
3253*593dc095SDavid du Colombier }
3254*593dc095SDavid du Colombier
3255*593dc095SDavid du Colombier // Optimized code for PNG Sub filter decoder
3256*593dc095SDavid du Colombier void /* PRIVATE */
3257*593dc095SDavid du Colombier png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3258*593dc095SDavid du Colombier {
3259*593dc095SDavid du Colombier //int test;
3260*593dc095SDavid du Colombier int bpp;
3261*593dc095SDavid du Colombier png_uint_32 FullLength;
3262*593dc095SDavid du Colombier png_uint_32 MMXLength;
3263*593dc095SDavid du Colombier int diff;
3264*593dc095SDavid du Colombier
3265*593dc095SDavid du Colombier bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3266*593dc095SDavid du Colombier FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3267*593dc095SDavid du Colombier _asm {
3268*593dc095SDavid du Colombier mov edi, row
3269*593dc095SDavid du Colombier mov esi, edi // lp = row
3270*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3271*593dc095SDavid du Colombier xor eax, eax
3272*593dc095SDavid du Colombier // get # of bytes to alignment
3273*593dc095SDavid du Colombier mov diff, edi // take start of row
3274*593dc095SDavid du Colombier add diff, 0xf // add 7 + 8 to incr past
3275*593dc095SDavid du Colombier // alignment boundary
3276*593dc095SDavid du Colombier xor ebx, ebx
3277*593dc095SDavid du Colombier and diff, 0xfffffff8 // mask to alignment boundary
3278*593dc095SDavid du Colombier sub diff, edi // subtract from start ==> value
3279*593dc095SDavid du Colombier // ebx at alignment
3280*593dc095SDavid du Colombier jz dsubgo
3281*593dc095SDavid du Colombier // fix alignment
3282*593dc095SDavid du Colombier dsublp1:
3283*593dc095SDavid du Colombier mov al, [esi+ebx]
3284*593dc095SDavid du Colombier add [edi+ebx], al
3285*593dc095SDavid du Colombier inc ebx
3286*593dc095SDavid du Colombier cmp ebx, diff
3287*593dc095SDavid du Colombier jb dsublp1
3288*593dc095SDavid du Colombier dsubgo:
3289*593dc095SDavid du Colombier mov ecx, FullLength
3290*593dc095SDavid du Colombier mov edx, ecx
3291*593dc095SDavid du Colombier sub edx, ebx // subtract alignment fix
3292*593dc095SDavid du Colombier and edx, 0x00000007 // calc bytes over mult of 8
3293*593dc095SDavid du Colombier sub ecx, edx // drop over bytes from length
3294*593dc095SDavid du Colombier mov MMXLength, ecx
3295*593dc095SDavid du Colombier } // end _asm block
3296*593dc095SDavid du Colombier
3297*593dc095SDavid du Colombier // Now do the math for the rest of the row
3298*593dc095SDavid du Colombier switch ( bpp )
3299*593dc095SDavid du Colombier {
3300*593dc095SDavid du Colombier case 3:
3301*593dc095SDavid du Colombier {
3302*593dc095SDavid du Colombier ActiveMask.use = 0x0000ffffff000000;
3303*593dc095SDavid du Colombier ShiftBpp.use = 24; // == 3 * 8
3304*593dc095SDavid du Colombier ShiftRem.use = 40; // == 64 - 24
3305*593dc095SDavid du Colombier _asm {
3306*593dc095SDavid du Colombier mov edi, row
3307*593dc095SDavid du Colombier movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3308*593dc095SDavid du Colombier mov esi, edi // lp = row
3309*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3310*593dc095SDavid du Colombier movq mm6, mm7
3311*593dc095SDavid du Colombier mov ebx, diff
3312*593dc095SDavid du Colombier psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3313*593dc095SDavid du Colombier // byte group
3314*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
3315*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
3316*593dc095SDavid du Colombier dsub3lp:
3317*593dc095SDavid du Colombier psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3318*593dc095SDavid du Colombier // no need for mask; shift clears inactive bytes
3319*593dc095SDavid du Colombier // Add 1st active group
3320*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3321*593dc095SDavid du Colombier paddb mm0, mm1
3322*593dc095SDavid du Colombier // Add 2nd active group
3323*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3324*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3325*593dc095SDavid du Colombier pand mm1, mm7 // mask to use only 2nd active group
3326*593dc095SDavid du Colombier paddb mm0, mm1
3327*593dc095SDavid du Colombier // Add 3rd active group
3328*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3329*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3330*593dc095SDavid du Colombier pand mm1, mm6 // mask to use only 3rd active group
3331*593dc095SDavid du Colombier add ebx, 8
3332*593dc095SDavid du Colombier paddb mm0, mm1
3333*593dc095SDavid du Colombier cmp ebx, MMXLength
3334*593dc095SDavid du Colombier movq [edi+ebx-8], mm0 // Write updated Raws back to array
3335*593dc095SDavid du Colombier // Prep for doing 1st add at top of loop
3336*593dc095SDavid du Colombier movq mm1, mm0
3337*593dc095SDavid du Colombier jb dsub3lp
3338*593dc095SDavid du Colombier } // end _asm block
3339*593dc095SDavid du Colombier }
3340*593dc095SDavid du Colombier break;
3341*593dc095SDavid du Colombier
3342*593dc095SDavid du Colombier case 1:
3343*593dc095SDavid du Colombier {
3344*593dc095SDavid du Colombier // Placed here just in case this is a duplicate of the
3345*593dc095SDavid du Colombier // non-MMX code for the SUB filter in png_read_filter_row below
3346*593dc095SDavid du Colombier //
3347*593dc095SDavid du Colombier // png_bytep rp;
3348*593dc095SDavid du Colombier // png_bytep lp;
3349*593dc095SDavid du Colombier // png_uint_32 i;
3350*593dc095SDavid du Colombier // bpp = (row_info->pixel_depth + 7) >> 3;
3351*593dc095SDavid du Colombier // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3352*593dc095SDavid du Colombier // i < row_info->rowbytes; i++, rp++, lp++)
3353*593dc095SDavid du Colombier // {
3354*593dc095SDavid du Colombier // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3355*593dc095SDavid du Colombier // }
3356*593dc095SDavid du Colombier _asm {
3357*593dc095SDavid du Colombier mov ebx, diff
3358*593dc095SDavid du Colombier mov edi, row
3359*593dc095SDavid du Colombier cmp ebx, FullLength
3360*593dc095SDavid du Colombier jnb dsub1end
3361*593dc095SDavid du Colombier mov esi, edi // lp = row
3362*593dc095SDavid du Colombier xor eax, eax
3363*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3364*593dc095SDavid du Colombier dsub1lp:
3365*593dc095SDavid du Colombier mov al, [esi+ebx]
3366*593dc095SDavid du Colombier add [edi+ebx], al
3367*593dc095SDavid du Colombier inc ebx
3368*593dc095SDavid du Colombier cmp ebx, FullLength
3369*593dc095SDavid du Colombier jb dsub1lp
3370*593dc095SDavid du Colombier dsub1end:
3371*593dc095SDavid du Colombier } // end _asm block
3372*593dc095SDavid du Colombier }
3373*593dc095SDavid du Colombier return;
3374*593dc095SDavid du Colombier
3375*593dc095SDavid du Colombier case 6:
3376*593dc095SDavid du Colombier case 7:
3377*593dc095SDavid du Colombier case 4:
3378*593dc095SDavid du Colombier case 5:
3379*593dc095SDavid du Colombier {
3380*593dc095SDavid du Colombier ShiftBpp.use = bpp << 3;
3381*593dc095SDavid du Colombier ShiftRem.use = 64 - ShiftBpp.use;
3382*593dc095SDavid du Colombier _asm {
3383*593dc095SDavid du Colombier mov edi, row
3384*593dc095SDavid du Colombier mov ebx, diff
3385*593dc095SDavid du Colombier mov esi, edi // lp = row
3386*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3387*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
3388*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
3389*593dc095SDavid du Colombier dsub4lp:
3390*593dc095SDavid du Colombier psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3391*593dc095SDavid du Colombier // no need for mask; shift clears inactive bytes
3392*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3393*593dc095SDavid du Colombier paddb mm0, mm1
3394*593dc095SDavid du Colombier // Add 2nd active group
3395*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3396*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3397*593dc095SDavid du Colombier // there is no need for any mask
3398*593dc095SDavid du Colombier // since shift clears inactive bits/bytes
3399*593dc095SDavid du Colombier add ebx, 8
3400*593dc095SDavid du Colombier paddb mm0, mm1
3401*593dc095SDavid du Colombier cmp ebx, MMXLength
3402*593dc095SDavid du Colombier movq [edi+ebx-8], mm0
3403*593dc095SDavid du Colombier movq mm1, mm0 // Prep for doing 1st add at top of loop
3404*593dc095SDavid du Colombier jb dsub4lp
3405*593dc095SDavid du Colombier } // end _asm block
3406*593dc095SDavid du Colombier }
3407*593dc095SDavid du Colombier break;
3408*593dc095SDavid du Colombier
3409*593dc095SDavid du Colombier case 2:
3410*593dc095SDavid du Colombier {
3411*593dc095SDavid du Colombier ActiveMask.use = 0x00000000ffff0000;
3412*593dc095SDavid du Colombier ShiftBpp.use = 16; // == 2 * 8
3413*593dc095SDavid du Colombier ShiftRem.use = 48; // == 64 - 16
3414*593dc095SDavid du Colombier _asm {
3415*593dc095SDavid du Colombier movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3416*593dc095SDavid du Colombier mov ebx, diff
3417*593dc095SDavid du Colombier movq mm6, mm7
3418*593dc095SDavid du Colombier mov edi, row
3419*593dc095SDavid du Colombier psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3420*593dc095SDavid du Colombier // byte group
3421*593dc095SDavid du Colombier mov esi, edi // lp = row
3422*593dc095SDavid du Colombier movq mm5, mm6
3423*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3424*593dc095SDavid du Colombier psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3425*593dc095SDavid du Colombier // byte group
3426*593dc095SDavid du Colombier // PRIME the pump (load the first Raw(x-bpp) data set)
3427*593dc095SDavid du Colombier movq mm1, [edi+ebx-8]
3428*593dc095SDavid du Colombier dsub2lp:
3429*593dc095SDavid du Colombier // Add 1st active group
3430*593dc095SDavid du Colombier psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3431*593dc095SDavid du Colombier // no need for mask; shift clears inactive
3432*593dc095SDavid du Colombier // bytes
3433*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3434*593dc095SDavid du Colombier paddb mm0, mm1
3435*593dc095SDavid du Colombier // Add 2nd active group
3436*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3437*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3438*593dc095SDavid du Colombier pand mm1, mm7 // mask to use only 2nd active group
3439*593dc095SDavid du Colombier paddb mm0, mm1
3440*593dc095SDavid du Colombier // Add 3rd active group
3441*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3442*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3443*593dc095SDavid du Colombier pand mm1, mm6 // mask to use only 3rd active group
3444*593dc095SDavid du Colombier paddb mm0, mm1
3445*593dc095SDavid du Colombier // Add 4th active group
3446*593dc095SDavid du Colombier movq mm1, mm0 // mov updated Raws to mm1
3447*593dc095SDavid du Colombier psllq mm1, ShiftBpp // shift data to position correctly
3448*593dc095SDavid du Colombier pand mm1, mm5 // mask to use only 4th active group
3449*593dc095SDavid du Colombier add ebx, 8
3450*593dc095SDavid du Colombier paddb mm0, mm1
3451*593dc095SDavid du Colombier cmp ebx, MMXLength
3452*593dc095SDavid du Colombier movq [edi+ebx-8], mm0 // Write updated Raws back to array
3453*593dc095SDavid du Colombier movq mm1, mm0 // Prep for doing 1st add at top of loop
3454*593dc095SDavid du Colombier jb dsub2lp
3455*593dc095SDavid du Colombier } // end _asm block
3456*593dc095SDavid du Colombier }
3457*593dc095SDavid du Colombier break;
3458*593dc095SDavid du Colombier case 8:
3459*593dc095SDavid du Colombier {
3460*593dc095SDavid du Colombier _asm {
3461*593dc095SDavid du Colombier mov edi, row
3462*593dc095SDavid du Colombier mov ebx, diff
3463*593dc095SDavid du Colombier mov esi, edi // lp = row
3464*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3465*593dc095SDavid du Colombier mov ecx, MMXLength
3466*593dc095SDavid du Colombier movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3467*593dc095SDavid du Colombier // Raw(x-bpp) data set
3468*593dc095SDavid du Colombier and ecx, 0x0000003f // calc bytes over mult of 64
3469*593dc095SDavid du Colombier dsub8lp:
3470*593dc095SDavid du Colombier movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3471*593dc095SDavid du Colombier paddb mm0, mm7
3472*593dc095SDavid du Colombier movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3473*593dc095SDavid du Colombier movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3474*593dc095SDavid du Colombier // Now mm0 will be used as Raw(x-bpp) for
3475*593dc095SDavid du Colombier // the 2nd group of 8 bytes. This will be
3476*593dc095SDavid du Colombier // repeated for each group of 8 bytes with
3477*593dc095SDavid du Colombier // the 8th group being used as the Raw(x-bpp)
3478*593dc095SDavid du Colombier // for the 1st group of the next loop.
3479*593dc095SDavid du Colombier paddb mm1, mm0
3480*593dc095SDavid du Colombier movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3481*593dc095SDavid du Colombier movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3482*593dc095SDavid du Colombier paddb mm2, mm1
3483*593dc095SDavid du Colombier movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3484*593dc095SDavid du Colombier movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3485*593dc095SDavid du Colombier paddb mm3, mm2
3486*593dc095SDavid du Colombier movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3487*593dc095SDavid du Colombier movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3488*593dc095SDavid du Colombier paddb mm4, mm3
3489*593dc095SDavid du Colombier movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3490*593dc095SDavid du Colombier movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3491*593dc095SDavid du Colombier paddb mm5, mm4
3492*593dc095SDavid du Colombier movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3493*593dc095SDavid du Colombier movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3494*593dc095SDavid du Colombier paddb mm6, mm5
3495*593dc095SDavid du Colombier movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3496*593dc095SDavid du Colombier movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3497*593dc095SDavid du Colombier add ebx, 64
3498*593dc095SDavid du Colombier paddb mm7, mm6
3499*593dc095SDavid du Colombier cmp ebx, ecx
3500*593dc095SDavid du Colombier movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3501*593dc095SDavid du Colombier jb dsub8lp
3502*593dc095SDavid du Colombier cmp ebx, MMXLength
3503*593dc095SDavid du Colombier jnb dsub8lt8
3504*593dc095SDavid du Colombier dsub8lpA:
3505*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3506*593dc095SDavid du Colombier add ebx, 8
3507*593dc095SDavid du Colombier paddb mm0, mm7
3508*593dc095SDavid du Colombier cmp ebx, MMXLength
3509*593dc095SDavid du Colombier movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3510*593dc095SDavid du Colombier movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
3511*593dc095SDavid du Colombier // be the new Raw(x-bpp) for the next loop
3512*593dc095SDavid du Colombier jb dsub8lpA
3513*593dc095SDavid du Colombier dsub8lt8:
3514*593dc095SDavid du Colombier } // end _asm block
3515*593dc095SDavid du Colombier }
3516*593dc095SDavid du Colombier break;
3517*593dc095SDavid du Colombier
3518*593dc095SDavid du Colombier default: // bpp greater than 8 bytes
3519*593dc095SDavid du Colombier {
3520*593dc095SDavid du Colombier _asm {
3521*593dc095SDavid du Colombier mov ebx, diff
3522*593dc095SDavid du Colombier mov edi, row
3523*593dc095SDavid du Colombier mov esi, edi // lp = row
3524*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3525*593dc095SDavid du Colombier dsubAlp:
3526*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3527*593dc095SDavid du Colombier movq mm1, [esi+ebx]
3528*593dc095SDavid du Colombier add ebx, 8
3529*593dc095SDavid du Colombier paddb mm0, mm1
3530*593dc095SDavid du Colombier cmp ebx, MMXLength
3531*593dc095SDavid du Colombier movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3532*593dc095SDavid du Colombier // add ebx
3533*593dc095SDavid du Colombier jb dsubAlp
3534*593dc095SDavid du Colombier } // end _asm block
3535*593dc095SDavid du Colombier }
3536*593dc095SDavid du Colombier break;
3537*593dc095SDavid du Colombier
3538*593dc095SDavid du Colombier } // end switch ( bpp )
3539*593dc095SDavid du Colombier
3540*593dc095SDavid du Colombier _asm {
3541*593dc095SDavid du Colombier mov ebx, MMXLength
3542*593dc095SDavid du Colombier mov edi, row
3543*593dc095SDavid du Colombier cmp ebx, FullLength
3544*593dc095SDavid du Colombier jnb dsubend
3545*593dc095SDavid du Colombier mov esi, edi // lp = row
3546*593dc095SDavid du Colombier xor eax, eax
3547*593dc095SDavid du Colombier add edi, bpp // rp = row + bpp
3548*593dc095SDavid du Colombier dsublp2:
3549*593dc095SDavid du Colombier mov al, [esi+ebx]
3550*593dc095SDavid du Colombier add [edi+ebx], al
3551*593dc095SDavid du Colombier inc ebx
3552*593dc095SDavid du Colombier cmp ebx, FullLength
3553*593dc095SDavid du Colombier jb dsublp2
3554*593dc095SDavid du Colombier dsubend:
3555*593dc095SDavid du Colombier emms // End MMX instructions; prep for possible FP instrs.
3556*593dc095SDavid du Colombier } // end _asm block
3557*593dc095SDavid du Colombier }
3558*593dc095SDavid du Colombier
3559*593dc095SDavid du Colombier // Optimized code for PNG Up filter decoder
3560*593dc095SDavid du Colombier void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info,png_bytep row,png_bytep prev_row)3561*593dc095SDavid du Colombier png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3562*593dc095SDavid du Colombier png_bytep prev_row)
3563*593dc095SDavid du Colombier {
3564*593dc095SDavid du Colombier png_uint_32 len;
3565*593dc095SDavid du Colombier len = row_info->rowbytes; // # of bytes to filter
3566*593dc095SDavid du Colombier _asm {
3567*593dc095SDavid du Colombier mov edi, row
3568*593dc095SDavid du Colombier // get # of bytes to alignment
3569*593dc095SDavid du Colombier mov ecx, edi
3570*593dc095SDavid du Colombier xor ebx, ebx
3571*593dc095SDavid du Colombier add ecx, 0x7
3572*593dc095SDavid du Colombier xor eax, eax
3573*593dc095SDavid du Colombier and ecx, 0xfffffff8
3574*593dc095SDavid du Colombier mov esi, prev_row
3575*593dc095SDavid du Colombier sub ecx, edi
3576*593dc095SDavid du Colombier jz dupgo
3577*593dc095SDavid du Colombier // fix alignment
3578*593dc095SDavid du Colombier duplp1:
3579*593dc095SDavid du Colombier mov al, [edi+ebx]
3580*593dc095SDavid du Colombier add al, [esi+ebx]
3581*593dc095SDavid du Colombier inc ebx
3582*593dc095SDavid du Colombier cmp ebx, ecx
3583*593dc095SDavid du Colombier mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3584*593dc095SDavid du Colombier jb duplp1
3585*593dc095SDavid du Colombier dupgo:
3586*593dc095SDavid du Colombier mov ecx, len
3587*593dc095SDavid du Colombier mov edx, ecx
3588*593dc095SDavid du Colombier sub edx, ebx // subtract alignment fix
3589*593dc095SDavid du Colombier and edx, 0x0000003f // calc bytes over mult of 64
3590*593dc095SDavid du Colombier sub ecx, edx // drop over bytes from length
3591*593dc095SDavid du Colombier // Unrolled loop - use all MMX registers and interleave to reduce
3592*593dc095SDavid du Colombier // number of branch instructions (loops) and reduce partial stalls
3593*593dc095SDavid du Colombier duploop:
3594*593dc095SDavid du Colombier movq mm1, [esi+ebx]
3595*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3596*593dc095SDavid du Colombier movq mm3, [esi+ebx+8]
3597*593dc095SDavid du Colombier paddb mm0, mm1
3598*593dc095SDavid du Colombier movq mm2, [edi+ebx+8]
3599*593dc095SDavid du Colombier movq [edi+ebx], mm0
3600*593dc095SDavid du Colombier paddb mm2, mm3
3601*593dc095SDavid du Colombier movq mm5, [esi+ebx+16]
3602*593dc095SDavid du Colombier movq [edi+ebx+8], mm2
3603*593dc095SDavid du Colombier movq mm4, [edi+ebx+16]
3604*593dc095SDavid du Colombier movq mm7, [esi+ebx+24]
3605*593dc095SDavid du Colombier paddb mm4, mm5
3606*593dc095SDavid du Colombier movq mm6, [edi+ebx+24]
3607*593dc095SDavid du Colombier movq [edi+ebx+16], mm4
3608*593dc095SDavid du Colombier paddb mm6, mm7
3609*593dc095SDavid du Colombier movq mm1, [esi+ebx+32]
3610*593dc095SDavid du Colombier movq [edi+ebx+24], mm6
3611*593dc095SDavid du Colombier movq mm0, [edi+ebx+32]
3612*593dc095SDavid du Colombier movq mm3, [esi+ebx+40]
3613*593dc095SDavid du Colombier paddb mm0, mm1
3614*593dc095SDavid du Colombier movq mm2, [edi+ebx+40]
3615*593dc095SDavid du Colombier movq [edi+ebx+32], mm0
3616*593dc095SDavid du Colombier paddb mm2, mm3
3617*593dc095SDavid du Colombier movq mm5, [esi+ebx+48]
3618*593dc095SDavid du Colombier movq [edi+ebx+40], mm2
3619*593dc095SDavid du Colombier movq mm4, [edi+ebx+48]
3620*593dc095SDavid du Colombier movq mm7, [esi+ebx+56]
3621*593dc095SDavid du Colombier paddb mm4, mm5
3622*593dc095SDavid du Colombier movq mm6, [edi+ebx+56]
3623*593dc095SDavid du Colombier movq [edi+ebx+48], mm4
3624*593dc095SDavid du Colombier add ebx, 64
3625*593dc095SDavid du Colombier paddb mm6, mm7
3626*593dc095SDavid du Colombier cmp ebx, ecx
3627*593dc095SDavid du Colombier movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3628*593dc095SDavid du Colombier // -8 to offset add ebx
3629*593dc095SDavid du Colombier jb duploop
3630*593dc095SDavid du Colombier
3631*593dc095SDavid du Colombier cmp edx, 0 // Test for bytes over mult of 64
3632*593dc095SDavid du Colombier jz dupend
3633*593dc095SDavid du Colombier
3634*593dc095SDavid du Colombier
3635*593dc095SDavid du Colombier // 2 lines added by lcreeve at netins.net
3636*593dc095SDavid du Colombier // (mail 11 Jul 98 in png-implement list)
3637*593dc095SDavid du Colombier cmp edx, 8 //test for less than 8 bytes
3638*593dc095SDavid du Colombier jb duplt8
3639*593dc095SDavid du Colombier
3640*593dc095SDavid du Colombier
3641*593dc095SDavid du Colombier add ecx, edx
3642*593dc095SDavid du Colombier and edx, 0x00000007 // calc bytes over mult of 8
3643*593dc095SDavid du Colombier sub ecx, edx // drop over bytes from length
3644*593dc095SDavid du Colombier jz duplt8
3645*593dc095SDavid du Colombier // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3646*593dc095SDavid du Colombier duplpA:
3647*593dc095SDavid du Colombier movq mm1, [esi+ebx]
3648*593dc095SDavid du Colombier movq mm0, [edi+ebx]
3649*593dc095SDavid du Colombier add ebx, 8
3650*593dc095SDavid du Colombier paddb mm0, mm1
3651*593dc095SDavid du Colombier cmp ebx, ecx
3652*593dc095SDavid du Colombier movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3653*593dc095SDavid du Colombier jb duplpA
3654*593dc095SDavid du Colombier cmp edx, 0 // Test for bytes over mult of 8
3655*593dc095SDavid du Colombier jz dupend
3656*593dc095SDavid du Colombier duplt8:
3657*593dc095SDavid du Colombier xor eax, eax
3658*593dc095SDavid du Colombier add ecx, edx // move over byte count into counter
3659*593dc095SDavid du Colombier // Loop using x86 registers to update remaining bytes
3660*593dc095SDavid du Colombier duplp2:
3661*593dc095SDavid du Colombier mov al, [edi + ebx]
3662*593dc095SDavid du Colombier add al, [esi + ebx]
3663*593dc095SDavid du Colombier inc ebx
3664*593dc095SDavid du Colombier cmp ebx, ecx
3665*593dc095SDavid du Colombier mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3666*593dc095SDavid du Colombier jb duplp2
3667*593dc095SDavid du Colombier dupend:
3668*593dc095SDavid du Colombier // Conversion of filtered row completed
3669*593dc095SDavid du Colombier emms // End MMX instructions; prep for possible FP instrs.
3670*593dc095SDavid du Colombier } // end _asm block
3671*593dc095SDavid du Colombier }
3672*593dc095SDavid du Colombier
3673*593dc095SDavid du Colombier
3674*593dc095SDavid du Colombier // Optimized png_read_filter_row routines
3675*593dc095SDavid du Colombier void /* PRIVATE */
png_read_filter_row(png_structp png_ptr,png_row_infop row_info,png_bytep row,png_bytep prev_row,int filter)3676*593dc095SDavid du Colombier png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3677*593dc095SDavid du Colombier row, png_bytep prev_row, int filter)
3678*593dc095SDavid du Colombier {
3679*593dc095SDavid du Colombier #ifdef PNG_DEBUG
3680*593dc095SDavid du Colombier char filnm[10];
3681*593dc095SDavid du Colombier #endif
3682*593dc095SDavid du Colombier
3683*593dc095SDavid du Colombier if (mmx_supported == 2) {
3684*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3685*593dc095SDavid du Colombier /* this should have happened in png_init_mmx_flags() already */
3686*593dc095SDavid du Colombier png_warning(png_ptr, "asm_flags may not have been initialized");
3687*593dc095SDavid du Colombier #endif
3688*593dc095SDavid du Colombier png_mmx_support();
3689*593dc095SDavid du Colombier }
3690*593dc095SDavid du Colombier
3691*593dc095SDavid du Colombier #ifdef PNG_DEBUG
3692*593dc095SDavid du Colombier png_debug(1, "in png_read_filter_row\n");
3693*593dc095SDavid du Colombier switch (filter)
3694*593dc095SDavid du Colombier {
3695*593dc095SDavid du Colombier case 0: sprintf(filnm, "none");
3696*593dc095SDavid du Colombier break;
3697*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3698*593dc095SDavid du Colombier case 1: sprintf(filnm, "sub-%s",
3699*593dc095SDavid du Colombier (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3700*593dc095SDavid du Colombier break;
3701*593dc095SDavid du Colombier case 2: sprintf(filnm, "up-%s",
3702*593dc095SDavid du Colombier (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3703*593dc095SDavid du Colombier break;
3704*593dc095SDavid du Colombier case 3: sprintf(filnm, "avg-%s",
3705*593dc095SDavid du Colombier (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3706*593dc095SDavid du Colombier break;
3707*593dc095SDavid du Colombier case 4: sprintf(filnm, "Paeth-%s",
3708*593dc095SDavid du Colombier (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3709*593dc095SDavid du Colombier break;
3710*593dc095SDavid du Colombier #else
3711*593dc095SDavid du Colombier case 1: sprintf(filnm, "sub");
3712*593dc095SDavid du Colombier break;
3713*593dc095SDavid du Colombier case 2: sprintf(filnm, "up");
3714*593dc095SDavid du Colombier break;
3715*593dc095SDavid du Colombier case 3: sprintf(filnm, "avg");
3716*593dc095SDavid du Colombier break;
3717*593dc095SDavid du Colombier case 4: sprintf(filnm, "Paeth");
3718*593dc095SDavid du Colombier break;
3719*593dc095SDavid du Colombier #endif
3720*593dc095SDavid du Colombier default: sprintf(filnm, "unknw");
3721*593dc095SDavid du Colombier break;
3722*593dc095SDavid du Colombier }
3723*593dc095SDavid du Colombier png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3724*593dc095SDavid du Colombier png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3725*593dc095SDavid du Colombier (int)((row_info->pixel_depth + 7) >> 3));
3726*593dc095SDavid du Colombier png_debug1(0,"len=%8d, ", row_info->rowbytes);
3727*593dc095SDavid du Colombier #endif /* PNG_DEBUG */
3728*593dc095SDavid du Colombier
3729*593dc095SDavid du Colombier switch (filter)
3730*593dc095SDavid du Colombier {
3731*593dc095SDavid du Colombier case PNG_FILTER_VALUE_NONE:
3732*593dc095SDavid du Colombier break;
3733*593dc095SDavid du Colombier
3734*593dc095SDavid du Colombier case PNG_FILTER_VALUE_SUB:
3735*593dc095SDavid du Colombier {
3736*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3737*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3738*593dc095SDavid du Colombier (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3739*593dc095SDavid du Colombier (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3740*593dc095SDavid du Colombier #else
3741*593dc095SDavid du Colombier if (mmx_supported)
3742*593dc095SDavid du Colombier #endif
3743*593dc095SDavid du Colombier {
3744*593dc095SDavid du Colombier png_read_filter_row_mmx_sub(row_info, row);
3745*593dc095SDavid du Colombier }
3746*593dc095SDavid du Colombier else
3747*593dc095SDavid du Colombier {
3748*593dc095SDavid du Colombier png_uint_32 i;
3749*593dc095SDavid du Colombier png_uint_32 istop = row_info->rowbytes;
3750*593dc095SDavid du Colombier png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3751*593dc095SDavid du Colombier png_bytep rp = row + bpp;
3752*593dc095SDavid du Colombier png_bytep lp = row;
3753*593dc095SDavid du Colombier
3754*593dc095SDavid du Colombier for (i = bpp; i < istop; i++)
3755*593dc095SDavid du Colombier {
3756*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3757*593dc095SDavid du Colombier rp++;
3758*593dc095SDavid du Colombier }
3759*593dc095SDavid du Colombier }
3760*593dc095SDavid du Colombier break;
3761*593dc095SDavid du Colombier }
3762*593dc095SDavid du Colombier
3763*593dc095SDavid du Colombier case PNG_FILTER_VALUE_UP:
3764*593dc095SDavid du Colombier {
3765*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3766*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3767*593dc095SDavid du Colombier (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3768*593dc095SDavid du Colombier (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3769*593dc095SDavid du Colombier #else
3770*593dc095SDavid du Colombier if (mmx_supported)
3771*593dc095SDavid du Colombier #endif
3772*593dc095SDavid du Colombier {
3773*593dc095SDavid du Colombier png_read_filter_row_mmx_up(row_info, row, prev_row);
3774*593dc095SDavid du Colombier }
3775*593dc095SDavid du Colombier else
3776*593dc095SDavid du Colombier {
3777*593dc095SDavid du Colombier png_uint_32 i;
3778*593dc095SDavid du Colombier png_uint_32 istop = row_info->rowbytes;
3779*593dc095SDavid du Colombier png_bytep rp = row;
3780*593dc095SDavid du Colombier png_bytep pp = prev_row;
3781*593dc095SDavid du Colombier
3782*593dc095SDavid du Colombier for (i = 0; i < istop; ++i)
3783*593dc095SDavid du Colombier {
3784*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3785*593dc095SDavid du Colombier rp++;
3786*593dc095SDavid du Colombier }
3787*593dc095SDavid du Colombier }
3788*593dc095SDavid du Colombier break;
3789*593dc095SDavid du Colombier }
3790*593dc095SDavid du Colombier
3791*593dc095SDavid du Colombier case PNG_FILTER_VALUE_AVG:
3792*593dc095SDavid du Colombier {
3793*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3794*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3795*593dc095SDavid du Colombier (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3796*593dc095SDavid du Colombier (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3797*593dc095SDavid du Colombier #else
3798*593dc095SDavid du Colombier if (mmx_supported)
3799*593dc095SDavid du Colombier #endif
3800*593dc095SDavid du Colombier {
3801*593dc095SDavid du Colombier png_read_filter_row_mmx_avg(row_info, row, prev_row);
3802*593dc095SDavid du Colombier }
3803*593dc095SDavid du Colombier else
3804*593dc095SDavid du Colombier {
3805*593dc095SDavid du Colombier png_uint_32 i;
3806*593dc095SDavid du Colombier png_bytep rp = row;
3807*593dc095SDavid du Colombier png_bytep pp = prev_row;
3808*593dc095SDavid du Colombier png_bytep lp = row;
3809*593dc095SDavid du Colombier png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3810*593dc095SDavid du Colombier png_uint_32 istop = row_info->rowbytes - bpp;
3811*593dc095SDavid du Colombier
3812*593dc095SDavid du Colombier for (i = 0; i < bpp; i++)
3813*593dc095SDavid du Colombier {
3814*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) +
3815*593dc095SDavid du Colombier ((int)(*pp++) >> 1)) & 0xff);
3816*593dc095SDavid du Colombier rp++;
3817*593dc095SDavid du Colombier }
3818*593dc095SDavid du Colombier
3819*593dc095SDavid du Colombier for (i = 0; i < istop; i++)
3820*593dc095SDavid du Colombier {
3821*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) +
3822*593dc095SDavid du Colombier ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3823*593dc095SDavid du Colombier rp++;
3824*593dc095SDavid du Colombier }
3825*593dc095SDavid du Colombier }
3826*593dc095SDavid du Colombier break;
3827*593dc095SDavid du Colombier }
3828*593dc095SDavid du Colombier
3829*593dc095SDavid du Colombier case PNG_FILTER_VALUE_PAETH:
3830*593dc095SDavid du Colombier {
3831*593dc095SDavid du Colombier #if !defined(PNG_1_0_X)
3832*593dc095SDavid du Colombier if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3833*593dc095SDavid du Colombier (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3834*593dc095SDavid du Colombier (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3835*593dc095SDavid du Colombier #else
3836*593dc095SDavid du Colombier if (mmx_supported)
3837*593dc095SDavid du Colombier #endif
3838*593dc095SDavid du Colombier {
3839*593dc095SDavid du Colombier png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3840*593dc095SDavid du Colombier }
3841*593dc095SDavid du Colombier else
3842*593dc095SDavid du Colombier {
3843*593dc095SDavid du Colombier png_uint_32 i;
3844*593dc095SDavid du Colombier png_bytep rp = row;
3845*593dc095SDavid du Colombier png_bytep pp = prev_row;
3846*593dc095SDavid du Colombier png_bytep lp = row;
3847*593dc095SDavid du Colombier png_bytep cp = prev_row;
3848*593dc095SDavid du Colombier png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3849*593dc095SDavid du Colombier png_uint_32 istop=row_info->rowbytes - bpp;
3850*593dc095SDavid du Colombier
3851*593dc095SDavid du Colombier for (i = 0; i < bpp; i++)
3852*593dc095SDavid du Colombier {
3853*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3854*593dc095SDavid du Colombier rp++;
3855*593dc095SDavid du Colombier }
3856*593dc095SDavid du Colombier
3857*593dc095SDavid du Colombier for (i = 0; i < istop; i++) // use leftover rp,pp
3858*593dc095SDavid du Colombier {
3859*593dc095SDavid du Colombier int a, b, c, pa, pb, pc, p;
3860*593dc095SDavid du Colombier
3861*593dc095SDavid du Colombier a = *lp++;
3862*593dc095SDavid du Colombier b = *pp++;
3863*593dc095SDavid du Colombier c = *cp++;
3864*593dc095SDavid du Colombier
3865*593dc095SDavid du Colombier p = b - c;
3866*593dc095SDavid du Colombier pc = a - c;
3867*593dc095SDavid du Colombier
3868*593dc095SDavid du Colombier #ifdef PNG_USE_ABS
3869*593dc095SDavid du Colombier pa = abs(p);
3870*593dc095SDavid du Colombier pb = abs(pc);
3871*593dc095SDavid du Colombier pc = abs(p + pc);
3872*593dc095SDavid du Colombier #else
3873*593dc095SDavid du Colombier pa = p < 0 ? -p : p;
3874*593dc095SDavid du Colombier pb = pc < 0 ? -pc : pc;
3875*593dc095SDavid du Colombier pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3876*593dc095SDavid du Colombier #endif
3877*593dc095SDavid du Colombier
3878*593dc095SDavid du Colombier /*
3879*593dc095SDavid du Colombier if (pa <= pb && pa <= pc)
3880*593dc095SDavid du Colombier p = a;
3881*593dc095SDavid du Colombier else if (pb <= pc)
3882*593dc095SDavid du Colombier p = b;
3883*593dc095SDavid du Colombier else
3884*593dc095SDavid du Colombier p = c;
3885*593dc095SDavid du Colombier */
3886*593dc095SDavid du Colombier
3887*593dc095SDavid du Colombier p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3888*593dc095SDavid du Colombier
3889*593dc095SDavid du Colombier *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3890*593dc095SDavid du Colombier rp++;
3891*593dc095SDavid du Colombier }
3892*593dc095SDavid du Colombier }
3893*593dc095SDavid du Colombier break;
3894*593dc095SDavid du Colombier }
3895*593dc095SDavid du Colombier
3896*593dc095SDavid du Colombier default:
3897*593dc095SDavid du Colombier png_warning(png_ptr, "Ignoring bad row filter type");
3898*593dc095SDavid du Colombier *row=0;
3899*593dc095SDavid du Colombier break;
3900*593dc095SDavid du Colombier }
3901*593dc095SDavid du Colombier }
3902*593dc095SDavid du Colombier
3903*593dc095SDavid du Colombier #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */
3904