xref: /plan9-contrib/sys/src/cmd/unix/drawterm/libmemdraw/draw.c (revision 8ccd4a6360d974db7bd7bbd4f37e7018419ea908)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <memdraw.h>
5 
/* debugging switch; in the part of the file visible here it is only consulted by commented-out iprint calls */
6 int drawdebug;
/* set to 1 by mktables() once the conversion tables have been filled in */
7 static int	tablesbuilt;
8 
9 /* perfect approximation to NTSC = .299r+.587g+.114b when 0 ≤ r,g,b < 256 */
10 #define RGB2K(r,g,b)	((156763*(r)+307758*(g)+59769*(b))>>19)
11 
12 /*
13  * for 0 ≤ x ≤ 255*255, (x*0x0101+0x100)>>16 is a perfect approximation.
14  * for 0 ≤ x < (1<<16), x/255 = ((x+1)*0x0101)>>16 is a perfect approximation.
15  * the last one is perfect for all up to 1<<16, avoids a multiply, but requires a rathole.
16  */
17 /* #define DIV255(x) (((x)*257+256)>>16)  */
18 #define DIV255(x) ((((x)+1)*257)>>16)
19 /* #define DIV255(x) (tmp=(x)+1, (tmp+(tmp>>8))>>8) */
20 
/*
 * MUL(x, y, t): x*y/255 with rounding, for 8-bit x and y.
 * t is a scratch lvalue supplied by the caller; it is assigned by the macro.
 */
21 #define MUL(x, y, t)	(t = (x)*(y)+128, (t+(t>>8))>>8)
/* masks selecting bytes 1 and 3, and bytes 0 and 2, of a 32-bit word */
22 #define MASK13	0xFF00FF00
23 #define MASK02	0x00FF00FF
/*
 * MUL13 and MUL02 apply the MUL computation to two bytes of x at once
 * (bytes 1,3 and bytes 0,2 respectively); both leave their two results
 * in byte positions 0 and 2.  MUL0123 shifts the MUL13 result back up by
 * 8 and ors the halves together, scaling all four bytes of x by a.
 * s and t are caller-supplied scratch lvalues.
 */
24 #define MUL13(a, x, t)		(t = (a)*(((x)&MASK13)>>8)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)
25 #define MUL02(a, x, t)		(t = (a)*(((x)&MASK02)>>0)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)
26 #define MUL0123(a, x, s, t)	((MUL13(a, x, s)<<8)|MUL02(a, x, t))
27 
/*
 * MUL2(u, v, x, y): (u*v + x*y)/255 computed in one step.
 * NOTE: uses a variable named t from the enclosing scope; it is not a macro parameter.
 */
28 #define MUL2(u, v, x, y)	(t = (u)*(v)+(x)*(y)+256, (t+(t>>8))>>8)
29 
30 static void mktables(void);
/*
 * A sub-drawing routine: given the prepared draw parameters it returns
 * nonzero if it handled the draw (see _memimagedraw).
 */
31 typedef int Subdraw(Memdrawparam*);
32 static Subdraw chardraw, alphadraw, memoptdraw;
33 
/*
 * memones and memzeros are 1x1 replicated GREY1 images allocated in
 * _memimageinit, holding all-one and all-zero bits respectively.
 * memwhite and memopaque are set to memones; memblack and
 * memtransparent are set to memzeros.
 */
34 static Memimage*	memones;
35 static Memimage*	memzeros;
36 Memimage *memwhite;
37 Memimage *memblack;
38 Memimage *memtransparent;
39 Memimage *memopaque;
40 
/* print-format routine installed for the 'b' verb by _memimageinit; defined elsewhere */
41 int	_ifmt(Fmt*);
42 
43 void
44 _memimageinit(void)
45 {
46 	static int didinit = 0;
47 
48 	if(didinit)
49 		return;
50 
51 	didinit = 1;
52 
53 	mktables();
54 	_memmkcmap();
55 
56 	fmtinstall('R', Rfmt);
57 	fmtinstall('P', Pfmt);
58 	fmtinstall('b', _ifmt);
59 
60 	memones = allocmemimage(Rect(0,0,1,1), GREY1);
61 	memones->flags |= Frepl;
62 	memones->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
63 	*byteaddr(memones, ZP) = ~0;
64 
65 	memzeros = allocmemimage(Rect(0,0,1,1), GREY1);
66 	memzeros->flags |= Frepl;
67 	memzeros->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
68 	*byteaddr(memzeros, ZP) = 0;
69 
70 	if(memones == nil || memzeros == nil)
71 		assert(0 /*cannot initialize memimage library */);	/* RSC BUG */
72 
73 	memwhite = memones;
74 	memblack = memzeros;
75 	memopaque = memones;
76 	memtransparent = memzeros;
77 }
78 
/* pixel conversion helpers defined elsewhere; used by _memimagedrawsetup */
79 ulong _imgtorgba(Memimage*, ulong);
80 ulong _rgbatoimg(Memimage*, ulong);
81 ulong _pixelbits(Memimage*, Point);
82 
/* prefix that disables the statement following it; undefined again after _memimagedraw */
83 #define DBG if(0)
/*
 * The single parameter block filled in and returned by _memimagedrawsetup.
 * Because it is static, each call overwrites the result of the previous one.
 */
84 static Memdrawparam par;
85 
/*
 * Prepare the parameters for drawing src through mask onto rectangle r of
 * dst with compositing operator op.  p0 and p1 are the points in src and
 * mask that correspond to r.min.  A nil mask means memopaque.
 *
 * Returns a pointer to the static parameter block par, or nil when there
 * is nothing to draw: the clipped rectangle is empty, op is outside
 * Clear..SoverD, or a 1x1 replicated source or mask makes the operation
 * a no-op (see the individual checks below).
 */
86 Memdrawparam*
87 _memimagedrawsetup(Memimage *dst, Rectangle r, Memimage *src, Point p0, Memimage *mask, Point p1, int op)
88 {
89 
90 	if(mask == nil)
91 		mask = memopaque;
92 
93 DBG	print("memimagedraw %p/%luX %R @ %p %p/%luX %P %p/%luX %P... ", dst, dst->chan, r, dst->data->bdata, src, src->chan, p0, mask, mask->chan, p1);
94 
	/* clips r, p0 and p1 in place and fills in par.sr and par.mr */
95 	if(drawclip(dst, &r, src, &p0, mask, &p1, &par.sr, &par.mr) == 0){
96 //		if(drawdebug)
97 //			iprint("empty clipped rectangle\n");
98 		return nil;
99 	}
100 
101 	if(op < Clear || op > SoverD){
102 //		if(drawdebug)
103 //			iprint("op out of range: %d\n", op);
104 		return nil;
105 	}
106 
107 	par.op = op;
108 	par.dst = dst;
109 	par.r = r;
110 	par.src = src;
111 	/* par.sr set by drawclip */
112 	par.mask = mask;
113 	/* par.mr set by drawclip */
114 
115 	par.state = 0;
116 	if(src->flags&Frepl){
117 		par.state |= Replsrc;
		/* a 1x1 replicated source is a solid color: precompute its value in src, rgba and dst formats */
118 		if(Dx(src->r)==1 && Dy(src->r)==1){
119 			par.sval = _pixelbits(src, src->r.min);
120 			par.state |= Simplesrc;
121 			par.srgba = _imgtorgba(src, par.sval);
122 			par.sdval = _rgbatoimg(dst, par.srgba);
			/* low byte of srgba is zero (treated here as a fully transparent source) and op has the DoutS bit set */
123 			if((par.srgba&0xFF) == 0 && (op&DoutS)){
124 //				if (drawdebug) iprint("fill with transparent source\n");
125 				return nil;	/* no-op successfully handled */
126 			}
127 		}
128 	}
129 
130 	if(mask->flags & Frepl){
131 		par.state |= Replmask;
		/* a 1x1 replicated mask is a constant: precompute its value */
132 		if(Dx(mask->r)==1 && Dy(mask->r)==1){
133 			par.mval = _pixelbits(mask, mask->r.min);
134 			if(par.mval == 0 && (op&DoutS)){
135 //				if(drawdebug) iprint("fill with zero mask\n");
136 				return nil;	/* no-op successfully handled */
137 			}
138 			par.state |= Simplemask;
			/* all mask bits set */
139 			if(par.mval == ~0)
140 				par.state |= Fullmask;
141 			par.mrgba = _imgtorgba(mask, par.mval);
142 		}
143 	}
144 
145 //	if(drawdebug)
146 //		iprint("dr %R sr %R mr %R...", r, par.sr, par.mr);
147 DBG print("draw dr %R sr %R mr %R %lux\n", r, par.sr, par.mr, par.state);
148 
149 	return &par;
150 }
151 
152 void
153 _memimagedraw(Memdrawparam *par)
154 {
155 	if (par == nil)
156 		return;
157 
158 	/*
159 	 * Now that we've clipped the parameters down to be consistent, we
160 	 * simply try sub-drawing routines in order until we find one that was able
161 	 * to handle us.  If the sub-drawing routine returns zero, it means it was
162 	 * unable to satisfy the request, so we do not return.
163 	 */
164 
165 	/*
166 	 * Hardware support.  Each video driver provides this function,
167 	 * which checks to see if there is anything it can help with.
168 	 * There could be an if around this checking to see if dst is in video memory.
169 	 */
170 DBG print("test hwdraw\n");
171 	if(hwdraw(par)){
172 //if(drawdebug) iprint("hw handled\n");
173 DBG print("hwdraw handled\n");
174 		return;
175 	}
176 	/*
177 	 * Optimizations using memmove and memset.
178 	 */
179 DBG print("test memoptdraw\n");
180 	if(memoptdraw(par)){
181 //if(drawdebug) iprint("memopt handled\n");
182 DBG print("memopt handled\n");
183 		return;
184 	}
185 
186 	/*
187 	 * Character drawing.
188 	 * Solid source color being painted through a boolean mask onto a high res image.
189 	 */
190 DBG print("test chardraw\n");
191 	if(chardraw(par)){
192 //if(drawdebug) iprint("chardraw handled\n");
193 DBG print("chardraw handled\n");
194 		return;
195 	}
196 
197 	/*
198 	 * General calculation-laden case that does alpha for each pixel.
199 	 */
200 DBG print("do alphadraw\n");
201 	alphadraw(par);
202 //if(drawdebug) iprint("alphadraw handled\n");
203 DBG print("alphadraw handled\n");
204 }
205 #undef DBG
206 
207 /*
208  * Clip the destination rectangle further based on the properties of the
209  * source and mask rectangles.  Once the destination rectangle is properly
210  * clipped, adjust the source and mask rectangles to be the same size.
211  * Then if source or mask is replicated, move its clipped rectangle
212  * so that its minimum point falls within the repl rectangle.
213  *
214  * Return zero if the final rectangle is null.
215  */
/*
 * Parameters: r is the destination rectangle, p0 and p1 the points in src
 * and mask corresponding to r->min; all three are updated in place.
 * sr and mr receive the matching rectangles in source and mask coordinates.
 * Returns 1 if something remains to be drawn, 0 otherwise.
 */
216 int
217 drawclip(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
218 {
219 	Point rmin, delta;
220 	int splitcoords;
221 	Rectangle omr;
222 
	/* an empty rectangle draws nothing */
223 	if(r->min.x>=r->max.x || r->min.y>=r->max.y)
224 		return 0;
	/* source and mask points differ, so the mask rectangle must be tracked separately */
225 	splitcoords = (p0->x!=p1->x) || (p0->y!=p1->y);
226 	/* clip to destination */
227 	rmin = r->min;
228 	if(!rectclip(r, dst->r) || !rectclip(r, dst->clipr))
229 		return 0;
230 	/* move mask point */
231 	p1->x += r->min.x-rmin.x;
232 	p1->y += r->min.y-rmin.y;
233 	/* move source point */
234 	p0->x += r->min.x-rmin.x;
235 	p0->y += r->min.y-rmin.y;
236 	/* map destination rectangle into source */
237 	sr->min = *p0;
238 	sr->max.x = p0->x+Dx(*r);
239 	sr->max.y = p0->y+Dy(*r);
240 	/* sr is r in source coordinates; clip to source */
	/* a replicated source is clipped only by its clipr, not by its r */
241 	if(!(src->flags&Frepl) && !rectclip(sr, src->r))
242 		return 0;
243 	if(!rectclip(sr, src->clipr))
244 		return 0;
245 	/* compute and clip rectangle in mask */
246 	if(splitcoords){
247 		/* move mask point with source */
248 		p1->x += sr->min.x-p0->x;
249 		p1->y += sr->min.y-p0->y;
250 		mr->min = *p1;
251 		mr->max.x = p1->x+Dx(*sr);
252 		mr->max.y = p1->y+Dy(*sr);
		/* remember the unclipped mask rectangle to measure how much clipping removes */
253 		omr = *mr;
254 		/* mr is now rectangle in mask; clip it */
255 		if(!(mask->flags&Frepl) && !rectclip(mr, mask->r))
256 			return 0;
257 		if(!rectclip(mr, mask->clipr))
258 			return 0;
259 		/* reflect any clips back to source */
260 		sr->min.x += mr->min.x-omr.min.x;
261 		sr->min.y += mr->min.y-omr.min.y;
262 		sr->max.x += mr->max.x-omr.max.x;
263 		sr->max.y += mr->max.y-omr.max.y;
264 		*p1 = mr->min;
265 	}else{
		/* same coordinates for source and mask: clip sr directly against the mask */
266 		if(!(mask->flags&Frepl) && !rectclip(sr, mask->r))
267 			return 0;
268 		if(!rectclip(sr, mask->clipr))
269 			return 0;
270 		*p1 = sr->min;
271 	}
272 
273 	/* move source clipping back to destination */
274 	delta.x = r->min.x - p0->x;
275 	delta.y = r->min.y - p0->y;
276 	r->min.x = sr->min.x + delta.x;
277 	r->min.y = sr->min.y + delta.y;
278 	r->max.x = sr->max.x + delta.x;
279 	r->max.y = sr->max.y + delta.y;
280 
281 	/* move source rectangle so sr->min is in src->r */
282 	if(src->flags&Frepl) {
283 		delta.x = drawreplxy(src->r.min.x, src->r.max.x, sr->min.x) - sr->min.x;
284 		delta.y = drawreplxy(src->r.min.y, src->r.max.y, sr->min.y) - sr->min.y;
285 		sr->min.x += delta.x;
286 		sr->min.y += delta.y;
287 		sr->max.x += delta.x;
288 		sr->max.y += delta.y;
289 	}
290 	*p0 = sr->min;
291 
292 	/* move mask point so it is in mask->r */
293 	*p1 = drawrepl(mask->r, *p1);
	/* rebuild mr from the final mask point so it has the same size as sr */
294 	mr->min = *p1;
295 	mr->max.x = p1->x+Dx(*sr);
296 	mr->max.y = p1->y+Dy(*sr);
297 
	/* the three rectangles must agree in size and their origins must lie inside their images */
298 	assert(Dx(*sr) == Dx(*mr) && Dx(*mr) == Dx(*r));
299 	assert(Dy(*sr) == Dy(*mr) && Dy(*mr) == Dy(*r));
300 	assert(ptinrect(*p0, src->r));
301 	assert(ptinrect(*p1, mask->r));
302 	assert(ptinrect(r->min, dst->r));
303 
304 	return 1;
305 }
306 
307 /*
308  * Conversion tables.
309  */
/* filled in by mktables() */
310 static uchar replbit[1+8][256];		/* replbit[x][y] is the replication of the x-bit quantity y to 8-bit depth */
311 
312 /*
313  * bitmap of how to replicate n bits to fill 8, for 1 ≤ n ≤ 8.
314  * the X's are where to put the bottom (ones) bit of the n-bit pattern.
315  * only the top 8 bits of the result are actually used.
316  * (the lower 8 bits are needed to get bits in the right place
317  * when n is not a divisor of 8.)
318  *
319  * Should check to see if it's easier to just refer to replmul than
320  * use the precomputed values in replbit.  On PCs it may well
321  * be; on machines with slow multiply instructions it probably isn't.
322  */
/*
 * The three macros below let each table row be written as a picture of
 * 16 bits: `a' opens sixteen parentheses around 0, and each X or _ closes
 * one while shifting in a 1 or a 0 bit respectively.
 */
323 #define a ((((((((((((((((0
324 #define X *2+1)
325 #define _ *2)
/* replmul[n] is the multiplier for n-bit values; entry 0 is unused padding so n indexes directly */
326 static int replmul[1+8] = {
327 	0,
328 	a X X X X X X X X X X X X X X X X,
329 	a _ X _ X _ X _ X _ X _ X _ X _ X,
330 	a _ _ X _ _ X _ _ X _ _ X _ _ X _,
331 	a _ _ _ X _ _ _ X _ _ _ X _ _ _ X,
332 	a _ _ _ _ X _ _ _ _ X _ _ _ _ X _,
333 	a _ _ _ _ _ X _ _ _ _ _ X _ _ _ _,
334 	a _ _ _ _ _ _ X _ _ _ _ _ _ X _ _,
335 	a _ _ _ _ _ _ _ X _ _ _ _ _ _ _ X,
336 };
337 #undef a
338 #undef X
339 #undef _
340 
341 static void
342 mktables(void)
343 {
344 	int i, j, small;
345 
346 	if(tablesbuilt)
347 		return;
348 
349 	fmtinstall('R', Rfmt);
350 	fmtinstall('P', Pfmt);
351 	tablesbuilt = 1;
352 
353 	/* bit replication up to 8 bits */
354 	for(i=0; i<256; i++){
355 		for(j=0; j<=8; j++){	/* j <= 8 [sic] */
356 			small = i & ((1<<j)-1);
357 			replbit[j][i] = (small*replmul[j])>>8;
358 		}
359 	}
360 
361 }
362 
/*
 * Shared constant 0xff byte.  A Buffer whose alpha pointer equals &ones
 * has no alpha storage of its own: the calc routines test for this
 * address before writing or advancing the alpha pointer.
 */
363 static uchar ones = 0xff;
364 
365 /*
366  * General alpha drawing case.  Can handle anything.
367  */
/*
 * One scan line of pixels, described as a set of per-channel pointers
 * that all advance by delta bytes per pixel.
 */
368 typedef struct	Buffer	Buffer;
369 struct Buffer {
370 	/* used by most routines */
371 	uchar	*red;
372 	uchar	*grn;
373 	uchar	*blu;
374 	uchar	*alpha;
375 	uchar	*grey;
376 	ulong	*rgba;
377 	int	delta;	/* number of bytes to add to pointer to get next pixel to the right */
378 
379 	/* used by boolcalc* for mask data */
380 	uchar	*m;		/* ptr to mask data r.min byte; like p->bytermin */
381 	int		mskip;	/* no. of left bits to skip in *m */
382 	uchar	*bm;		/* ptr to mask data img->r.min byte; like p->bytey0s */
383 	int		bmskip;	/* no. of left bits to skip in *bm */
384 	uchar	*em;		/* ptr to mask data img->r.max.x byte; like p->bytey0e */
385 	int		emskip;	/* no. of right bits to skip in *em */
386 };
387 
388 typedef struct	Param	Param;
/* alphadraw calls a Readfn as (param, buffer base, line offset y) and gets back the Buffer for that line */
389 typedef Buffer	Readfn(Param*, uchar*, int);
/* alphadraw calls a Writefn as (param, address of the destination line, computed Buffer) */
390 typedef void	Writefn(Param*, uchar*, Buffer);
/* a Calcfn takes (bdst, bsrc, bmask, dx, grey, op) and returns the Buffer to be written */
391 typedef Buffer	Calcfn(Buffer, Buffer, Buffer, int, int, int);
392 
393 enum {
	/* size of Param.bcache; getparam only enables line caching for replicated images at most this tall */
394 	MAXBCACHE = 16
395 };
396 
397 /* giant rathole to customize functions with */
/*
 * Per-image state for alphadraw: one of these is built by getparam for
 * each of source, mask and destination.
 */
398 struct Param {
	/* underlying functions wrapped by the replread and greymaskread readers (set in alphadraw) */
399 	Readfn	*replcall;
400 	Readfn	*greymaskcall;
401 	Readfn	*convreadcall;
402 	Writefn	*convwritecall;
403 
404 	Memimage *img;
405 	Rectangle	r;
406 	int	dx;	/* of r */
407 	int	needbuf;
408 	int	convgrey;
409 	int	alphaonly;
410 
411 	uchar	*bytey0s;		/* byteaddr(Pt(img->r.min.x, img->r.min.y)) */
412 	uchar	*bytermin;	/* byteaddr(Pt(r.min.x, img->r.min.y)) */
413 	uchar	*bytey0e;		/* byteaddr(Pt(img->r.max.x, img->r.min.y)) */
414 	int		bwidth;	/* bytes per scan line: sizeof(ulong)*img->width */
415 
416 	int	replcache;	/* if set, cache buffers */
417 	Buffer	bcache[MAXBCACHE];
418 	ulong	bfilled;
419 	uchar	*bufbase;	/* drawbuf+bufoff, filled in by alphadraw once drawbuf is allocated */
420 	int	bufoff;	/* offset of this image's space within drawbuf, reserved by getparam */
421 	int	bufdelta;	/* bytes reserved per buffered line: 4*dx */
422 
423 	int	dir;	/* +1 or -1: direction alphadraw walks the scan lines */
424 
425 	int	convbufoff;
426 	uchar	*convbuf;	/* drawbuf+convbufoff, filled in by alphadraw */
427 	Param	*convdpar;
428 	int	convdx;
429 };
430 
/*
 * Scratch buffer shared by all alphadraw calls.  ndrawbuf is the number
 * of bytes the current draw needs (reset in alphadraw, bumped by
 * getparam); mdrawbuf is the number of bytes actually allocated
 * (maintained by allocdrawbuf).
 */
431 static uchar *drawbuf;
432 static int	ndrawbuf;
433 static int	mdrawbuf;
434 static Param spar, mpar, dpar;	/* easier on the stacks */
/* forward declarations; most of these are defined later in the file */
435 static Readfn	greymaskread, replread, readptr;
436 static Writefn	nullwrite;
437 static Calcfn	alphacalc0, alphacalc14, alphacalc2810, alphacalc3679, alphacalc5, alphacalc11, alphacalcS;
438 static Calcfn	boolcalc14, boolcalc236789, boolcalc1011;
439 
/* selectors that pick a reader, writer or calculator suited to an image */
440 static Readfn*	readfn(Memimage*);
441 static Readfn*	readalphafn(Memimage*);
442 static Writefn*	writefn(Memimage*);
443 
444 static Calcfn*	boolcopyfn(Memimage*, Memimage*);
445 static Readfn*	convfn(Memimage*, Param*, Memimage*, Param*);
446 
/*
 * Calculator for each compositing operator in the general alpha case;
 * alphadraw indexes this table with the draw op.
 */
447 static Calcfn *alphacalc[Ncomp] =
448 {
449 	alphacalc0,		/* Clear */
450 	alphacalc14,		/* DoutS */
451 	alphacalc2810,		/* SoutD */
452 	alphacalc3679,		/* DxorS */
453 	alphacalc14,		/* DinS */
454 	alphacalc5,		/* D */
455 	alphacalc3679,		/* DatopS */
456 	alphacalc3679,		/* DoverS */
457 	alphacalc2810,		/* SinD */
458 	alphacalc3679,		/* SatopD */
459 	alphacalc2810,		/* S */
460 	alphacalc11,		/* SoverD */
461 };
462 
/*
 * Calculator for each compositing operator when the mask is GREY1 and the
 * source has no alpha channel (the condition alphadraw tests before using
 * this table); indexed with the draw op like alphacalc.
 */
463 static Calcfn *boolcalc[Ncomp] =
464 {
465 	alphacalc0,		/* Clear */
466 	boolcalc14,		/* DoutS */
467 	boolcalc236789,		/* SoutD */
468 	boolcalc236789,		/* DxorS */
469 	boolcalc14,		/* DinS */
470 	alphacalc5,		/* D */
471 	boolcalc236789,		/* DatopS */
472 	boolcalc236789,		/* DoverS */
473 	boolcalc236789,		/* SinD */
474 	boolcalc236789,		/* SatopD */
475 	boolcalc1011,		/* S */
476 	boolcalc1011,		/* SoverD */
477 };
478 
479 static int
480 allocdrawbuf(void)
481 {
482 	uchar *p;
483 
484 	if(ndrawbuf > mdrawbuf){
485 		p = realloc(drawbuf, ndrawbuf);
486 		if(p == nil){
487 			werrstr("memimagedraw out of memory");
488 			return -1;
489 		}
490 		drawbuf = p;
491 		mdrawbuf = ndrawbuf;
492 	}
493 	return 0;
494 }
495 
496 static Param
497 getparam(Memimage *img, Rectangle r, int convgrey, int needbuf)
498 {
499 	Param p;
500 	int nbuf;
501 
502 	memset(&p, 0, sizeof p);
503 
504 	p.img = img;
505 	p.r = r;
506 	p.dx = Dx(r);
507 	p.needbuf = needbuf;
508 	p.convgrey = convgrey;
509 
510 	assert(img->r.min.x <= r.min.x && r.min.x < img->r.max.x);
511 
512 	p.bytey0s = byteaddr(img, Pt(img->r.min.x, img->r.min.y));
513 	p.bytermin = byteaddr(img, Pt(r.min.x, img->r.min.y));
514 	p.bytey0e = byteaddr(img, Pt(img->r.max.x, img->r.min.y));
515 	p.bwidth = sizeof(ulong)*img->width;
516 
517 	assert(p.bytey0s <= p.bytermin && p.bytermin <= p.bytey0e);
518 
519 	if(p.r.min.x == p.img->r.min.x)
520 		assert(p.bytermin == p.bytey0s);
521 
522 	nbuf = 1;
523 	if((img->flags&Frepl) && Dy(img->r) <= MAXBCACHE && Dy(img->r) < Dy(r)){
524 		p.replcache = 1;
525 		nbuf = Dy(img->r);
526 	}
527 	p.bufdelta = 4*p.dx;
528 	p.bufoff = ndrawbuf;
529 	ndrawbuf += p.bufdelta*nbuf;
530 
531 	return p;
532 }
533 
534 static void
535 clipy(Memimage *img, int *y)
536 {
537 	int dy;
538 
539 	dy = Dy(img->r);
540 	if(*y == dy)
541 		*y = 0;
542 	else if(*y == -1)
543 		*y = dy-1;
544 	assert(0 <= *y && *y < dy);
545 }
546 
547 static void
548 dumpbuf(char *s, Buffer b, int n)
549 {
550 	int i;
551 	uchar *p;
552 
553 	print("%s", s);
554 	for(i=0; i<n; i++){
555 		print(" ");
556 		if((p=b.grey)){
557 			print(" k%.2uX", *p);
558 			b.grey += b.delta;
559 		}else{
560 			if((p=b.red)){
561 				print(" r%.2uX", *p);
562 				b.red += b.delta;
563 			}
564 			if((p=b.grn)){
565 				print(" g%.2uX", *p);
566 				b.grn += b.delta;
567 			}
568 			if((p=b.blu)){
569 				print(" b%.2uX", *p);
570 				b.blu += b.delta;
571 			}
572 		}
573 		if((p=b.alpha) != &ones){
574 			print(" α%.2uX", *p);
575 			b.alpha += b.delta;
576 		}
577 	}
578 	print("\n");
579 }
580 
581 /*
582  * For each scan line, we expand the pixels from source, mask, and destination
583  * into byte-aligned red, green, blue, alpha, and grey channels.  If buffering is not
584  * needed and the channels were already byte-aligned (grey8, rgb24, rgba32, rgb32),
585  * the readers need not copy the data: they can simply return pointers to the data.
586  * If the destination image is grey and the source is not, it is converted using the NTSC
587  * formula.
588  *
589  * Once we have all the channels, we call either rgbcalc or greycalc, depending on
590  * whether the destination image is color.  This is allowed to overwrite the dst buffer (perhaps
591  * the actual data, perhaps a copy) with its result.  It should only overwrite the dst buffer
592  * with the same format (i.e. red bytes with red bytes, etc.)  A new buffer is returned from
593  * the calculator, and that buffer is passed to a function to write it to the destination.
594  * If the buffer is already pointing at the destination, the writing function is a no-op.
595  */
/* prefix that disables the statement following it; undefined again after alphadraw */
596 #define DBG if(0)
/*
 * The general drawing routine: for every scan line of par->r, read source,
 * mask and destination into Buffers, combine them with the calculator
 * chosen for par->op, and write the result to the destination.
 * Returns 1 when the draw was performed, 0 if the scratch buffer could
 * not be allocated.
 */
597 static int
598 alphadraw(Memdrawparam *par)
599 {
600 	int isgrey, starty, endy, op;
601 	int needbuf, dsty, srcy, masky;
602 	int y, dir, dx, dy;
603 	Buffer bsrc, bdst, bmask;
604 	Readfn *rdsrc, *rdmask, *rddst;
605 	Calcfn *calc;
606 	Writefn *wrdst;
607 	Memimage *src, *mask, *dst;
608 	Rectangle r, sr, mr;
609 
610 	r = par->r;
611 	dx = Dx(r);
612 	dy = Dy(r);
613 
	/* start a fresh accounting of scratch space; each getparam below adds its share */
614 	ndrawbuf = 0;
615 
616 	src = par->src;
617 	mask = par->mask;
618 	dst = par->dst;
619 	sr = par->sr;
620 	mr = par->mr;
621 	op = par->op;
622 
623 	isgrey = dst->flags&Fgrey;
624 
625 	/*
626 	 * Buffering when src and dst are the same bitmap is sufficient but not
627 	 * necessary.  There are stronger conditions we could use.  We could
628 	 * check to see if the rectangles intersect, and if simply moving in the
629 	 * correct y direction can avoid the need to buffer.
630 	 */
631 	needbuf = (src->data == dst->data);
632 
633 	spar = getparam(src, sr, isgrey, needbuf);
634 	dpar = getparam(dst, r, isgrey, needbuf);
635 	mpar = getparam(mask, mr, 0, needbuf);
636 
	/* when src and dst share data and dst starts at a higher address, walk the lines bottom-up */
637 	dir = (needbuf && byteaddr(dst, r.min) > byteaddr(src, sr.min)) ? -1 : 1;
638 	spar.dir = mpar.dir = dpar.dir = dir;
639 
640 	/*
641 	 * If the mask is purely boolean, we can convert from src to dst format
642 	 * when we read src, and then just copy it to dst where the mask tells us to.
643 	 * This requires a boolean (1-bit grey) mask and lack of a source alpha channel.
644 	 *
645 	 * The computation is accomplished by assigning the function pointers as follows:
646 	 *	rdsrc - read and convert source into dst format in a buffer
647 	 * 	rdmask - convert mask to bytes, set pointer to it
648 	 * 	rddst - fill with pointer to real dst data, but do no reads
649 	 *	calc - copy src onto dst when mask says to.
650 	 *	wrdst - do nothing
651 	 * This is slightly sleazy, since things aren't doing exactly what their names say,
652 	 * but it avoids a fair amount of code duplication to make this a case here
653 	 * rather than have a separate booldraw.
654 	 */
655 //if(drawdebug) iprint("flag %lud mchan %lux=?%x dd %d\n", src->flags&Falpha, mask->chan, GREY1, dst->depth);
656 	if(!(src->flags&Falpha) && mask->chan == GREY1 && dst->depth >= 8 && op == SoverD){
657 //if(drawdebug) iprint("boolcopy...");
658 		rdsrc = convfn(dst, &dpar, src, &spar);
659 		rddst = readptr;
660 		rdmask = readfn(mask);
661 		calc = boolcopyfn(dst, mask);
662 		wrdst = nullwrite;
663 	}else{
664 		/* usual alphadraw parameter fetching */
665 		rdsrc = readfn(src);
666 		rddst = readfn(dst);
667 		wrdst = writefn(dst);
668 		calc = alphacalc[op];
669 
670 		/*
671 		 * If there is no alpha channel, we'll ask for a grey channel
672 		 * and pretend it is the alpha.
673 		 */
674 		if(mask->flags&Falpha){
675 			rdmask = readalphafn(mask);
676 			mpar.alphaonly = 1;
677 		}else{
678 			mpar.greymaskcall = readfn(mask);
679 			mpar.convgrey = 1;
680 			rdmask = greymaskread;
681 
682 			/*
683 			 * Should really be above, but then boolcopyfns would have
684 			 * to deal with bit alignment, and I haven't written that.
685 			 *
686 			 * This is a common case for things like ellipse drawing.
687 			 * When there's no alpha involved and the mask is boolean,
688 			 * we can avoid all the division and multiplication.
689 			 */
690 			if(mask->chan == GREY1 && !(src->flags&Falpha))
691 				calc = boolcalc[op];
692 			else if(op == SoverD && !(src->flags&Falpha))
693 				calc = alphacalcS;
694 		}
695 	}
696 
697 	/*
698 	 * If the image has a small enough repl rectangle,
699 	 * we can just read each line once and cache them.
700 	 */
701 	if(spar.replcache){
702 		spar.replcall = rdsrc;
703 		rdsrc = replread;
704 	}
705 	if(mpar.replcache){
706 		mpar.replcall = rdmask;
707 		rdmask = replread;
708 	}
709 
	/* grow drawbuf to the total accumulated in ndrawbuf; give up if that fails */
710 	if(allocdrawbuf() < 0)
711 		return 0;
712 
713 	/*
714 	 * Before we were saving only offsets from drawbuf in the parameter
715 	 * structures; now that drawbuf has been grown to accommodate us,
716 	 * we can fill in the pointers.
717 	 */
718 	spar.bufbase = drawbuf+spar.bufoff;
719 	mpar.bufbase = drawbuf+mpar.bufoff;
720 	dpar.bufbase = drawbuf+dpar.bufoff;
721 	spar.convbuf = drawbuf+spar.convbufoff;
722 
	/* line range in the direction of travel; endy is one step past the last line */
723 	if(dir == 1){
724 		starty = 0;
725 		endy = dy;
726 	}else{
727 		starty = dy-1;
728 		endy = -1;
729 	}
730 
731 	/*
732 	 * srcy, masky, and dsty are offsets from the top of their
733 	 * respective Rectangles.  they need to be contained within
734 	 * the rectangles, so clipy can keep them there without division.
735  	 */
736 	srcy = (starty + sr.min.y - src->r.min.y)%Dy(src->r);
737 	masky = (starty + mr.min.y - mask->r.min.y)%Dy(mask->r);
738 	dsty = starty + r.min.y - dst->r.min.y;
739 
740 	assert(0 <= srcy && srcy < Dy(src->r));
741 	assert(0 <= masky && masky < Dy(mask->r));
742 	assert(0 <= dsty && dsty < Dy(dst->r));
743 
744 	for(y=starty; y!=endy; y+=dir, srcy+=dir, masky+=dir, dsty+=dir){
		/* wrap offsets that stepped just outside their image */
745 		clipy(src, &srcy);
746 		clipy(dst, &dsty);
747 		clipy(mask, &masky);
748 
749 		bsrc = rdsrc(&spar, spar.bufbase, srcy);
750 DBG print("[");
751 		bmask = rdmask(&mpar, mpar.bufbase, masky);
752 DBG print("]\n");
753 		bdst = rddst(&dpar, dpar.bufbase, dsty);
754 DBG		dumpbuf("src", bsrc, dx);
755 DBG		dumpbuf("mask", bmask, dx);
756 DBG		dumpbuf("dst", bdst, dx);
757 		bdst = calc(bdst, bsrc, bmask, dx, isgrey, op);
		/* hand the result to the writer along with the address of line dsty in the destination */
758 		wrdst(&dpar, dpar.bytermin+dsty*dpar.bwidth, bdst);
759 	}
760 
761 	return 1;
762 }
763 #undef DBG
764 
765 static Buffer
766 alphacalc0(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
767 {
768 	USED(grey);
769 	USED(op);
770 	memset(bdst.rgba, 0, dx*bdst.delta);
771 	return bdst;
772 }
773 
/*
 * Calculator for DoutS and DinS: every destination channel is scaled by
 * fd, where fd = sa*ma/255 for DinS and 255 minus that for DoutS
 * (sa and ma being the source and mask alpha of the pixel).
 * The source color channels are not read, only stepped over.
 * Returns the destination buffer as it was passed in.
 */
774 static Buffer
775 alphacalc14(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
776 {
777 	Buffer obdst;
778 	int fd, sadelta;
779 	int i, sa, ma, q;
780 	ulong s, t;
781 
782 	obdst = bdst;
	/* a source without its own alpha points at the shared `ones' byte, which must not be advanced */
783 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
	/* both buffers are 4 bytes per pixel: whole pixels can be processed as 32-bit words */
784 	q = bsrc.delta == 4 && bdst.delta == 4;
785 
786 	for(i=0; i<dx; i++){
787 		sa = *bsrc.alpha;
788 		ma = *bmask.alpha;
789 		fd = MUL(sa, ma, t);
790 		if(op == DoutS)
791 			fd = 255-fd;
792 
793 		if(grey){
794 			*bdst.grey = MUL(fd, *bdst.grey, t);
795 			bsrc.grey += bsrc.delta;
796 			bdst.grey += bdst.delta;
797 		}else{
798 			if(q){
				/* scales all four bytes of the word, so the alpha step below is skipped */
799 				*bdst.rgba = MUL0123(fd, *bdst.rgba, s, t);
800 				bsrc.rgba++;
801 				bdst.rgba++;
802 				bsrc.alpha += sadelta;
803 				bmask.alpha += bmask.delta;
804 				continue;
805 			}
806 			*bdst.red = MUL(fd, *bdst.red, t);
807 			*bdst.grn = MUL(fd, *bdst.grn, t);
808 			*bdst.blu = MUL(fd, *bdst.blu, t);
809 			bsrc.red += bsrc.delta;
810 			bsrc.blu += bsrc.delta;
811 			bsrc.grn += bsrc.delta;
812 			bdst.red += bdst.delta;
813 			bdst.blu += bdst.delta;
814 			bdst.grn += bdst.delta;
815 		}
		/* only write and advance alpha when the destination really has an alpha channel */
816 		if(bdst.alpha != &ones){
817 			*bdst.alpha = MUL(fd, *bdst.alpha, t);
818 			bdst.alpha += bdst.delta;
819 		}
820 		bmask.alpha += bmask.delta;
821 		bsrc.alpha += sadelta;
822 	}
823 	return obdst;
824 }
825 
826 static Buffer
827 alphacalc2810(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
828 {
829 	Buffer obdst;
830 	int fs, sadelta;
831 	int i, ma, da, q;
832 	ulong s, t;
833 
834 	obdst = bdst;
835 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
836 	q = bsrc.delta == 4 && bdst.delta == 4;
837 
838 	for(i=0; i<dx; i++){
839 		ma = *bmask.alpha;
840 		da = *bdst.alpha;
841 		if(op == SoutD)
842 			da = 255-da;
843 		fs = ma;
844 		if(op != S)
845 			fs = MUL(fs, da, t);
846 
847 		if(grey){
848 			*bdst.grey = MUL(fs, *bsrc.grey, t);
849 			bsrc.grey += bsrc.delta;
850 			bdst.grey += bdst.delta;
851 		}else{
852 			if(q){
853 				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t);
854 				bsrc.rgba++;
855 				bdst.rgba++;
856 				bmask.alpha += bmask.delta;
857 				bdst.alpha += bdst.delta;
858 				continue;
859 			}
860 			*bdst.red = MUL(fs, *bsrc.red, t);
861 			*bdst.grn = MUL(fs, *bsrc.grn, t);
862 			*bdst.blu = MUL(fs, *bsrc.blu, t);
863 			bsrc.red += bsrc.delta;
864 			bsrc.blu += bsrc.delta;
865 			bsrc.grn += bsrc.delta;
866 			bdst.red += bdst.delta;
867 			bdst.blu += bdst.delta;
868 			bdst.grn += bdst.delta;
869 		}
870 		if(bdst.alpha != &ones){
871 			*bdst.alpha = MUL(fs, *bsrc.alpha, t);
872 			bdst.alpha += bdst.delta;
873 		}
874 		bmask.alpha += bmask.delta;
875 		bsrc.alpha += sadelta;
876 	}
877 	return obdst;
878 }
879 
880 static Buffer
881 alphacalc3679(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
882 {
883 	Buffer obdst;
884 	int fs, fd, sadelta;
885 	int i, sa, ma, da, q;
886 	ulong s, t, u, v;
887 
888 	obdst = bdst;
889 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
890 	q = bsrc.delta == 4 && bdst.delta == 4;
891 
892 	for(i=0; i<dx; i++){
893 		sa = *bsrc.alpha;
894 		ma = *bmask.alpha;
895 		da = *bdst.alpha;
896 		if(op == SatopD)
897 			fs = MUL(ma, da, t);
898 		else
899 			fs = MUL(ma, 255-da, t);
900 		if(op == DoverS)
901 			fd = 255;
902 		else{
903 			fd = MUL(sa, ma, t);
904 			if(op != DatopS)
905 				fd = 255-fd;
906 		}
907 
908 		if(grey){
909 			*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
910 			bsrc.grey += bsrc.delta;
911 			bdst.grey += bdst.delta;
912 		}else{
913 			if(q){
914 				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);
915 				bsrc.rgba++;
916 				bdst.rgba++;
917 				bsrc.alpha += sadelta;
918 				bmask.alpha += bmask.delta;
919 				bdst.alpha += bdst.delta;
920 				continue;
921 			}
922 			*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);
923 			*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
924 			*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
925 			bsrc.red += bsrc.delta;
926 			bsrc.blu += bsrc.delta;
927 			bsrc.grn += bsrc.delta;
928 			bdst.red += bdst.delta;
929 			bdst.blu += bdst.delta;
930 			bdst.grn += bdst.delta;
931 		}
932 		if(bdst.alpha != &ones){
933 			*bdst.alpha = MUL(fs, sa, s)+MUL(fd, da, t);
934 			bdst.alpha += bdst.delta;
935 		}
936 		bmask.alpha += bmask.delta;
937 		bsrc.alpha += sadelta;
938 	}
939 	return obdst;
940 }
941 
942 static Buffer
943 alphacalc5(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
944 {
945 	USED(dx);
946 	USED(grey);
947 	USED(op);
948 	return bdst;
949 }
950 
/*
 * Calculator for SoverD: the result is ma*src + fd*dst (each product
 * divided by 255) with fd = 255 - sa*ma/255, where sa and ma are the
 * source and mask alpha of the pixel.
 * Returns the destination buffer as it was passed in.
 */
951 static Buffer
952 alphacalc11(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
953 {
954 	Buffer obdst;
955 	int fd, sadelta;
956 	int i, sa, ma, q;
957 	ulong s, t, u, v;
958 
959 	USED(op);
960 	obdst = bdst;
	/* a source without its own alpha points at the shared `ones' byte, which must not be advanced */
961 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
	/* both buffers are 4 bytes per pixel: whole pixels can be processed as 32-bit words */
962 	q = bsrc.delta == 4 && bdst.delta == 4;
963 
964 	for(i=0; i<dx; i++){
965 		sa = *bsrc.alpha;
966 		ma = *bmask.alpha;
967 		fd = 255-MUL(sa, ma, t);
968 
969 		if(grey){
970 			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
971 			bsrc.grey += bsrc.delta;
972 			bdst.grey += bdst.delta;
973 		}else{
974 			if(q){
				/* combines all four bytes of both words, so the alpha step below is skipped */
975 				*bdst.rgba = MUL0123(ma, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);
976 				bsrc.rgba++;
977 				bdst.rgba++;
978 				bsrc.alpha += sadelta;
979 				bmask.alpha += bmask.delta;
980 				continue;
981 			}
982 			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);
983 			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
984 			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
985 			bsrc.red += bsrc.delta;
986 			bsrc.blu += bsrc.delta;
987 			bsrc.grn += bsrc.delta;
988 			bdst.red += bdst.delta;
989 			bdst.blu += bdst.delta;
990 			bdst.grn += bdst.delta;
991 		}
		/* only write and advance alpha when the destination really has an alpha channel */
992 		if(bdst.alpha != &ones){
993 			*bdst.alpha = MUL(ma, sa, s)+MUL(fd, *bdst.alpha, t);
994 			bdst.alpha += bdst.delta;
995 		}
996 		bmask.alpha += bmask.delta;
997 		bsrc.alpha += sadelta;
998 	}
999 	return obdst;
1000 }
1001 
1002 /*
1003 not used yet
1004 source and mask alpha 1
1005 static Buffer
1006 alphacalcS0(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1007 {
1008 	Buffer obdst;
1009 	int i;
1010 
1011 	USED(op);
1012 	obdst = bdst;
1013 	if(bsrc.delta == bdst.delta){
1014 		memmove(bdst.rgba, bsrc.rgba, dx*bdst.delta);
1015 		return obdst;
1016 	}
1017 	for(i=0; i<dx; i++){
1018 		if(grey){
1019 			*bdst.grey = *bsrc.grey;
1020 			bsrc.grey += bsrc.delta;
1021 			bdst.grey += bdst.delta;
1022 		}else{
1023 			*bdst.red = *bsrc.red;
1024 			*bdst.grn = *bsrc.grn;
1025 			*bdst.blu = *bsrc.blu;
1026 			bsrc.red += bsrc.delta;
1027 			bsrc.blu += bsrc.delta;
1028 			bsrc.grn += bsrc.delta;
1029 			bdst.red += bdst.delta;
1030 			bdst.blu += bdst.delta;
1031 			bdst.grn += bdst.delta;
1032 		}
1033 		if(bdst.alpha != &ones){
1034 			*bdst.alpha = 255;
1035 			bdst.alpha += bdst.delta;
1036 		}
1037 	}
1038 	return obdst;
1039 }
1040 */
1041 
1042 /* source alpha 1 */
1043 static Buffer
1044 alphacalcS(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1045 {
1046 	Buffer obdst;
1047 	int fd;
1048 	int i, ma;
1049 	ulong s, t;
1050 
1051 	USED(op);
1052 	obdst = bdst;
1053 
1054 	for(i=0; i<dx; i++){
1055 		ma = *bmask.alpha;
1056 		fd = 255-ma;
1057 
1058 		if(grey){
1059 			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
1060 			bsrc.grey += bsrc.delta;
1061 			bdst.grey += bdst.delta;
1062 		}else{
1063 			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);
1064 			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
1065 			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
1066 			bsrc.red += bsrc.delta;
1067 			bsrc.blu += bsrc.delta;
1068 			bsrc.grn += bsrc.delta;
1069 			bdst.red += bdst.delta;
1070 			bdst.blu += bdst.delta;
1071 			bdst.grn += bdst.delta;
1072 		}
1073 		if(bdst.alpha != &ones){
1074 			*bdst.alpha = ma+MUL(fd, *bdst.alpha, t);
1075 			bdst.alpha += bdst.delta;
1076 		}
1077 		bmask.alpha += bmask.delta;
1078 	}
1079 	return obdst;
1080 }
1081 
1082 static Buffer
1083 boolcalc14(Buffer bdst, Buffer b1, Buffer bmask, int dx, int grey, int op)
1084 {
1085 	Buffer obdst;
1086 	int i, ma, zero;
1087 
1088 	obdst = bdst;
1089 
1090 	for(i=0; i<dx; i++){
1091 		ma = *bmask.alpha;
1092 		zero = ma ? op == DoutS : op == DinS;
1093 
1094 		if(grey){
1095 			if(zero)
1096 				*bdst.grey = 0;
1097 			bdst.grey += bdst.delta;
1098 		}else{
1099 			if(zero)
1100 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1101 			bdst.red += bdst.delta;
1102 			bdst.blu += bdst.delta;
1103 			bdst.grn += bdst.delta;
1104 		}
1105 		bmask.alpha += bmask.delta;
1106 		if(bdst.alpha != &ones){
1107 			if(zero)
1108 				*bdst.alpha = 0;
1109 			bdst.alpha += bdst.delta;
1110 		}
1111 	}
1112 	return obdst;
1113 }
1114 
1115 static Buffer
1116 boolcalc236789(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1117 {
1118 	Buffer obdst;
1119 	int fs, fd;
1120 	int i, ma, da, zero;
1121 	ulong s, t;
1122 
1123 	obdst = bdst;
1124 	zero = !(op&1);
1125 
1126 	for(i=0; i<dx; i++){
1127 		ma = *bmask.alpha;
1128 		da = *bdst.alpha;
1129 		fs = da;
1130 		if(op&2)
1131 			fs = 255-da;
1132 		fd = 0;
1133 		if(op&4)
1134 			fd = 255;
1135 
1136 		if(grey){
1137 			if(ma)
1138 				*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
1139 			else if(zero)
1140 				*bdst.grey = 0;
1141 			bsrc.grey += bsrc.delta;
1142 			bdst.grey += bdst.delta;
1143 		}else{
1144 			if(ma){
1145 				*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);
1146 				*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
1147 				*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
1148 			}
1149 			else if(zero)
1150 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1151 			bsrc.red += bsrc.delta;
1152 			bsrc.blu += bsrc.delta;
1153 			bsrc.grn += bsrc.delta;
1154 			bdst.red += bdst.delta;
1155 			bdst.blu += bdst.delta;
1156 			bdst.grn += bdst.delta;
1157 		}
1158 		bmask.alpha += bmask.delta;
1159 		if(bdst.alpha != &ones){
1160 			if(ma)
1161 				*bdst.alpha = fs+MUL(fd, da, t);
1162 			else if(zero)
1163 				*bdst.alpha = 0;
1164 			bdst.alpha += bdst.delta;
1165 		}
1166 	}
1167 	return obdst;
1168 }
1169 
1170 static Buffer
1171 boolcalc1011(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1172 {
1173 	Buffer obdst;
1174 	int i, ma, zero;
1175 
1176 	obdst = bdst;
1177 	zero = !(op&1);
1178 
1179 	for(i=0; i<dx; i++){
1180 		ma = *bmask.alpha;
1181 
1182 		if(grey){
1183 			if(ma)
1184 				*bdst.grey = *bsrc.grey;
1185 			else if(zero)
1186 				*bdst.grey = 0;
1187 			bsrc.grey += bsrc.delta;
1188 			bdst.grey += bdst.delta;
1189 		}else{
1190 			if(ma){
1191 				*bdst.red = *bsrc.red;
1192 				*bdst.grn = *bsrc.grn;
1193 				*bdst.blu = *bsrc.blu;
1194 			}
1195 			else if(zero)
1196 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1197 			bsrc.red += bsrc.delta;
1198 			bsrc.blu += bsrc.delta;
1199 			bsrc.grn += bsrc.delta;
1200 			bdst.red += bdst.delta;
1201 			bdst.blu += bdst.delta;
1202 			bdst.grn += bdst.delta;
1203 		}
1204 		bmask.alpha += bmask.delta;
1205 		if(bdst.alpha != &ones){
1206 			if(ma)
1207 				*bdst.alpha = 255;
1208 			else if(zero)
1209 				*bdst.alpha = 0;
1210 			bdst.alpha += bdst.delta;
1211 		}
1212 	}
1213 	return obdst;
1214 }
1215 /*
1216  * Replicated cached scan line read.  Call the function listed in the Param,
1217  * but cache the result so that for replicated images we only do the work once.
1218  */
1219 static Buffer
1220 replread(Param *p, uchar *s, int y)
1221 {
1222 	Buffer *b;
1223 
1224 	USED(s);
1225 	b = &p->bcache[y];
1226 	if((p->bfilled & (1<<y)) == 0){
1227 		p->bfilled |= 1<<y;
1228 		*b = p->replcall(p, p->bufbase+y*p->bufdelta, y);
1229 	}
1230 	return *b;
1231 }
1232 
1233 /*
1234  * Alpha reading function that simply relabels the grey pointer.
1235  */
1236 static Buffer
1237 greymaskread(Param *p, uchar *buf, int y)
1238 {
1239 	Buffer b;
1240 
1241 	b = p->greymaskcall(p, buf, y);
1242 	b.alpha = b.grey;
1243 	return b;
1244 }
1245 
#define DBG if(0)
/*
 * Read scan line y of an image whose pixels are packed several to a
 * byte and expand it into buf, one byte per pixel.  Each pixel is
 * taken from the high bits of the current byte and translated through
 * replbit[depth] (presumably the value replicated up to 8 bits --
 * confirm against the table builder).
 *
 * The returned Buffer has grey, red, grn and blu all pointing at buf,
 * alpha pointing at the shared `ones' byte, and a delta of 1.
 *
 * The line is produced in up to three phases so that a request wider
 * than the image (p->dx pixels starting at p->r.min.x) wraps around:
 *	1. from p->r.min.x to the right edge of the image;
 *	2. from the left edge of the image up to p->r.min.x;
 *	3. copies of what is already in buf, until dx pixels exist.
 */
static Buffer
readnbit(Param *p, uchar *buf, int y)
{
	Buffer b;
	Memimage *img;
	uchar *repl, *r, *w, *ow, bits;
	int i, n, sh, depth, x, dx, npack, nbits;

	b.rgba = (ulong*)buf;
	b.grey = w = buf;
	b.red = b.blu = b.grn = w;
	b.alpha = &ones;
	b.delta = 1;

	dx = p->dx;
	img = p->img;
	depth = img->depth;
	repl = &replbit[depth][0];
	npack = 8/depth;	/* pixels per byte */
	sh = 8-depth;		/* shift that brings the top pixel of a byte down to bit 0 */

	/* copy from p->r.min.x until end of repl rectangle */
	x = p->r.min.x;
	n = dx;
	if(n > p->img->r.max.x - x)
		n = p->img->r.max.x - x;

	r = p->bytermin + y*p->bwidth;
DBG print("readnbit dx %d %p=%p+%d*%d, *r=%d fetch %d ", dx, r, p->bytermin, y, p->bwidth, *r, n);
	bits = *r++;
	nbits = 8;
	/* x is not on a byte boundary: discard the pixels to its left */
	if((i=x&(npack-1))){
DBG print("throwaway %d...", i);
		bits <<= depth*i;
		nbits -= depth*i;
	}
	for(i=0; i<n; i++){
		/* current byte exhausted: fetch the next one */
		if(nbits == 0){
DBG print("(%.2ux)...", *r);
			bits = *r++;
			nbits = 8;
		}
		*w++ = repl[bits>>sh];
DBG print("bit %x...", repl[bits>>sh]);
		bits <<= depth;
		nbits -= depth;
	}
	dx -= n;
	if(dx == 0)
		return b;

	assert(x+i == p->img->r.max.x);

	/* copy from beginning of repl rectangle until where we were before. */
	x = p->img->r.min.x;
	n = dx;
	if(n > p->r.min.x - x)
		n = p->r.min.x - x;

	r = p->bytey0s + y*p->bwidth;
DBG print("x=%d r=%p...", x, r);
	bits = *r++;
	nbits = 8;
	/* same left-edge alignment as above, now for the image's own left edge */
	if((i=x&(npack-1))){
		bits <<= depth*i;
		nbits -= depth*i;
	}
DBG print("nbits=%d...", nbits);
	for(i=0; i<n; i++){
		if(nbits == 0){
			bits = *r++;
			nbits = 8;
		}
		*w++ = repl[bits>>sh];
DBG print("bit %x...", repl[bits>>sh]);
		bits <<= depth;
		nbits -= depth;
DBG print("bits %x nbits %d...", bits, nbits);
	}
	dx -= n;
	if(dx == 0)
		return b;

	assert(dx > 0);
	/* now we have exactly one full scan line: just replicate the buffer itself until we are done */
	ow = buf;
	/* forward byte copy; the read pointer trails the write pointer through the same buffer */
	while(dx--)
		*w++ = *ow++;

	return b;
}
#undef DBG
1339 
1340 #define DBG if(0)
1341 static void
1342 writenbit(Param *p, uchar *w, Buffer src)
1343 {
1344 	uchar *r;
1345 	ulong bits;
1346 	int i, sh, depth, npack, nbits, x, ex;
1347 
1348 	assert(src.grey != nil && src.delta == 1);
1349 
1350 	x = p->r.min.x;
1351 	ex = x+p->dx;
1352 	depth = p->img->depth;
1353 	npack = 8/depth;
1354 
1355 	i=x&(npack-1);
1356 	bits = i ? (*w >> (8-depth*i)) : 0;
1357 	nbits = depth*i;
1358 	sh = 8-depth;
1359 	r = src.grey;
1360 
1361 	for(; x<ex; x++){
1362 		bits <<= depth;
1363 DBG print(" %x", *r);
1364 		bits |= (*r++ >> sh);
1365 		nbits += depth;
1366 		if(nbits == 8){
1367 			*w++ = bits;
1368 			nbits = 0;
1369 		}
1370 	}
1371 
1372 	if(nbits){
1373 		sh = 8-nbits;
1374 		bits <<= sh;
1375 		bits |= *w & ((1<<sh)-1);
1376 		*w = bits;
1377 	}
1378 DBG print("\n");
1379 	return;
1380 }
1381 #undef DBG
1382 
1383 static Buffer
1384 readcmap(Param *p, uchar *buf, int y)
1385 {
1386 	Buffer b;
1387 	int a, convgrey, copyalpha, dx, i, m;
1388 	uchar *q, *cmap, *begin, *end, *r, *w;
1389 
1390 	begin = p->bytey0s + y*p->bwidth;
1391 	r = p->bytermin + y*p->bwidth;
1392 	end = p->bytey0e + y*p->bwidth;
1393 	cmap = p->img->cmap->cmap2rgb;
1394 	convgrey = p->convgrey;
1395 	copyalpha = (p->img->flags&Falpha) ? 1 : 0;
1396 
1397 	w = buf;
1398 	dx = p->dx;
1399 	if(copyalpha){
1400 		b.alpha = buf++;
1401 		a = p->img->shift[CAlpha]/8;
1402 		m = p->img->shift[CMap]/8;
1403 		for(i=0; i<dx; i++){
1404 			*w++ = r[a];
1405 			q = cmap+r[m]*3;
1406 			r += 2;
1407 			if(r == end)
1408 				r = begin;
1409 			if(convgrey){
1410 				*w++ = RGB2K(q[0], q[1], q[2]);
1411 			}else{
1412 				*w++ = q[2];	/* blue */
1413 				*w++ = q[1];	/* green */
1414 				*w++ = q[0];	/* red */
1415 			}
1416 		}
1417 	}else{
1418 		b.alpha = &ones;
1419 		for(i=0; i<dx; i++){
1420 			q = cmap+*r++*3;
1421 			if(r == end)
1422 				r = begin;
1423 			if(convgrey){
1424 				*w++ = RGB2K(q[0], q[1], q[2]);
1425 			}else{
1426 				*w++ = q[2];	/* blue */
1427 				*w++ = q[1];	/* green */
1428 				*w++ = q[0];	/* red */
1429 			}
1430 		}
1431 	}
1432 
1433 	b.rgba = (ulong*)(buf-copyalpha);
1434 
1435 	if(convgrey){
1436 		b.grey = buf;
1437 		b.red = b.blu = b.grn = buf;
1438 		b.delta = 1+copyalpha;
1439 	}else{
1440 		b.blu = buf;
1441 		b.grn = buf+1;
1442 		b.red = buf+2;
1443 		b.grey = nil;
1444 		b.delta = 3+copyalpha;
1445 	}
1446 	return b;
1447 }
1448 
1449 static void
1450 writecmap(Param *p, uchar *w, Buffer src)
1451 {
1452 	uchar *cmap, *red, *grn, *blu;
1453 	int i, dx, delta;
1454 
1455 	cmap = p->img->cmap->rgb2cmap;
1456 
1457 	delta = src.delta;
1458 	red= src.red;
1459 	grn = src.grn;
1460 	blu = src.blu;
1461 
1462 	dx = p->dx;
1463 	for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta)
1464 		*w++ = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1465 }
1466 
1467 #define DBG if(0)
1468 static Buffer
1469 readbyte(Param *p, uchar *buf, int y)
1470 {
1471 	Buffer b;
1472 	Memimage *img;
1473 	int dx, isgrey, convgrey, alphaonly, copyalpha, i, nb;
1474 	uchar *begin, *end, *r, *w, *rrepl, *grepl, *brepl, *arepl, *krepl;
1475 	uchar ured, ugrn, ublu;
1476 	ulong u;
1477 
1478 	img = p->img;
1479 	begin = p->bytey0s + y*p->bwidth;
1480 	r = p->bytermin + y*p->bwidth;
1481 	end = p->bytey0e + y*p->bwidth;
1482 
1483 	w = buf;
1484 	dx = p->dx;
1485 	nb = img->depth/8;
1486 
1487 	convgrey = p->convgrey;	/* convert rgb to grey */
1488 	isgrey = img->flags&Fgrey;
1489 	alphaonly = p->alphaonly;
1490 	copyalpha = (img->flags&Falpha) ? 1 : 0;
1491 
1492 DBG print("copyalpha %d alphaonly %d convgrey %d isgrey %d\n", copyalpha, alphaonly, convgrey, isgrey);
1493 	/* if we can, avoid processing everything */
1494 	if(!(img->flags&Frepl) && !convgrey && (img->flags&Fbytes)){
1495 		memset(&b, 0, sizeof b);
1496 		if(p->needbuf){
1497 			memmove(buf, r, dx*nb);
1498 			r = buf;
1499 		}
1500 		b.rgba = (ulong*)r;
1501 		if(copyalpha)
1502 			b.alpha = r+img->shift[CAlpha]/8;
1503 		else
1504 			b.alpha = &ones;
1505 		if(isgrey){
1506 			b.grey = r+img->shift[CGrey]/8;
1507 			b.red = b.grn = b.blu = b.grey;
1508 		}else{
1509 			b.red = r+img->shift[CRed]/8;
1510 			b.grn = r+img->shift[CGreen]/8;
1511 			b.blu = r+img->shift[CBlue]/8;
1512 		}
1513 		b.delta = nb;
1514 		return b;
1515 	}
1516 
1517 DBG print("2\n");
1518 	rrepl = replbit[img->nbits[CRed]];
1519 	grepl = replbit[img->nbits[CGreen]];
1520 	brepl = replbit[img->nbits[CBlue]];
1521 	arepl = replbit[img->nbits[CAlpha]];
1522 	krepl = replbit[img->nbits[CGrey]];
1523 
1524 	for(i=0; i<dx; i++){
1525 		u = r[0] | (r[1]<<8) | (r[2]<<16) | (r[3]<<24);
1526 		if(copyalpha) {
1527 			*w++ = arepl[(u>>img->shift[CAlpha]) & img->mask[CAlpha]];
1528 DBG print("a %x\n", w[-1]);
1529 		}
1530 
1531 		if(isgrey)
1532 			*w++ = krepl[(u >> img->shift[CGrey]) & img->mask[CGrey]];
1533 		else if(!alphaonly){
1534 			ured = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1535 			ugrn = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1536 			ublu = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1537 			if(convgrey){
1538 DBG print("g %x %x %x\n", ured, ugrn, ublu);
1539 				*w++ = RGB2K(ured, ugrn, ublu);
1540 DBG print("%x\n", w[-1]);
1541 			}else{
1542 				*w++ = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1543 				*w++ = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1544 				*w++ = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1545 			}
1546 		}
1547 		r += nb;
1548 		if(r == end)
1549 			r = begin;
1550 	}
1551 
1552 	b.alpha = copyalpha ? buf : &ones;
1553 	b.rgba = (ulong*)buf;
1554 	if(alphaonly){
1555 		b.red = b.grn = b.blu = b.grey = nil;
1556 		if(!copyalpha)
1557 			b.rgba = nil;
1558 		b.delta = 1;
1559 	}else if(isgrey || convgrey){
1560 		b.grey = buf+copyalpha;
1561 		b.red = b.grn = b.blu = buf+copyalpha;
1562 		b.delta = copyalpha+1;
1563 DBG print("alpha %x grey %x\n", b.alpha ? *b.alpha : 0xFF, *b.grey);
1564 	}else{
1565 		b.blu = buf+copyalpha;
1566 		b.grn = buf+copyalpha+1;
1567 		b.grey = nil;
1568 		b.red = buf+copyalpha+2;
1569 		b.delta = copyalpha+3;
1570 	}
1571 	return b;
1572 }
1573 #undef DBG
1574 
1575 #define DBG if(0)
1576 static void
1577 writebyte(Param *p, uchar *w, Buffer src)
1578 {
1579 	Memimage *img;
1580 	int i, isalpha, isgrey, nb, delta, dx, adelta;
1581 	uchar ff, *red, *grn, *blu, *grey, *alpha;
1582 	ulong u, mask;
1583 
1584 	img = p->img;
1585 
1586 	red = src.red;
1587 	grn = src.grn;
1588 	blu = src.blu;
1589 	alpha = src.alpha;
1590 	delta = src.delta;
1591 	grey = src.grey;
1592 	dx = p->dx;
1593 
1594 	nb = img->depth/8;
1595 	mask = (nb==4) ? 0 : ~((1<<img->depth)-1);
1596 
1597 	isalpha = img->flags&Falpha;
1598 	isgrey = img->flags&Fgrey;
1599 	adelta = src.delta;
1600 
1601 	if(isalpha && (alpha == nil || alpha == &ones)){
1602 		ff = 0xFF;
1603 		alpha = &ff;
1604 		adelta = 0;
1605 	}
1606 
1607 	for(i=0; i<dx; i++){
1608 		u = w[0] | (w[1]<<8) | (w[2]<<16) | (w[3]<<24);
1609 DBG print("u %.8lux...", u);
1610 		u &= mask;
1611 DBG print("&mask %.8lux...", u);
1612 		if(isgrey){
1613 			u |= ((*grey >> (8-img->nbits[CGrey])) & img->mask[CGrey]) << img->shift[CGrey];
1614 DBG print("|grey %.8lux...", u);
1615 			grey += delta;
1616 		}else{
1617 			u |= ((*red >> (8-img->nbits[CRed])) & img->mask[CRed]) << img->shift[CRed];
1618 			u |= ((*grn >> (8-img->nbits[CGreen])) & img->mask[CGreen]) << img->shift[CGreen];
1619 			u |= ((*blu >> (8-img->nbits[CBlue])) & img->mask[CBlue]) << img->shift[CBlue];
1620 			red += delta;
1621 			grn += delta;
1622 			blu += delta;
1623 DBG print("|rgb %.8lux...", u);
1624 		}
1625 
1626 		if(isalpha){
1627 			u |= ((*alpha >> (8-img->nbits[CAlpha])) & img->mask[CAlpha]) << img->shift[CAlpha];
1628 			alpha += adelta;
1629 DBG print("|alpha %.8lux...", u);
1630 		}
1631 
1632 		w[0] = u;
1633 		w[1] = u>>8;
1634 		w[2] = u>>16;
1635 		w[3] = u>>24;
1636 		w += nb;
1637 	}
1638 }
1639 #undef DBG
1640 
1641 static Readfn*
1642 readfn(Memimage *img)
1643 {
1644 	if(img->depth < 8)
1645 		return readnbit;
1646 	if(img->nbits[CMap] == 8)
1647 		return readcmap;
1648 	return readbyte;
1649 }
1650 
1651 static Readfn*
1652 readalphafn(Memimage *m)
1653 {
1654 	USED(m);
1655 	return readbyte;
1656 }
1657 
1658 static Writefn*
1659 writefn(Memimage *img)
1660 {
1661 	if(img->depth < 8)
1662 		return writenbit;
1663 	if(img->chan == CMAP8)
1664 		return writecmap;
1665 	return writebyte;
1666 }
1667 
1668 static void
1669 nullwrite(Param *p, uchar *s, Buffer b)
1670 {
1671 	USED(p);
1672 	USED(s);
1673 }
1674 
1675 static Buffer
1676 readptr(Param *p, uchar *s, int y)
1677 {
1678 	Buffer b;
1679 	uchar *q;
1680 
1681 	USED(s);
1682 	q = p->bytermin + y*p->bwidth;
1683 	b.red = q;	/* ptr to data */
1684 	b.grn = b.blu = b.grey = b.alpha = nil;
1685 	b.rgba = (ulong*)q;
1686 	b.delta = p->img->depth/8;
1687 	return b;
1688 }
1689 
1690 static Buffer
1691 boolmemmove(Buffer bdst, Buffer bsrc, Buffer b1, int dx, int i, int o)
1692 {
1693 	USED(i);
1694 	USED(o);
1695 	memmove(bdst.red, bsrc.red, dx*bdst.delta);
1696 	return bdst;
1697 }
1698 
1699 static Buffer
1700 boolcopy8(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1701 {
1702 	uchar *m, *r, *w, *ew;
1703 
1704 	USED(i);
1705 	USED(o);
1706 	m = bmask.grey;
1707 	w = bdst.red;
1708 	r = bsrc.red;
1709 	ew = w+dx;
1710 	for(; w < ew; w++,r++)
1711 		if(*m++)
1712 			*w = *r;
1713 	return bdst;	/* not used */
1714 }
1715 
1716 static Buffer
1717 boolcopy16(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1718 {
1719 	uchar *m;
1720 	ushort *r, *w, *ew;
1721 
1722 	USED(i);
1723 	USED(o);
1724 	m = bmask.grey;
1725 	w = (ushort*)bdst.red;
1726 	r = (ushort*)bsrc.red;
1727 	ew = w+dx;
1728 	for(; w < ew; w++,r++)
1729 		if(*m++)
1730 			*w = *r;
1731 	return bdst;	/* not used */
1732 }
1733 
1734 static Buffer
1735 boolcopy24(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1736 {
1737 	uchar *m;
1738 	uchar *r, *w, *ew;
1739 
1740 	USED(i);
1741 	USED(o);
1742 	m = bmask.grey;
1743 	w = bdst.red;
1744 	r = bsrc.red;
1745 	ew = w+dx*3;
1746 	while(w < ew){
1747 		if(*m++){
1748 			*w++ = *r++;
1749 			*w++ = *r++;
1750 			*w++ = *r++;
1751 		}else{
1752 			w += 3;
1753 			r += 3;
1754 		}
1755 	}
1756 	return bdst;	/* not used */
1757 }
1758 
1759 static Buffer
1760 boolcopy32(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1761 {
1762 	uchar *m;
1763 	ulong *r, *w, *ew;
1764 
1765 	USED(i);
1766 	USED(o);
1767 	m = bmask.grey;
1768 	w = (ulong*)bdst.red;
1769 	r = (ulong*)bsrc.red;
1770 	ew = w+dx;
1771 	for(; w < ew; w++,r++)
1772 		if(*m++)
1773 			*w = *r;
1774 	return bdst;	/* not used */
1775 }
1776 
1777 static Buffer
1778 genconv(Param *p, uchar *buf, int y)
1779 {
1780 	Buffer b;
1781 	int nb;
1782 	uchar *r, *w, *ew;
1783 
1784 	/* read from source into RGB format in convbuf */
1785 	b = p->convreadcall(p, p->convbuf, y);
1786 
1787 	/* write RGB format into dst format in buf */
1788 	p->convwritecall(p->convdpar, buf, b);
1789 
1790 	if(p->convdx){
1791 		nb = p->convdpar->img->depth/8;
1792 		r = buf;
1793 		w = buf+nb*p->dx;
1794 		ew = buf+nb*p->convdx;
1795 		while(w<ew)
1796 			*w++ = *r++;
1797 	}
1798 
1799 	b.red = buf;
1800 	b.blu = b.grn = b.grey = b.alpha = nil;
1801 	b.rgba = (ulong*)buf;
1802 	b.delta = 0;
1803 
1804 	return b;
1805 }
1806 
1807 static Readfn*
1808 convfn(Memimage *dst, Param *dpar, Memimage *src, Param *spar)
1809 {
1810 	if(dst->chan == src->chan && !(src->flags&Frepl)){
1811 //if(drawdebug) iprint("readptr...");
1812 		return readptr;
1813 	}
1814 
1815 	if(dst->chan==CMAP8 && (src->chan==GREY1||src->chan==GREY2||src->chan==GREY4)){
1816 		/* cheat because we know the replicated value is exactly the color map entry. */
1817 //if(drawdebug) iprint("Readnbit...");
1818 		return readnbit;
1819 	}
1820 
1821 	spar->convreadcall = readfn(src);
1822 	spar->convwritecall = writefn(dst);
1823 	spar->convdpar = dpar;
1824 
1825 	/* allocate a conversion buffer */
1826 	spar->convbufoff = ndrawbuf;
1827 	ndrawbuf += spar->dx*4;
1828 
1829 	if(spar->dx > Dx(spar->img->r)){
1830 		spar->convdx = spar->dx;
1831 		spar->dx = Dx(spar->img->r);
1832 	}
1833 
1834 //if(drawdebug) iprint("genconv...");
1835 	return genconv;
1836 }
1837 
1838 ulong
1839 _pixelbits(Memimage *i, Point pt)
1840 {
1841 	uchar *p;
1842 	ulong val;
1843 	int off, bpp, npack;
1844 
1845 	val = 0;
1846 	p = byteaddr(i, pt);
1847 	switch(bpp=i->depth){
1848 	case 1:
1849 	case 2:
1850 	case 4:
1851 		npack = 8/bpp;
1852 		off = pt.x%npack;
1853 		val = p[0] >> bpp*(npack-1-off);
1854 		val &= (1<<bpp)-1;
1855 		break;
1856 	case 8:
1857 		val = p[0];
1858 		break;
1859 	case 16:
1860 		val = p[0]|(p[1]<<8);
1861 		break;
1862 	case 24:
1863 		val = p[0]|(p[1]<<8)|(p[2]<<16);
1864 		break;
1865 	case 32:
1866 		val = p[0]|(p[1]<<8)|(p[2]<<16)|(p[3]<<24);
1867 		break;
1868 	}
1869 	while(bpp<32){
1870 		val |= val<<bpp;
1871 		bpp *= 2;
1872 	}
1873 	return val;
1874 }
1875 
1876 static Calcfn*
1877 boolcopyfn(Memimage *img, Memimage *mask)
1878 {
1879 	if(mask->flags&Frepl && Dx(mask->r)==1 && Dy(mask->r)==1 && pixelbits(mask, mask->r.min)==~0)
1880 		return boolmemmove;
1881 
1882 	switch(img->depth){
1883 	case 8:
1884 		return boolcopy8;
1885 	case 16:
1886 		return boolcopy16;
1887 	case 24:
1888 		return boolcopy24;
1889 	case 32:
1890 		return boolcopy32;
1891 	default:
1892 		assert(0 /* boolcopyfn */);
1893 	}
1894 	return nil;
1895 }
1896 
1897 /*
1898  * Optimized draw for filling and scrolling; uses memset and memmove.
1899  *
1900 static void
1901 memsetb(void *vp, uchar val, int n)
1902 {
1903 	uchar *p, *ep;
1904 
1905 	p = vp;
1906 	ep = p+n;
1907 	while(p<ep)
1908 		*p++ = val;
1909 }
1910 */
1911 
/*
 * Store val into each of the n ushort-sized slots starting at vp.
 * Nothing is written when n is zero or negative.
 */
static void
memsets(void *vp, ushort val, int n)
{
	ushort *dst;
	int k;

	dst = vp;
	for(k=0; k<n; k++)
		dst[k] = val;
}
1922 
/*
 * Store val into each of the n ulong-sized slots starting at vp.
 * Nothing is written when n is zero or negative.
 */
static void
memsetl(void *vp, ulong val, int n)
{
	ulong *dst;
	int k;

	dst = vp;
	for(k=0; k<n; k++)
		dst[k] = val;
}
1933 
/*
 * Fill n three-byte pixels starting at vp with the low 24 bits of
 * val, least significant byte first.  Nothing is written when n is
 * zero or negative.
 */
static void
memset24(void *vp, ulong val, int n)
{
	uchar *dst;
	uchar lo, mid, hi;
	int k;

	lo = val;
	mid = val>>8;
	hi = val>>16;

	dst = vp;
	for(k=0; k<n; k++){
		dst[0] = lo;
		dst[1] = mid;
		dst[2] = hi;
		dst += 3;
	}
}
1951 
1952 ulong
1953 _imgtorgba(Memimage *img, ulong val)
1954 {
1955 	uchar r, g, b, a;
1956 	int nb, ov, v;
1957 	ulong chan;
1958 	uchar *p;
1959 
1960 	a = 0xFF;
1961 	r = g = b = 0xAA;	/* garbage */
1962 	for(chan=img->chan; chan; chan>>=8){
1963 		nb = NBITS(chan);
1964 		ov = v = val&((1<<nb)-1);
1965 		val >>= nb;
1966 
1967 		while(nb < 8){
1968 			v |= v<<nb;
1969 			nb *= 2;
1970 		}
1971 		v >>= (nb-8);
1972 
1973 		switch(TYPE(chan)){
1974 		case CRed:
1975 			r = v;
1976 			break;
1977 		case CGreen:
1978 			g = v;
1979 			break;
1980 		case CBlue:
1981 			b = v;
1982 			break;
1983 		case CAlpha:
1984 			a = v;
1985 			break;
1986 		case CGrey:
1987 			r = g = b = v;
1988 			break;
1989 		case CMap:
1990 			p = img->cmap->cmap2rgb+3*ov;
1991 			r = *p++;
1992 			g = *p++;
1993 			b = *p;
1994 			break;
1995 		}
1996 	}
1997 	return (r<<24)|(g<<16)|(b<<8)|a;
1998 }
1999 
2000 ulong
2001 _rgbatoimg(Memimage *img, ulong rgba)
2002 {
2003 	ulong chan;
2004 	int d, nb;
2005 	ulong v;
2006 	uchar *p, r, g, b, a, m;
2007 
2008 	v = 0;
2009 	r = rgba>>24;
2010 	g = rgba>>16;
2011 	b = rgba>>8;
2012 	a = rgba;
2013 	d = 0;
2014 	for(chan=img->chan; chan; chan>>=8){
2015 		nb = NBITS(chan);
2016 		switch(TYPE(chan)){
2017 		case CRed:
2018 			v |= (r>>(8-nb))<<d;
2019 			break;
2020 		case CGreen:
2021 			v |= (g>>(8-nb))<<d;
2022 			break;
2023 		case CBlue:
2024 			v |= (b>>(8-nb))<<d;
2025 			break;
2026 		case CAlpha:
2027 			v |= (a>>(8-nb))<<d;
2028 			break;
2029 		case CMap:
2030 			p = img->cmap->rgb2cmap;
2031 			m = p[(r>>4)*256+(g>>4)*16+(b>>4)];
2032 			v |= (m>>(8-nb))<<d;
2033 			break;
2034 		case CGrey:
2035 			m = RGB2K(r,g,b);
2036 			v |= (m>>(8-nb))<<d;
2037 			break;
2038 		}
2039 		d += nb;
2040 	}
2041 //	print("rgba2img %.8lux = %.*lux\n", rgba, 2*d/8, v);
2042 	return v;
2043 }
2044 
#define DBG if(0)
/*
 * Try to carry out the draw described by par with one of three
 * special-case fast paths.  Returns 1 if the draw was done here and
 * 0 if none of the cases applied.
 *
 *  1. Solid fill: simple source, simple full mask, opaque source
 *     colour, op S or SoverD.  par->sdval is stored throughout the
 *     destination rectangle (presumably it is the source colour
 *     already converted to the destination format -- confirm where
 *     it is set).
 *  2. Straight copy: simple full mask, source not replicated, source
 *     depth at least 8, identical channel descriptors, no source
 *     alpha, op S or SoverD.  Rows are moved with memmove.
 *  3. Bitwise blend: source, destination and mask are all GREY1,
 *     none of the simple/replicated state bits are set, and all
 *     three rectangles start at the same bit position within a byte.
 *
 * All row strides are computed as width*sizeof(ulong) bytes.
 */
static int
memoptdraw(Memdrawparam *par)
{
	int m, y, dy, dx, op;
	ulong v;
	Memimage *src;
	Memimage *dst;

	dx = Dx(par->r);
	dy = Dy(par->r);
	src = par->src;
	dst = par->dst;
	op = par->op;

DBG print("state %lux mval %lux dd %d\n", par->state, par->mval, dst->depth);
	/*
	 * If we have an opaque mask and source is one opaque pixel we can convert to the
	 * destination format and just replicate with memset.
	 */
	m = Simplesrc|Simplemask|Fullmask;
	if((par->state&m)==m && (par->srgba&0xFF) == 0xFF && (op ==S || op == SoverD)){
		uchar *dp, p[4];
		int d, dwid, ppb, np, nb;
		uchar lm, rm;

DBG print("memopt, dst %p, dst->data->bdata %p\n", dst, dst->data->bdata);
		dwid = dst->width*sizeof(ulong);
		dp = byteaddr(dst, par->r.min);
		v = par->sdval;
DBG print("sdval %lud, depth %d\n", v, dst->depth);
		switch(dst->depth){
		case 1:
		case 2:
		case 4:
			/* repeat the pixel value until it fills a whole byte */
			for(d=dst->depth; d<8; d*=2)
				v |= (v<<d);
			ppb = 8/dst->depth;	/* pixels per byte */
			m = ppb-1;
			/* left edge */
			np = par->r.min.x&m;		/* no. pixels unused on left side of word */
			dx -= (ppb-np);
			nb = 8 - np * dst->depth;		/* no. bits used on right side of word */
			lm = (1<<nb)-1;
DBG print("np %d x %d nb %d lm %ux ppb %d m %ux\n", np, par->r.min.x, nb, lm, ppb, m);

			/* right edge */
			np = par->r.max.x&m;	/* no. pixels used on left side of word */
			dx -= np;
			nb = 8 - np * dst->depth;		/* no. bits unused on right side of word */
			rm = ~((1<<nb)-1);
DBG print("np %d x %d nb %d rm %ux ppb %d m %ux\n", np, par->r.max.x, nb, rm, ppb, m);

DBG print("dx %d Dx %d\n", dx, Dx(par->r));
			/* lm, rm are masks that are 1 where we should touch the bits */
			/*
			 * dx now counts the pixels lying in bytes strictly between
			 * the left-edge byte and the right-edge byte.
			 * x ^= (v ^ x) & mask replaces exactly the masked bits of x
			 * with those of v.
			 */
			if(dx < 0){	/* just one byte */
				lm &= rm;
				for(y=0; y<dy; y++, dp+=dwid)
					*dp ^= (v ^ *dp) & lm;
			}else if(dx == 0){	/* no full bytes */
				/* dp is advanced inside the loop; shorten the stride to match */
				if(lm)
					dwid--;

				for(y=0; y<dy; y++, dp+=dwid){
					if(lm){
DBG print("dp %p v %lux lm %ux (v ^ *dp) & lm %lux\n", dp, v, lm, (v^*dp)&lm);
						*dp ^= (v ^ *dp) & lm;
						dp++;
					}
					*dp ^= (v ^ *dp) & rm;
				}
			}else{		/* full bytes in middle */
				dx /= ppb;
				/* the stride shrinks by every byte dp advances within a row */
				if(lm)
					dwid--;
				dwid -= dx;

				for(y=0; y<dy; y++, dp+=dwid){
					if(lm){
						*dp ^= (v ^ *dp) & lm;
						dp++;
					}
					memset(dp, v, dx);
					dp += dx;
					*dp ^= (v ^ *dp) & rm;
				}
			}
			return 1;
		case 8:
			for(y=0; y<dy; y++, dp+=dwid)
				memset(dp, v, dx);
			return 1;
		case 16:
			p[0] = v;		/* make little endian */
			p[1] = v>>8;
			v = *(ushort*)p;
DBG print("dp=%p; dx=%d; for(y=0; y<%d; y++, dp+=%d)\nmemsets(dp, v, dx);\n",
	dp, dx, dy, dwid);
			for(y=0; y<dy; y++, dp+=dwid)
				memsets(dp, v, dx);
			return 1;
		case 24:
			for(y=0; y<dy; y++, dp+=dwid)
				memset24(dp, v, dx);
			return 1;
		case 32:
			p[0] = v;		/* make little endian */
			p[1] = v>>8;
			p[2] = v>>16;
			p[3] = v>>24;
			v = *(ulong*)p;
			for(y=0; y<dy; y++, dp+=dwid)
				memsetl(dp, v, dx);
			return 1;
		default:
			assert(0 /* bad dest depth in memoptdraw */);
		}
	}

	/*
	 * If no source alpha, an opaque mask, we can just copy the
	 * source onto the destination.  If the channels are the same and
	 * the source is not replicated, memmove suffices.
	 */
	m = Simplemask|Fullmask;
	if((par->state&(m|Replsrc))==m && src->depth >= 8
	&& src->chan == dst->chan && !(src->flags&Falpha) && (op == S || op == SoverD)){
		uchar *sp, *dp;
		long swid, dwid, nb;
		int dir;

		/*
		 * Source and destination share storage and the destination
		 * starts later in memory: copy the rows bottom-up.
		 */
		if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min))
			dir = -1;
		else
			dir = 1;

		swid = src->width*sizeof(ulong);
		dwid = dst->width*sizeof(ulong);
		sp = byteaddr(src, par->sr.min);
		dp = byteaddr(dst, par->r.min);
		if(dir == -1){
			/* start at the last row and step backwards */
			sp += (dy-1)*swid;
			dp += (dy-1)*dwid;
			swid = -swid;
			dwid = -dwid;
		}
		nb = (dx*src->depth)/8;	/* bytes per row */
		for(y=0; y<dy; y++, sp+=swid, dp+=dwid)
			memmove(dp, sp, nb);
		return 1;
	}

	/*
	 * If we have a 1-bit mask, 1-bit source, and 1-bit destination, and
	 * they're all bit aligned, we can just use bit operators.  This happens
	 * when we're manipulating boolean masks, e.g. in the arc code.
	 */
	if((par->state&(Simplemask|Simplesrc|Replmask|Replsrc))==0
	&& dst->chan==GREY1 && src->chan==GREY1 && par->mask->chan==GREY1
	&& (par->r.min.x&7)==(par->sr.min.x&7) && (par->r.min.x&7)==(par->mr.min.x&7)){
		uchar *sp, *dp, *mp;
		uchar lm, rm;
		long swid, dwid, mwid;
		int i, x, dir;

		sp = byteaddr(src, par->sr.min);
		dp = byteaddr(dst, par->r.min);
		mp = byteaddr(par->mask, par->mr.min);
		swid = src->width*sizeof(ulong);
		dwid = dst->width*sizeof(ulong);
		mwid = par->mask->width*sizeof(ulong);

		/* same overlap test as the copy case above */
		if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min)){
			dir = -1;
		}else
			dir = 1;

		/* lm and rm select the bits to touch in the first and last byte of a row */
		lm = 0xFF>>(par->r.min.x&7);
		rm = 0xFF<<(8-(par->r.max.x&7));
		dx -= (8-(par->r.min.x&7)) + (par->r.max.x&7);

		if(dx < 0){	/* one byte wide */
			lm &= rm;
			if(dir == -1){
				dp += dwid*(dy-1);
				sp += swid*(dy-1);
				mp += mwid*(dy-1);
				dwid = -dwid;
				swid = -swid;
				mwid = -mwid;
			}
			for(y=0; y<dy; y++){
				/* take the source bit wherever the mask bit (within lm) is set */
				*dp ^= (*dp ^ *sp) & *mp & lm;
				dp += dwid;
				sp += swid;
				mp += mwid;
			}
			return 1;
		}

		dx /= 8;	/* whole bytes between the two edge bytes */
		if(dir == 1){
			/* i bytes are stepped over per row; strides shrink to compensate */
			i = (lm!=0)+dx+(rm!=0);
			mwid -= i;
			swid -= i;
			dwid -= i;
			for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
				if(lm){
					*dp ^= (*dp ^ *sp++) & *mp++ & lm;
					dp++;
				}
				for(x=0; x<dx; x++){
					*dp ^= (*dp ^ *sp++) & *mp++;
					dp++;
				}
				if(rm){
					*dp ^= (*dp ^ *sp++) & *mp++ & rm;
					dp++;
				}
			}
			return 1;
		}else{
		/* dir == -1 */
			/* start at the last byte of the last row and walk backwards */
			i = (lm!=0)+dx+(rm!=0);
			dp += dwid*(dy-1)+i-1;
			sp += swid*(dy-1)+i-1;
			mp += mwid*(dy-1)+i-1;
			dwid = -dwid+i;
			swid = -swid+i;
			mwid = -mwid+i;
			for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
				if(rm){
					*dp ^= (*dp ^ *sp--) & *mp-- & rm;
					dp--;
				}
				for(x=0; x<dx; x++){
					*dp ^= (*dp ^ *sp--) & *mp--;
					dp--;
				}
				if(lm){
					*dp ^= (*dp ^ *sp--) & *mp-- & lm;
					dp--;
				}
			}
		}
		return 1;
	}
	return 0;
}
#undef DBG
2295 
/*
 * Boolean character drawing.
 * Solid opaque color through a 1-bit greyscale mask.
 *
 * Returns 0, having drawn nothing, unless all of these hold: the
 * source is replicated and simple, the mask is not replicated and is
 * 1 bit deep, the source has no alpha, the destination is at least
 * 8 bits deep and does not share its data with the source, and op is
 * SoverD.  Otherwise par->sdval is stored into every destination
 * pixel of par->r whose mask bit is set, and 1 is returned.
 */
#define DBG if(0)
static int
chardraw(Memdrawparam *par)
{
	ulong bits;
	int i, ddepth, dy, dx, x, bx, ex, y, npack, bsh, depth, op;
	ulong v, maskwid, dstwid;
	uchar *wp, *rp, *q, *wc;
	ushort *ws;
	ulong *wl;
	uchar sp[4];
	Rectangle r, mr;
	Memimage *mask, *src, *dst;

if(0) if(drawdebug) iprint("chardraw? mf %lux md %d sf %lux dxs %d dys %d dd %d ddat %p sdat %p\n",
		par->mask->flags, par->mask->depth, par->src->flags,
		Dx(par->src->r), Dy(par->src->r), par->dst->depth, par->dst->data, par->src->data);

	mask = par->mask;
	src = par->src;
	dst = par->dst;
	r = par->r;
	mr = par->mr;
	op = par->op;

	if((par->state&(Replsrc|Simplesrc|Replmask)) != (Replsrc|Simplesrc)
	|| mask->depth != 1 || src->flags&Falpha || dst->depth<8 || dst->data==src->data
	|| op != SoverD)
		return 0;

//if(drawdebug) iprint("chardraw...");

	depth = mask->depth;
	maskwid = mask->width*sizeof(ulong);	/* mask row stride in bytes */
	rp = byteaddr(mask, mr.min);
	npack = 8/depth;
	bsh = (mr.min.x % npack) * depth;	/* bit offset of the first mask pixel within its byte */

	wp = byteaddr(dst, r.min);
	dstwid = dst->width*sizeof(ulong);	/* destination row stride in bytes */
DBG print("bsh %d\n", bsh);
	dy = Dy(r);
	dx = Dx(r);

	ddepth = dst->depth;

	/*
	 * for loop counts from bsh to bsh+dx
	 *
	 * we want the bottom bits to be the amount
	 * to shift the pixels down, so for n≡0 (mod 8) we want
	 * bottom bits 7.  for n≡1, 6, etc.
	 * the bits come from -n-1.
	 */

	bx = -bsh-1;
	ex = -bsh-1-dx;
	bits = 0;
	v = par->sdval;

	/* make little endian */
	sp[0] = v;
	sp[1] = v>>8;
	sp[2] = v>>16;
	sp[3] = v>>24;

//print("sp %x %x %x %x\n", sp[0], sp[1], sp[2], sp[3]);
	for(y=0; y<dy; y++, rp+=maskwid, wp+=dstwid){
		q = rp;
		/*
		 * When the row starts mid-byte the loops below will not see
		 * a byte boundary first, so load the partial byte here.
		 */
		if(bsh)
			bits = *q++;
		switch(ddepth){
		case 8:
//if(drawdebug) iprint("8loop...");
			wc = wp;
			for(x=bx; x>ex; x--, wc++){
				i = x&7;
				/* i == 7 marks the first pixel of a mask byte: fetch it */
				if(i == 8-1)
					bits = *q++;
DBG print("bits %lux sh %d...", bits, i);
				if((bits>>i)&1)
					*wc = v;
			}
			break;
		case 16:
			ws = (ushort*)wp;
			/* reload the pixel through the byte array so it is stored little-endian */
			v = *(ushort*)sp;
			for(x=bx; x>ex; x--, ws++){
				i = x&7;
				if(i == 8-1)
					bits = *q++;
DBG print("bits %lux sh %d...", bits, i);
				if((bits>>i)&1)
					*ws = v;
			}
			break;
		case 24:
			wc = wp;
			for(x=bx; x>ex; x--, wc+=3){
				i = x&7;
				if(i == 8-1)
					bits = *q++;
DBG print("bits %lux sh %d...", bits, i);
				if((bits>>i)&1){
					wc[0] = sp[0];
					wc[1] = sp[1];
					wc[2] = sp[2];
				}
			}
			break;
		case 32:
			wl = (ulong*)wp;
			/* as for 16 bits: pick the pixel up from its little-endian bytes */
			v = *(ulong*)sp;
			for(x=bx; x>ex; x--, wl++){
				i = x&7;
				if(i == 8-1)
					bits = *q++;
DBG iprint("bits %lux sh %d...", bits, i);
				if((bits>>i)&1)
					*wl = v;
			}
			break;
		}
	}

DBG print("\n");
	return 1;
}
#undef DBG
2429 
2430 
2431 /*
2432  * Fill entire byte with replicated (if necessary) copy of source pixel,
2433  * assuming destination ldepth is >= source ldepth.
2434  *
2435  * This code is just plain wrong for >8bpp.
2436  *
2437 ulong
2438 membyteval(Memimage *src)
2439 {
2440 	int i, val, bpp;
2441 	uchar uc;
2442 
2443 	unloadmemimage(src, src->r, &uc, 1);
2444 	bpp = src->depth;
2445 	uc <<= (src->r.min.x&(7/src->depth))*src->depth;
2446 	uc &= ~(0xFF>>bpp);
2447 	// pixel value is now in high part of byte. repeat throughout byte
2448 	val = uc;
2449 	for(i=bpp; i<8; i<<=1)
2450 		val |= val>>i;
2451 	return val;
2452 }
2453  *
2454  */
2455 
2456 void
2457 _memfillcolor(Memimage *i, ulong val)
2458 {
2459 	ulong bits;
2460 	int d, y;
2461 	uchar p[4];
2462 
2463 	if(val == DNofill)
2464 		return;
2465 
2466 	bits = _rgbatoimg(i, val);
2467 	switch(i->depth){
2468 	case 24:	/* 24-bit images suck */
2469 		for(y=i->r.min.y; y<i->r.max.y; y++)
2470 			memset24(byteaddr(i, Pt(i->r.min.x, y)), bits, Dx(i->r));
2471 		break;
2472 	default:	/* 1, 2, 4, 8, 16, 32 */
2473 		for(d=i->depth; d<32; d*=2)
2474 			bits = (bits << d) | bits;
2475 		p[0] = bits;		/* make little endian */
2476 		p[1] = bits>>8;
2477 		p[2] = bits>>16;
2478 		p[3] = bits>>24;
2479 		bits = *(ulong*)p;
2480 		memsetl(wordaddr(i, i->r.min), bits, i->width*Dy(i->r));
2481 		break;
2482 	}
2483 }
2484 
2485