xref: /plan9/sys/src/cmd/unix/drawterm/libmemdraw/draw.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <memdraw.h>
5 
/* debug switch; in the visible part of this file it is only mentioned in commented-out diagnostics */
6 int drawdebug;
/* set to 1 by mktables() once replbit[][] has been filled in */
7 static int	tablesbuilt;
8 
9 /* perfect approximation to NTSC = .299r+.587g+.114b when 0 ≤ r,g,b < 256 */
10 #define RGB2K(r,g,b)	((156763*(r)+307758*(g)+59769*(b))>>19)
11 
12 /*
13  * for 0 ≤ x ≤ 255*255, (x*0x0101+0x100)>>16 is a perfect approximation.
14  * for 0 ≤ x < (1<<16), x/255 = ((x+1)*0x0101)>>16 is a perfect approximation.
15  * the last one is perfect for all up to 1<<16, avoids a multiply, but requires a rathole.
16  */
17 /* #define DIV255(x) (((x)*257+256)>>16)  */
18 #define DIV255(x) ((((x)+1)*257)>>16)
19 /* #define DIV255(x) (tmp=(x)+1, (tmp+(tmp>>8))>>8) */
20 
/*
 * MUL(x, y, t): x*y scaled back down by 255 (with rounding) using the same
 * add-and-shift trick as the DIV255 variants above.  t is a scratch lvalue
 * supplied by the caller; the macro assigns to it.
 *
 * MUL13 and MUL02 do the same thing on two byte lanes of x at once:
 * MASK13 selects bytes 1 and 3 (shifted down by 8 first), MASK02 selects
 * bytes 0 and 2.  Both leave their results in the MASK02 byte positions.
 * MUL0123 shifts the MUL13 result back up and ORs the halves together, so
 * all four bytes of x are scaled by a.  s and t are separate scratch lvalues.
 */
21 #define MUL(x, y, t)	(t = (x)*(y)+128, (t+(t>>8))>>8)
22 #define MASK13	0xFF00FF00
23 #define MASK02	0x00FF00FF
24 #define MUL13(a, x, t)		(t = (a)*(((x)&MASK13)>>8)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)
25 #define MUL02(a, x, t)		(t = (a)*(((x)&MASK02)>>0)+128, ((t+((t>>8)&MASK02))>>8)&MASK02)
26 #define MUL0123(a, x, s, t)	((MUL13(a, x, s)<<8)|MUL02(a, x, t))
27 
/*
 * NOTE(review): MUL2 assigns to a variable named t that is not one of its
 * parameters, so it only compiles where the caller has a t in scope.
 * It is not used in the visible part of this file.
 */
28 #define MUL2(u, v, x, y)	(t = (u)*(v)+(x)*(y)+256, (t+(t>>8))>>8)
29 
30 static void mktables(void);
/*
 * A Subdraw tries to carry out one draw request.  _memimagedraw treats a
 * nonzero return as "handled" and a zero return as "try the next routine".
 */
31 typedef int Subdraw(Memdrawparam*);
32 static Subdraw chardraw, alphadraw, memoptdraw;
33 
/*
 * memones and memzeros are the two 1x1 replicated GREY1 images allocated by
 * _memimageinit.  The four public pointers below are aliases for them:
 * memwhite and memopaque point at memones, memblack and memtransparent
 * point at memzeros.
 */
34 static Memimage*	memones;
35 static Memimage*	memzeros;
36 Memimage *memwhite;
37 Memimage *memblack;
38 Memimage *memtransparent;
39 Memimage *memopaque;
40 
41 int	_ifmt(Fmt*);
42 
43 void
44 _memimageinit(void)
45 {
46 	static int didinit = 0;
47 
48 	if(didinit)
49 		return;
50 
51 	didinit = 1;
52 
53 	mktables();
54 	_memmkcmap();
55 
56 	fmtinstall('R', Rfmt);
57 	fmtinstall('P', Pfmt);
58 
59 	memones = allocmemimage(Rect(0,0,1,1), GREY1);
60 	memones->flags |= Frepl;
61 	memones->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
62 	*byteaddr(memones, ZP) = ~0;
63 
64 	memzeros = allocmemimage(Rect(0,0,1,1), GREY1);
65 	memzeros->flags |= Frepl;
66 	memzeros->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
67 	*byteaddr(memzeros, ZP) = 0;
68 
69 	if(memones == nil || memzeros == nil)
70 		assert(0 /*cannot initialize memimage library */);	/* RSC BUG */
71 
72 	memwhite = memones;
73 	memblack = memzeros;
74 	memopaque = memones;
75 	memtransparent = memzeros;
76 }
77 
/* pixel conversion helpers defined outside this view; used by _memimagedrawsetup */
78 ulong _imgtorgba(Memimage*, ulong);
79 ulong _rgbatoimg(Memimage*, ulong);
80 ulong _pixelbits(Memimage*, Point);
81 
/*
 * DBG prefixes a debugging statement; as if(0) the statement is compiled
 * but never executed.  It is #undef'ed again after _memimagedraw.
 */
82 #define DBG if(0)
/*
 * The single parameter block filled in and returned by _memimagedrawsetup.
 * Because it is static, each call overwrites the result of the previous one.
 */
83 static Memdrawparam par;
84 
85 Memdrawparam*
86 _memimagedrawsetup(Memimage *dst, Rectangle r, Memimage *src, Point p0, Memimage *mask, Point p1, int op)
87 {
88 
89 	if(mask == nil)
90 		mask = memopaque;
91 
92 DBG	print("memimagedraw %p/%luX %R @ %p %p/%luX %P %p/%luX %P... ", dst, dst->chan, r, dst->data->bdata, src, src->chan, p0, mask, mask->chan, p1);
93 
94 	if(drawclip(dst, &r, src, &p0, mask, &p1, &par.sr, &par.mr) == 0){
95 //		if(drawdebug)
96 //			iprint("empty clipped rectangle\n");
97 		return nil;
98 	}
99 
100 	if(op < Clear || op > SoverD){
101 //		if(drawdebug)
102 //			iprint("op out of range: %d\n", op);
103 		return nil;
104 	}
105 
106 	par.op = op;
107 	par.dst = dst;
108 	par.r = r;
109 	par.src = src;
110 	/* par.sr set by drawclip */
111 	par.mask = mask;
112 	/* par.mr set by drawclip */
113 
114 	par.state = 0;
115 	if(src->flags&Frepl){
116 		par.state |= Replsrc;
117 		if(Dx(src->r)==1 && Dy(src->r)==1){
118 			par.sval = _pixelbits(src, src->r.min);
119 			par.state |= Simplesrc;
120 			par.srgba = _imgtorgba(src, par.sval);
121 			par.sdval = _rgbatoimg(dst, par.srgba);
122 			if((par.srgba&0xFF) == 0 && (op&DoutS)){
123 //				if (drawdebug) iprint("fill with transparent source\n");
124 				return nil;	/* no-op successfully handled */
125 			}
126 		}
127 	}
128 
129 	if(mask->flags & Frepl){
130 		par.state |= Replmask;
131 		if(Dx(mask->r)==1 && Dy(mask->r)==1){
132 			par.mval = _pixelbits(mask, mask->r.min);
133 			if(par.mval == 0 && (op&DoutS)){
134 //				if(drawdebug) iprint("fill with zero mask\n");
135 				return nil;	/* no-op successfully handled */
136 			}
137 			par.state |= Simplemask;
138 			if(par.mval == ~0)
139 				par.state |= Fullmask;
140 			par.mrgba = _imgtorgba(mask, par.mval);
141 		}
142 	}
143 
144 //	if(drawdebug)
145 //		iprint("dr %R sr %R mr %R...", r, par.sr, par.mr);
146 DBG print("draw dr %R sr %R mr %R %lux\n", r, par.sr, par.mr, par.state);
147 
148 	return &par;
149 }
150 
151 void
152 _memimagedraw(Memdrawparam *par)
153 {
154 	if (par == nil)
155 		return;
156 
157 	/*
158 	 * Now that we've clipped the parameters down to be consistent, we
159 	 * simply try sub-drawing routines in order until we find one that was able
160 	 * to handle us.  If the sub-drawing routine returns zero, it means it was
161 	 * unable to satisfy the request, so we do not return.
162 	 */
163 
164 	/*
165 	 * Hardware support.  Each video driver provides this function,
166 	 * which checks to see if there is anything it can help with.
167 	 * There could be an if around this checking to see if dst is in video memory.
168 	 */
169 DBG print("test hwdraw\n");
170 	if(hwdraw(par)){
171 //if(drawdebug) iprint("hw handled\n");
172 DBG print("hwdraw handled\n");
173 		return;
174 	}
175 	/*
176 	 * Optimizations using memmove and memset.
177 	 */
178 DBG print("test memoptdraw\n");
179 	if(memoptdraw(par)){
180 //if(drawdebug) iprint("memopt handled\n");
181 DBG print("memopt handled\n");
182 		return;
183 	}
184 
185 	/*
186 	 * Character drawing.
187 	 * Solid source color being painted through a boolean mask onto a high res image.
188 	 */
189 DBG print("test chardraw\n");
190 	if(chardraw(par)){
191 //if(drawdebug) iprint("chardraw handled\n");
192 DBG print("chardraw handled\n");
193 		return;
194 	}
195 
196 	/*
197 	 * General calculation-laden case that does alpha for each pixel.
198 	 */
199 DBG print("do alphadraw\n");
200 	alphadraw(par);
201 //if(drawdebug) iprint("alphadraw handled\n");
202 DBG print("alphadraw handled\n");
203 }
204 #undef DBG
205 
206 /*
207  * Clip the destination rectangle further based on the properties of the
208  * source and mask rectangles.  Once the destination rectangle is properly
209  * clipped, adjust the source and mask rectangles to be the same size.
210  * Then if source or mask is replicated, move its clipped rectangle
211  * so that its minimum point falls within the repl rectangle.
212  *
213  * Return zero if the final rectangle is null.
 *
 * r, p0 and p1 are updated in place; sr and mr are outputs that receive
 * the rectangle expressed in source and mask coordinates.  On a zero
 * return some of them may already have been modified.
214  */
215 int
216 drawclip(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
217 {
218 	Point rmin, delta;
219 	int splitcoords;
220 	Rectangle omr;
221 
	/* an empty or inverted request rectangle draws nothing */
222 	if(r->min.x>=r->max.x || r->min.y>=r->max.y)
223 		return 0;
	/* nonzero when source and mask use different origins, so they must be clipped separately */
224 	splitcoords = (p0->x!=p1->x) || (p0->y!=p1->y);
225 	/* clip to destination */
226 	rmin = r->min;
227 	if(!rectclip(r, dst->r) || !rectclip(r, dst->clipr))
228 		return 0;
229 	/* move mask point */
230 	p1->x += r->min.x-rmin.x;
231 	p1->y += r->min.y-rmin.y;
232 	/* move source point */
233 	p0->x += r->min.x-rmin.x;
234 	p0->y += r->min.y-rmin.y;
235 	/* map destination rectangle into source */
236 	sr->min = *p0;
237 	sr->max.x = p0->x+Dx(*r);
238 	sr->max.y = p0->y+Dy(*r);
239 	/* sr is r in source coordinates; clip to source */
	/* a replicated source is clipped only against its clipr, not its r */
240 	if(!(src->flags&Frepl) && !rectclip(sr, src->r))
241 		return 0;
242 	if(!rectclip(sr, src->clipr))
243 		return 0;
244 	/* compute and clip rectangle in mask */
245 	if(splitcoords){
246 		/* move mask point with source */
247 		p1->x += sr->min.x-p0->x;
248 		p1->y += sr->min.y-p0->y;
249 		mr->min = *p1;
250 		mr->max.x = p1->x+Dx(*sr);
251 		mr->max.y = p1->y+Dy(*sr);
		/* remember the unclipped mask rectangle so the clip amounts can be measured */
252 		omr = *mr;
253 		/* mr is now rectangle in mask; clip it */
254 		if(!(mask->flags&Frepl) && !rectclip(mr, mask->r))
255 			return 0;
256 		if(!rectclip(mr, mask->clipr))
257 			return 0;
258 		/* reflect any clips back to source */
259 		sr->min.x += mr->min.x-omr.min.x;
260 		sr->min.y += mr->min.y-omr.min.y;
261 		sr->max.x += mr->max.x-omr.max.x;
262 		sr->max.y += mr->max.y-omr.max.y;
263 		*p1 = mr->min;
264 	}else{
		/* source and mask share coordinates: clip sr directly against the mask */
265 		if(!(mask->flags&Frepl) && !rectclip(sr, mask->r))
266 			return 0;
267 		if(!rectclip(sr, mask->clipr))
268 			return 0;
269 		*p1 = sr->min;
270 	}
271 
272 	/* move source clipping back to destination */
273 	delta.x = r->min.x - p0->x;
274 	delta.y = r->min.y - p0->y;
275 	r->min.x = sr->min.x + delta.x;
276 	r->min.y = sr->min.y + delta.y;
277 	r->max.x = sr->max.x + delta.x;
278 	r->max.y = sr->max.y + delta.y;
279 
280 	/* move source rectangle so sr->min is in src->r */
281 	if(src->flags&Frepl) {
282 		delta.x = drawreplxy(src->r.min.x, src->r.max.x, sr->min.x) - sr->min.x;
283 		delta.y = drawreplxy(src->r.min.y, src->r.max.y, sr->min.y) - sr->min.y;
284 		sr->min.x += delta.x;
285 		sr->min.y += delta.y;
286 		sr->max.x += delta.x;
287 		sr->max.y += delta.y;
288 	}
289 	*p0 = sr->min;
290 
291 	/* move mask point so it is in mask->r */
292 	*p1 = drawrepl(mask->r, *p1);
293 	mr->min = *p1;
294 	mr->max.x = p1->x+Dx(*sr);
295 	mr->max.y = p1->y+Dy(*sr);
296 
	/* postconditions: all three rectangles have the same size and each origin lies inside its image */
297 	assert(Dx(*sr) == Dx(*mr) && Dx(*mr) == Dx(*r));
298 	assert(Dy(*sr) == Dy(*mr) && Dy(*mr) == Dy(*r));
299 	assert(ptinrect(*p0, src->r));
300 	assert(ptinrect(*p1, mask->r));
301 	assert(ptinrect(r->min, dst->r));
302 
303 	return 1;
304 }
305 
306 /*
307  * Conversion tables.
308  */
309 static uchar replbit[1+8][256];		/* replbit[x][y] is the replication of the x-bit quantity y to 8-bit depth */
310 
311 /*
312  * bitmap of how to replicate n bits to fill 8, for 1 ≤ n ≤ 8.
313  * the X's are where to put the bottom (ones) bit of the n-bit pattern.
314  * only the top 8 bits of the result are actually used.
315  * (the lower 8 bits are needed to get bits in the right place
316  * when n is not a divisor of 8.)
317  *
318  * Should check to see if it's easier to just refer to replmul than
319  * use the precomputed values in replbit.  On PCs it may well
320  * be; on machines with slow multiply instructions it probably isn't.
 *
 * The three helper macros spell each 16-bit multiplier out bit by bit,
 * most significant bit first: `a' opens sixteen parentheses around 0,
 * and each following X or _ closes one of them after doubling the value
 * so far, X adding a one bit and _ a zero bit.
321  */
322 #define a ((((((((((((((((0
323 #define X *2+1)
324 #define _ *2)
325 static int replmul[1+8] = {
326 	0,
327 	a X X X X X X X X X X X X X X X X,
328 	a _ X _ X _ X _ X _ X _ X _ X _ X,
329 	a _ _ X _ _ X _ _ X _ _ X _ _ X _,
330 	a _ _ _ X _ _ _ X _ _ _ X _ _ _ X,
331 	a _ _ _ _ X _ _ _ _ X _ _ _ _ X _,
332 	a _ _ _ _ _ X _ _ _ _ _ X _ _ _ _,
333 	a _ _ _ _ _ _ X _ _ _ _ _ _ X _ _,
334 	a _ _ _ _ _ _ _ X _ _ _ _ _ _ _ X,
335 };
336 #undef a
337 #undef X
338 #undef _
339 
340 static void
341 mktables(void)
342 {
343 	int i, j, small;
344 
345 	if(tablesbuilt)
346 		return;
347 
348 	fmtinstall('R', Rfmt);
349 	fmtinstall('P', Pfmt);
350 	tablesbuilt = 1;
351 
352 	/* bit replication up to 8 bits */
353 	for(i=0; i<256; i++){
354 		for(j=0; j<=8; j++){	/* j <= 8 [sic] */
355 			small = i & ((1<<j)-1);
356 			replbit[j][i] = (small*replmul[j])>>8;
357 		}
358 	}
359 
360 }
361 
/*
 * Stand-in alpha byte.  The calc routines compare a Buffer's alpha pointer
 * against &ones before writing through it or stepping it, so a Buffer
 * whose alpha points here is treated as having no alpha channel of its own.
 */
362 static uchar ones = 0xff;
363 
364 /*
365  * General alpha drawing case.  Can handle anything.
 *
 * A Buffer describes one scan line as a set of per-channel pointers to
 * the first pixel; every pointer is stepped by delta to reach the next pixel.
366  */
367 typedef struct	Buffer	Buffer;
368 struct Buffer {
369 	/* used by most routines */
370 	uchar	*red;
371 	uchar	*grn;
372 	uchar	*blu;
373 	uchar	*alpha;
374 	uchar	*grey;
375 	ulong	*rgba;
376 	int	delta;	/* number of bytes to add to pointer to get next pixel to the right */
377 
378 	/* used by boolcalc* for mask data */
379 	uchar	*m;		/* ptr to mask data r.min byte; like p->bytermin */
380 	int		mskip;	/* no. of left bits to skip in *m */
381 	uchar	*bm;		/* ptr to mask data img->r.min byte; like p->bytey0s */
382 	int		bmskip;	/* no. of left bits to skip in *bm */
383 	uchar	*em;		/* ptr to mask data img->r.max.x byte; like p->bytey0e */
384 	int		emskip;	/* no. of right bits to skip in *em */
385 };
386 
387 typedef struct	Param	Param;
/* Readfn(param, buf, y): alphadraw passes the image's Param, its scratch buffer, and a row offset */
388 typedef Buffer	Readfn(Param*, uchar*, int);
/* Writefn(param, dstrow, buf): alphadraw passes the address of the destination row and the computed Buffer */
389 typedef void	Writefn(Param*, uchar*, Buffer);
/* Calcfn(dst, src, mask, dx, grey, op): combines one scan line of dx pixels and returns the Buffer to write */
390 typedef Buffer	Calcfn(Buffer, Buffer, Buffer, int, int, int);
391 
392 enum {
	/* getparam turns on row caching only for replicated images at most this many rows high */
393 	MAXBCACHE = 16
394 };
395 
396 /* giant rathole to customize functions with */
/*
 * One Param exists per image taking part in a draw (source, mask and
 * destination).  getparam zeroes it and fills in the geometry and buffer
 * offsets; alphadraw then sets dir, the *call hooks and the buffer pointers.
 */
397 struct Param {
	/* reader hooks; alphadraw sets replcall and greymaskcall, the other two are not assigned in the visible code */
398 	Readfn	*replcall;
399 	Readfn	*greymaskcall;
400 	Readfn	*convreadcall;
401 	Writefn	*convwritecall;
402 
403 	Memimage *img;
404 	Rectangle	r;
405 	int	dx;	/* of r */
406 	int	needbuf;
407 	int	convgrey;
408 	int	alphaonly;
409 
410 	uchar	*bytey0s;		/* byteaddr(Pt(img->r.min.x, img->r.min.y)) */
411 	uchar	*bytermin;	/* byteaddr(Pt(r.min.x, img->r.min.y)) */
412 	uchar	*bytey0e;		/* byteaddr(Pt(img->r.max.x, img->r.min.y)) */
413 	int		bwidth;	/* bytes per image row: sizeof(ulong)*img->width */
414 
415 	int	replcache;	/* if set, cache buffers */
416 	Buffer	bcache[MAXBCACHE];
417 	ulong	bfilled;
418 	uchar	*bufbase;	/* drawbuf+bufoff, filled in once drawbuf has been sized */
419 	int	bufoff;	/* offset of this image's scratch area within drawbuf */
420 	int	bufdelta;	/* scratch bytes per row: 4*dx */
421 
422 	int	dir;	/* +1 or -1: the direction alphadraw walks the rows */
423 
424 	int	convbufoff;
425 	uchar	*convbuf;
426 	Param	*convdpar;
427 	int	convdx;
428 };
429 
/*
 * Shared scratch buffer for alphadraw.  ndrawbuf is the number of bytes the
 * current draw needs (reset by alphadraw, added to by each getparam call);
 * mdrawbuf is the number of bytes currently allocated (see allocdrawbuf).
 */
430 static uchar *drawbuf;
431 static int	ndrawbuf;
432 static int	mdrawbuf;
433 static Param spar, mpar, dpar;	/* easier on the stacks */
434 static Readfn	greymaskread, replread, readptr;
435 static Writefn	nullwrite;
436 static Calcfn	alphacalc0, alphacalc14, alphacalc2810, alphacalc3679, alphacalc5, alphacalc11, alphacalcS;
437 static Calcfn	boolcalc14, boolcalc236789, boolcalc1011;
438 
/* selectors that pick a reader or writer for an image */
439 static Readfn*	readfn(Memimage*);
440 static Readfn*	readalphafn(Memimage*);
441 static Writefn*	writefn(Memimage*);
442 
/* selectors used only by alphadraw's boolean-mask copy path */
443 static Calcfn*	boolcopyfn(Memimage*, Memimage*);
444 static Readfn*	convfn(Memimage*, Param*, Memimage*, Param*);
445 
/*
 * General calculator for each compositing operator, indexed by the op
 * value (alphadraw does calc = alphacalc[op]).  The digits in each
 * function name are the table slots it serves, e.g. alphacalc14 sits in
 * slots 1 and 4.
 */
446 static Calcfn *alphacalc[Ncomp] =
447 {
448 	alphacalc0,		/* Clear */
449 	alphacalc14,		/* DoutS */
450 	alphacalc2810,		/* SoutD */
451 	alphacalc3679,		/* DxorS */
452 	alphacalc14,		/* DinS */
453 	alphacalc5,		/* D */
454 	alphacalc3679,		/* DatopS */
455 	alphacalc3679,		/* DoverS */
456 	alphacalc2810,		/* SinD */
457 	alphacalc3679,		/* SatopD */
458 	alphacalc2810,		/* S */
459 	alphacalc11,		/* SoverD */
460 };
461 
/*
 * Calculators for the boolean-mask case, indexed by op like alphacalc[].
 * alphadraw switches to this table when the mask is GREY1 and the source
 * has no alpha channel.  Clear and D reuse the general routines.
 */
462 static Calcfn *boolcalc[Ncomp] =
463 {
464 	alphacalc0,		/* Clear */
465 	boolcalc14,		/* DoutS */
466 	boolcalc236789,		/* SoutD */
467 	boolcalc236789,		/* DxorS */
468 	boolcalc14,		/* DinS */
469 	alphacalc5,		/* D */
470 	boolcalc236789,		/* DatopS */
471 	boolcalc236789,		/* DoverS */
472 	boolcalc236789,		/* SinD */
473 	boolcalc236789,		/* SatopD */
474 	boolcalc1011,		/* S */
475 	boolcalc1011,		/* SoverD */
476 };
477 
478 static int
479 allocdrawbuf(void)
480 {
481 	uchar *p;
482 
483 	if(ndrawbuf > mdrawbuf){
484 		p = realloc(drawbuf, ndrawbuf);
485 		if(p == nil){
486 			werrstr("memimagedraw out of memory");
487 			return -1;
488 		}
489 		drawbuf = p;
490 		mdrawbuf = ndrawbuf;
491 	}
492 	return 0;
493 }
494 
495 static Param
496 getparam(Memimage *img, Rectangle r, int convgrey, int needbuf)
497 {
498 	Param p;
499 	int nbuf;
500 
501 	memset(&p, 0, sizeof p);
502 
503 	p.img = img;
504 	p.r = r;
505 	p.dx = Dx(r);
506 	p.needbuf = needbuf;
507 	p.convgrey = convgrey;
508 
509 	assert(img->r.min.x <= r.min.x && r.min.x < img->r.max.x);
510 
511 	p.bytey0s = byteaddr(img, Pt(img->r.min.x, img->r.min.y));
512 	p.bytermin = byteaddr(img, Pt(r.min.x, img->r.min.y));
513 	p.bytey0e = byteaddr(img, Pt(img->r.max.x, img->r.min.y));
514 	p.bwidth = sizeof(ulong)*img->width;
515 
516 	assert(p.bytey0s <= p.bytermin && p.bytermin <= p.bytey0e);
517 
518 	if(p.r.min.x == p.img->r.min.x)
519 		assert(p.bytermin == p.bytey0s);
520 
521 	nbuf = 1;
522 	if((img->flags&Frepl) && Dy(img->r) <= MAXBCACHE && Dy(img->r) < Dy(r)){
523 		p.replcache = 1;
524 		nbuf = Dy(img->r);
525 	}
526 	p.bufdelta = 4*p.dx;
527 	p.bufoff = ndrawbuf;
528 	ndrawbuf += p.bufdelta*nbuf;
529 
530 	return p;
531 }
532 
533 static void
534 clipy(Memimage *img, int *y)
535 {
536 	int dy;
537 
538 	dy = Dy(img->r);
539 	if(*y == dy)
540 		*y = 0;
541 	else if(*y == -1)
542 		*y = dy-1;
543 	assert(0 <= *y && *y < dy);
544 }
545 
546 static void
547 dumpbuf(char *s, Buffer b, int n)
548 {
549 	int i;
550 	uchar *p;
551 
552 	print("%s", s);
553 	for(i=0; i<n; i++){
554 		print(" ");
555 		if((p=b.grey)){
556 			print(" k%.2uX", *p);
557 			b.grey += b.delta;
558 		}else{
559 			if((p=b.red)){
560 				print(" r%.2uX", *p);
561 				b.red += b.delta;
562 			}
563 			if((p=b.grn)){
564 				print(" g%.2uX", *p);
565 				b.grn += b.delta;
566 			}
567 			if((p=b.blu)){
568 				print(" b%.2uX", *p);
569 				b.blu += b.delta;
570 			}
571 		}
572 		if((p=b.alpha) != &ones){
573 			print(" α%.2uX", *p);
574 			b.alpha += b.delta;
575 		}
576 	}
577 	print("\n");
578 }
579 
580 /*
581  * For each scan line, we expand the pixels from source, mask, and destination
582  * into byte-aligned red, green, blue, alpha, and grey channels.  If buffering is not
583  * needed and the channels were already byte-aligned (grey8, rgb24, rgba32, rgb32),
584  * the readers need not copy the data: they can simply return pointers to the data.
585  * If the destination image is grey and the source is not, it is converted using the NTSC
586  * formula.
587  *
588  * Once we have all the channels, we call either rgbcalc or greycalc, depending on
589  * whether the destination image is color.  This is allowed to overwrite the dst buffer (perhaps
590  * the actual data, perhaps a copy) with its result.  It should only overwrite the dst buffer
591  * with the same format (i.e. red bytes with red bytes, etc.)  A new buffer is returned from
592  * the calculator, and that buffer is passed to a function to write it to the destination.
593  * If the buffer is already pointing at the destination, the writing function is a no-op.
 *
 * NOTE(review): no rgbcalc or greycalc appears in the visible code.  The
 * calculator is a single Calcfn chosen from alphacalc[] or boolcalc[] (or
 * boolcopyfn/alphacalcS) and is told whether the destination is grey
 * through its `grey' argument.
 *
 * Returns 1 when the draw was done, 0 if the scratch buffer could not be
 * allocated.
594  */
595 #define DBG if(0)
596 static int
597 alphadraw(Memdrawparam *par)
598 {
599 	int isgrey, starty, endy, op;
600 	int needbuf, dsty, srcy, masky;
601 	int y, dir, dx, dy;
602 	Buffer bsrc, bdst, bmask;
603 	Readfn *rdsrc, *rdmask, *rddst;
604 	Calcfn *calc;
605 	Writefn *wrdst;
606 	Memimage *src, *mask, *dst;
607 	Rectangle r, sr, mr;
608 
609 	r = par->r;
610 	dx = Dx(r);
611 	dy = Dy(r);
612 
	/* start a fresh scratch-space tally; each getparam call below adds to it */
613 	ndrawbuf = 0;
614 
615 	src = par->src;
616 	mask = par->mask;
617 	dst = par->dst;
618 	sr = par->sr;
619 	mr = par->mr;
620 	op = par->op;
621 
622 	isgrey = dst->flags&Fgrey;
623 
624 	/*
625 	 * Buffering when src and dst are the same bitmap is sufficient but not
626 	 * necessary.  There are stronger conditions we could use.  We could
627 	 * check to see if the rectangles intersect, and if simply moving in the
628 	 * correct y direction can avoid the need to buffer.
629 	 */
630 	needbuf = (src->data == dst->data);
631 
632 	spar = getparam(src, sr, isgrey, needbuf);
633 	dpar = getparam(dst, r, isgrey, needbuf);
634 	mpar = getparam(mask, mr, 0, needbuf);
635 
	/* when src and dst share data and dst starts at a higher address, walk the rows bottom-up */
636 	dir = (needbuf && byteaddr(dst, r.min) > byteaddr(src, sr.min)) ? -1 : 1;
637 	spar.dir = mpar.dir = dpar.dir = dir;
638 
639 	/*
640 	 * If the mask is purely boolean, we can convert from src to dst format
641 	 * when we read src, and then just copy it to dst where the mask tells us to.
642 	 * This requires a boolean (1-bit grey) mask and lack of a source alpha channel.
643 	 *
644 	 * The computation is accomplished by assigning the function pointers as follows:
645 	 *	rdsrc - read and convert source into dst format in a buffer
646 	 * 	rdmask - convert mask to bytes, set pointer to it
647 	 * 	rddst - fill with pointer to real dst data, but do no reads
648 	 *	calc - copy src onto dst when mask says to.
649 	 *	wrdst - do nothing
650 	 * This is slightly sleazy, since things aren't doing exactly what their names say,
651 	 * but it avoids a fair amount of code duplication to make this a case here
652 	 * rather than have a separate booldraw.
653 	 */
654 //if(drawdebug) iprint("flag %lud mchan %lux=?%x dd %d\n", src->flags&Falpha, mask->chan, GREY1, dst->depth);
655 	if(!(src->flags&Falpha) && mask->chan == GREY1 && dst->depth >= 8 && op == SoverD){
656 //if(drawdebug) iprint("boolcopy...");
657 		rdsrc = convfn(dst, &dpar, src, &spar);
658 		rddst = readptr;
659 		rdmask = readfn(mask);
660 		calc = boolcopyfn(dst, mask);
661 		wrdst = nullwrite;
662 	}else{
663 		/* usual alphadraw parameter fetching */
664 		rdsrc = readfn(src);
665 		rddst = readfn(dst);
666 		wrdst = writefn(dst);
667 		calc = alphacalc[op];
668 
669 		/*
670 		 * If there is no alpha channel, we'll ask for a grey channel
671 		 * and pretend it is the alpha.
672 		 */
673 		if(mask->flags&Falpha){
674 			rdmask = readalphafn(mask);
675 			mpar.alphaonly = 1;
676 		}else{
677 			mpar.greymaskcall = readfn(mask);
678 			mpar.convgrey = 1;
679 			rdmask = greymaskread;
680 
681 			/*
682 			 * Should really be above, but then boolcopyfns would have
683 			 * to deal with bit alignment, and I haven't written that.
684 			 *
685 			 * This is a common case for things like ellipse drawing.
686 			 * When there's no alpha involved and the mask is boolean,
687 			 * we can avoid all the division and multiplication.
688 			 */
689 			if(mask->chan == GREY1 && !(src->flags&Falpha))
690 				calc = boolcalc[op];
691 			else if(op == SoverD && !(src->flags&Falpha))
692 				calc = alphacalcS;
693 		}
694 	}
695 
696 	/*
697 	 * If the image has a small enough repl rectangle,
698 	 * we can just read each line once and cache them.
699 	 */
700 	if(spar.replcache){
701 		spar.replcall = rdsrc;
702 		rdsrc = replread;
703 	}
704 	if(mpar.replcache){
705 		mpar.replcall = rdmask;
706 		rdmask = replread;
707 	}
708 
	/* grow drawbuf to the total tallied in ndrawbuf; give up on this draw if that fails */
709 	if(allocdrawbuf() < 0)
710 		return 0;
711 
712 	/*
713 	 * Before we were saving only offsets from drawbuf in the parameter
714 	 * structures; now that drawbuf has been grown to accommodate us,
715 	 * we can fill in the pointers.
716 	 */
717 	spar.bufbase = drawbuf+spar.bufoff;
718 	mpar.bufbase = drawbuf+mpar.bufoff;
719 	dpar.bufbase = drawbuf+dpar.bufoff;
720 	spar.convbuf = drawbuf+spar.convbufoff;
721 
	/* endy is one step past the last row in the direction of travel */
722 	if(dir == 1){
723 		starty = 0;
724 		endy = dy;
725 	}else{
726 		starty = dy-1;
727 		endy = -1;
728 	}
729 
730 	/*
731 	 * srcy, masky, and dsty are offsets from the top of their
732 	 * respective Rectangles.  they need to be contained within
733 	 * the rectangles, so clipy can keep them there without division.
734  	 */
735 	srcy = (starty + sr.min.y - src->r.min.y)%Dy(src->r);
736 	masky = (starty + mr.min.y - mask->r.min.y)%Dy(mask->r);
737 	dsty = starty + r.min.y - dst->r.min.y;
738 
739 	assert(0 <= srcy && srcy < Dy(src->r));
740 	assert(0 <= masky && masky < Dy(mask->r));
741 	assert(0 <= dsty && dsty < Dy(dst->r));
742 
	/* one pass per scan line: read the three rows, combine them, write the result */
743 	for(y=starty; y!=endy; y+=dir, srcy+=dir, masky+=dir, dsty+=dir){
744 		clipy(src, &srcy);
745 		clipy(dst, &dsty);
746 		clipy(mask, &masky);
747 
748 		bsrc = rdsrc(&spar, spar.bufbase, srcy);
749 DBG print("[");
750 		bmask = rdmask(&mpar, mpar.bufbase, masky);
751 DBG print("]\n");
752 		bdst = rddst(&dpar, dpar.bufbase, dsty);
753 DBG		dumpbuf("src", bsrc, dx);
754 DBG		dumpbuf("mask", bmask, dx);
755 DBG		dumpbuf("dst", bdst, dx);
756 		bdst = calc(bdst, bsrc, bmask, dx, isgrey, op);
757 		wrdst(&dpar, dpar.bytermin+dsty*dpar.bwidth, bdst);
758 	}
759 
760 	return 1;
761 }
762 #undef DBG
763 
764 static Buffer
765 alphacalc0(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
766 {
767 	USED(grey);
768 	USED(op);
769 	memset(bdst.rgba, 0, dx*bdst.delta);
770 	return bdst;
771 }
772 
773 static Buffer
774 alphacalc14(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
775 {
776 	Buffer obdst;
777 	int fd, sadelta;
778 	int i, sa, ma, q;
779 	ulong s, t;
780 
781 	obdst = bdst;
782 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
783 	q = bsrc.delta == 4 && bdst.delta == 4;
784 
785 	for(i=0; i<dx; i++){
786 		sa = *bsrc.alpha;
787 		ma = *bmask.alpha;
788 		fd = MUL(sa, ma, t);
789 		if(op == DoutS)
790 			fd = 255-fd;
791 
792 		if(grey){
793 			*bdst.grey = MUL(fd, *bdst.grey, t);
794 			bsrc.grey += bsrc.delta;
795 			bdst.grey += bdst.delta;
796 		}else{
797 			if(q){
798 				*bdst.rgba = MUL0123(fd, *bdst.rgba, s, t);
799 				bsrc.rgba++;
800 				bdst.rgba++;
801 				bsrc.alpha += sadelta;
802 				bmask.alpha += bmask.delta;
803 				continue;
804 			}
805 			*bdst.red = MUL(fd, *bdst.red, t);
806 			*bdst.grn = MUL(fd, *bdst.grn, t);
807 			*bdst.blu = MUL(fd, *bdst.blu, t);
808 			bsrc.red += bsrc.delta;
809 			bsrc.blu += bsrc.delta;
810 			bsrc.grn += bsrc.delta;
811 			bdst.red += bdst.delta;
812 			bdst.blu += bdst.delta;
813 			bdst.grn += bdst.delta;
814 		}
815 		if(bdst.alpha != &ones){
816 			*bdst.alpha = MUL(fd, *bdst.alpha, t);
817 			bdst.alpha += bdst.delta;
818 		}
819 		bmask.alpha += bmask.delta;
820 		bsrc.alpha += sadelta;
821 	}
822 	return obdst;
823 }
824 
825 static Buffer
826 alphacalc2810(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
827 {
828 	Buffer obdst;
829 	int fs, sadelta;
830 	int i, ma, da, q;
831 	ulong s, t;
832 
833 	obdst = bdst;
834 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
835 	q = bsrc.delta == 4 && bdst.delta == 4;
836 
837 	for(i=0; i<dx; i++){
838 		ma = *bmask.alpha;
839 		da = *bdst.alpha;
840 		if(op == SoutD)
841 			da = 255-da;
842 		fs = ma;
843 		if(op != S)
844 			fs = MUL(fs, da, t);
845 
846 		if(grey){
847 			*bdst.grey = MUL(fs, *bsrc.grey, t);
848 			bsrc.grey += bsrc.delta;
849 			bdst.grey += bdst.delta;
850 		}else{
851 			if(q){
852 				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t);
853 				bsrc.rgba++;
854 				bdst.rgba++;
855 				bmask.alpha += bmask.delta;
856 				bdst.alpha += bdst.delta;
857 				continue;
858 			}
859 			*bdst.red = MUL(fs, *bsrc.red, t);
860 			*bdst.grn = MUL(fs, *bsrc.grn, t);
861 			*bdst.blu = MUL(fs, *bsrc.blu, t);
862 			bsrc.red += bsrc.delta;
863 			bsrc.blu += bsrc.delta;
864 			bsrc.grn += bsrc.delta;
865 			bdst.red += bdst.delta;
866 			bdst.blu += bdst.delta;
867 			bdst.grn += bdst.delta;
868 		}
869 		if(bdst.alpha != &ones){
870 			*bdst.alpha = MUL(fs, *bsrc.alpha, t);
871 			bdst.alpha += bdst.delta;
872 		}
873 		bmask.alpha += bmask.delta;
874 		bsrc.alpha += sadelta;
875 	}
876 	return obdst;
877 }
878 
879 static Buffer
880 alphacalc3679(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
881 {
882 	Buffer obdst;
883 	int fs, fd, sadelta;
884 	int i, sa, ma, da, q;
885 	ulong s, t, u, v;
886 
887 	obdst = bdst;
888 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
889 	q = bsrc.delta == 4 && bdst.delta == 4;
890 
891 	for(i=0; i<dx; i++){
892 		sa = *bsrc.alpha;
893 		ma = *bmask.alpha;
894 		da = *bdst.alpha;
895 		if(op == SatopD)
896 			fs = MUL(ma, da, t);
897 		else
898 			fs = MUL(ma, 255-da, t);
899 		if(op == DoverS)
900 			fd = 255;
901 		else{
902 			fd = MUL(sa, ma, t);
903 			if(op != DatopS)
904 				fd = 255-fd;
905 		}
906 
907 		if(grey){
908 			*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
909 			bsrc.grey += bsrc.delta;
910 			bdst.grey += bdst.delta;
911 		}else{
912 			if(q){
913 				*bdst.rgba = MUL0123(fs, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);
914 				bsrc.rgba++;
915 				bdst.rgba++;
916 				bsrc.alpha += sadelta;
917 				bmask.alpha += bmask.delta;
918 				bdst.alpha += bdst.delta;
919 				continue;
920 			}
921 			*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);
922 			*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
923 			*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
924 			bsrc.red += bsrc.delta;
925 			bsrc.blu += bsrc.delta;
926 			bsrc.grn += bsrc.delta;
927 			bdst.red += bdst.delta;
928 			bdst.blu += bdst.delta;
929 			bdst.grn += bdst.delta;
930 		}
931 		if(bdst.alpha != &ones){
932 			*bdst.alpha = MUL(fs, sa, s)+MUL(fd, da, t);
933 			bdst.alpha += bdst.delta;
934 		}
935 		bmask.alpha += bmask.delta;
936 		bsrc.alpha += sadelta;
937 	}
938 	return obdst;
939 }
940 
941 static Buffer
942 alphacalc5(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
943 {
944 	USED(dx);
945 	USED(grey);
946 	USED(op);
947 	return bdst;
948 }
949 
950 static Buffer
951 alphacalc11(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
952 {
953 	Buffer obdst;
954 	int fd, sadelta;
955 	int i, sa, ma, q;
956 	ulong s, t, u, v;
957 
958 	USED(op);
959 	obdst = bdst;
960 	sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
961 	q = bsrc.delta == 4 && bdst.delta == 4;
962 
963 	for(i=0; i<dx; i++){
964 		sa = *bsrc.alpha;
965 		ma = *bmask.alpha;
966 		fd = 255-MUL(sa, ma, t);
967 
968 		if(grey){
969 			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
970 			bsrc.grey += bsrc.delta;
971 			bdst.grey += bdst.delta;
972 		}else{
973 			if(q){
974 				*bdst.rgba = MUL0123(ma, *bsrc.rgba, s, t)+MUL0123(fd, *bdst.rgba, u, v);
975 				bsrc.rgba++;
976 				bdst.rgba++;
977 				bsrc.alpha += sadelta;
978 				bmask.alpha += bmask.delta;
979 				continue;
980 			}
981 			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);
982 			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
983 			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
984 			bsrc.red += bsrc.delta;
985 			bsrc.blu += bsrc.delta;
986 			bsrc.grn += bsrc.delta;
987 			bdst.red += bdst.delta;
988 			bdst.blu += bdst.delta;
989 			bdst.grn += bdst.delta;
990 		}
991 		if(bdst.alpha != &ones){
992 			*bdst.alpha = MUL(ma, sa, s)+MUL(fd, *bdst.alpha, t);
993 			bdst.alpha += bdst.delta;
994 		}
995 		bmask.alpha += bmask.delta;
996 		bsrc.alpha += sadelta;
997 	}
998 	return obdst;
999 }
1000 
1001 /*
1002 not used yet
1003 source and mask alpha 1
1004 static Buffer
1005 alphacalcS0(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1006 {
1007 	Buffer obdst;
1008 	int i;
1009 
1010 	USED(op);
1011 	obdst = bdst;
1012 	if(bsrc.delta == bdst.delta){
1013 		memmove(bdst.rgba, bsrc.rgba, dx*bdst.delta);
1014 		return obdst;
1015 	}
1016 	for(i=0; i<dx; i++){
1017 		if(grey){
1018 			*bdst.grey = *bsrc.grey;
1019 			bsrc.grey += bsrc.delta;
1020 			bdst.grey += bdst.delta;
1021 		}else{
1022 			*bdst.red = *bsrc.red;
1023 			*bdst.grn = *bsrc.grn;
1024 			*bdst.blu = *bsrc.blu;
1025 			bsrc.red += bsrc.delta;
1026 			bsrc.blu += bsrc.delta;
1027 			bsrc.grn += bsrc.delta;
1028 			bdst.red += bdst.delta;
1029 			bdst.blu += bdst.delta;
1030 			bdst.grn += bdst.delta;
1031 		}
1032 		if(bdst.alpha != &ones){
1033 			*bdst.alpha = 255;
1034 			bdst.alpha += bdst.delta;
1035 		}
1036 	}
1037 	return obdst;
1038 }
1039 */
1040 
1041 /* source alpha 1 */
1042 static Buffer
1043 alphacalcS(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1044 {
1045 	Buffer obdst;
1046 	int fd;
1047 	int i, ma;
1048 	ulong s, t;
1049 
1050 	USED(op);
1051 	obdst = bdst;
1052 
1053 	for(i=0; i<dx; i++){
1054 		ma = *bmask.alpha;
1055 		fd = 255-ma;
1056 
1057 		if(grey){
1058 			*bdst.grey = MUL(ma, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
1059 			bsrc.grey += bsrc.delta;
1060 			bdst.grey += bdst.delta;
1061 		}else{
1062 			*bdst.red = MUL(ma, *bsrc.red, s)+MUL(fd, *bdst.red, t);
1063 			*bdst.grn = MUL(ma, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
1064 			*bdst.blu = MUL(ma, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
1065 			bsrc.red += bsrc.delta;
1066 			bsrc.blu += bsrc.delta;
1067 			bsrc.grn += bsrc.delta;
1068 			bdst.red += bdst.delta;
1069 			bdst.blu += bdst.delta;
1070 			bdst.grn += bdst.delta;
1071 		}
1072 		if(bdst.alpha != &ones){
1073 			*bdst.alpha = ma+MUL(fd, *bdst.alpha, t);
1074 			bdst.alpha += bdst.delta;
1075 		}
1076 		bmask.alpha += bmask.delta;
1077 	}
1078 	return obdst;
1079 }
1080 
1081 static Buffer
1082 boolcalc14(Buffer bdst, Buffer b1, Buffer bmask, int dx, int grey, int op)
1083 {
1084 	Buffer obdst;
1085 	int i, ma, zero;
1086 
1087 	obdst = bdst;
1088 
1089 	for(i=0; i<dx; i++){
1090 		ma = *bmask.alpha;
1091 		zero = ma ? op == DoutS : op == DinS;
1092 
1093 		if(grey){
1094 			if(zero)
1095 				*bdst.grey = 0;
1096 			bdst.grey += bdst.delta;
1097 		}else{
1098 			if(zero)
1099 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1100 			bdst.red += bdst.delta;
1101 			bdst.blu += bdst.delta;
1102 			bdst.grn += bdst.delta;
1103 		}
1104 		bmask.alpha += bmask.delta;
1105 		if(bdst.alpha != &ones){
1106 			if(zero)
1107 				*bdst.alpha = 0;
1108 			bdst.alpha += bdst.delta;
1109 		}
1110 	}
1111 	return obdst;
1112 }
1113 
1114 static Buffer
1115 boolcalc236789(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1116 {
1117 	Buffer obdst;
1118 	int fs, fd;
1119 	int i, ma, da, zero;
1120 	ulong s, t;
1121 
1122 	obdst = bdst;
1123 	zero = !(op&1);
1124 
1125 	for(i=0; i<dx; i++){
1126 		ma = *bmask.alpha;
1127 		da = *bdst.alpha;
1128 		fs = da;
1129 		if(op&2)
1130 			fs = 255-da;
1131 		fd = 0;
1132 		if(op&4)
1133 			fd = 255;
1134 
1135 		if(grey){
1136 			if(ma)
1137 				*bdst.grey = MUL(fs, *bsrc.grey, s)+MUL(fd, *bdst.grey, t);
1138 			else if(zero)
1139 				*bdst.grey = 0;
1140 			bsrc.grey += bsrc.delta;
1141 			bdst.grey += bdst.delta;
1142 		}else{
1143 			if(ma){
1144 				*bdst.red = MUL(fs, *bsrc.red, s)+MUL(fd, *bdst.red, t);
1145 				*bdst.grn = MUL(fs, *bsrc.grn, s)+MUL(fd, *bdst.grn, t);
1146 				*bdst.blu = MUL(fs, *bsrc.blu, s)+MUL(fd, *bdst.blu, t);
1147 			}
1148 			else if(zero)
1149 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1150 			bsrc.red += bsrc.delta;
1151 			bsrc.blu += bsrc.delta;
1152 			bsrc.grn += bsrc.delta;
1153 			bdst.red += bdst.delta;
1154 			bdst.blu += bdst.delta;
1155 			bdst.grn += bdst.delta;
1156 		}
1157 		bmask.alpha += bmask.delta;
1158 		if(bdst.alpha != &ones){
1159 			if(ma)
1160 				*bdst.alpha = fs+MUL(fd, da, t);
1161 			else if(zero)
1162 				*bdst.alpha = 0;
1163 			bdst.alpha += bdst.delta;
1164 		}
1165 	}
1166 	return obdst;
1167 }
1168 
1169 static Buffer
1170 boolcalc1011(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1171 {
1172 	Buffer obdst;
1173 	int i, ma, zero;
1174 
1175 	obdst = bdst;
1176 	zero = !(op&1);
1177 
1178 	for(i=0; i<dx; i++){
1179 		ma = *bmask.alpha;
1180 
1181 		if(grey){
1182 			if(ma)
1183 				*bdst.grey = *bsrc.grey;
1184 			else if(zero)
1185 				*bdst.grey = 0;
1186 			bsrc.grey += bsrc.delta;
1187 			bdst.grey += bdst.delta;
1188 		}else{
1189 			if(ma){
1190 				*bdst.red = *bsrc.red;
1191 				*bdst.grn = *bsrc.grn;
1192 				*bdst.blu = *bsrc.blu;
1193 			}
1194 			else if(zero)
1195 				*bdst.red = *bdst.grn = *bdst.blu = 0;
1196 			bsrc.red += bsrc.delta;
1197 			bsrc.blu += bsrc.delta;
1198 			bsrc.grn += bsrc.delta;
1199 			bdst.red += bdst.delta;
1200 			bdst.blu += bdst.delta;
1201 			bdst.grn += bdst.delta;
1202 		}
1203 		bmask.alpha += bmask.delta;
1204 		if(bdst.alpha != &ones){
1205 			if(ma)
1206 				*bdst.alpha = 255;
1207 			else if(zero)
1208 				*bdst.alpha = 0;
1209 			bdst.alpha += bdst.delta;
1210 		}
1211 	}
1212 	return obdst;
1213 }
1214 /*
1215  * Replicated cached scan line read.  Call the function listed in the Param,
1216  * but cache the result so that for replicated images we only do the work once.
1217  */
1218 static Buffer
1219 replread(Param *p, uchar *s, int y)
1220 {
1221 	Buffer *b;
1222 
1223 	USED(s);
1224 	b = &p->bcache[y];
1225 	if((p->bfilled & (1<<y)) == 0){
1226 		p->bfilled |= 1<<y;
1227 		*b = p->replcall(p, p->bufbase+y*p->bufdelta, y);
1228 	}
1229 	return *b;
1230 }
1231 
1232 /*
1233  * Alpha reading function that simply relabels the grey pointer.
1234  */
1235 static Buffer
1236 greymaskread(Param *p, uchar *buf, int y)
1237 {
1238 	Buffer b;
1239 
1240 	b = p->greymaskcall(p, buf, y);
1241 	b.alpha = b.grey;
1242 	return b;
1243 }
1244 
1245 #define DBG if(0)
1246 static Buffer
1247 readnbit(Param *p, uchar *buf, int y)
1248 {
1249 	Buffer b;
1250 	Memimage *img;
1251 	uchar *repl, *r, *w, *ow, bits;
1252 	int i, n, sh, depth, x, dx, npack, nbits;
1253 
1254 	b.rgba = (ulong*)buf;
1255 	b.grey = w = buf;
1256 	b.red = b.blu = b.grn = w;
1257 	b.alpha = &ones;
1258 	b.delta = 1;
1259 
1260 	dx = p->dx;
1261 	img = p->img;
1262 	depth = img->depth;
1263 	repl = &replbit[depth][0];
1264 	npack = 8/depth;
1265 	sh = 8-depth;
1266 
1267 	/* copy from p->r.min.x until end of repl rectangle */
1268 	x = p->r.min.x;
1269 	n = dx;
1270 	if(n > p->img->r.max.x - x)
1271 		n = p->img->r.max.x - x;
1272 
1273 	r = p->bytermin + y*p->bwidth;
1274 DBG print("readnbit dx %d %p=%p+%d*%d, *r=%d fetch %d ", dx, r, p->bytermin, y, p->bwidth, *r, n);
1275 	bits = *r++;
1276 	nbits = 8;
1277 	if((i=x&(npack-1))){
1278 DBG print("throwaway %d...", i);
1279 		bits <<= depth*i;
1280 		nbits -= depth*i;
1281 	}
1282 	for(i=0; i<n; i++){
1283 		if(nbits == 0){
1284 DBG print("(%.2ux)...", *r);
1285 			bits = *r++;
1286 			nbits = 8;
1287 		}
1288 		*w++ = repl[bits>>sh];
1289 DBG print("bit %x...", repl[bits>>sh]);
1290 		bits <<= depth;
1291 		nbits -= depth;
1292 	}
1293 	dx -= n;
1294 	if(dx == 0)
1295 		return b;
1296 
1297 	assert(x+i == p->img->r.max.x);
1298 
1299 	/* copy from beginning of repl rectangle until where we were before. */
1300 	x = p->img->r.min.x;
1301 	n = dx;
1302 	if(n > p->r.min.x - x)
1303 		n = p->r.min.x - x;
1304 
1305 	r = p->bytey0s + y*p->bwidth;
1306 DBG print("x=%d r=%p...", x, r);
1307 	bits = *r++;
1308 	nbits = 8;
1309 	if((i=x&(npack-1))){
1310 		bits <<= depth*i;
1311 		nbits -= depth*i;
1312 	}
1313 DBG print("nbits=%d...", nbits);
1314 	for(i=0; i<n; i++){
1315 		if(nbits == 0){
1316 			bits = *r++;
1317 			nbits = 8;
1318 		}
1319 		*w++ = repl[bits>>sh];
1320 DBG print("bit %x...", repl[bits>>sh]);
1321 		bits <<= depth;
1322 		nbits -= depth;
1323 DBG print("bits %x nbits %d...", bits, nbits);
1324 	}
1325 	dx -= n;
1326 	if(dx == 0)
1327 		return b;
1328 
1329 	assert(dx > 0);
1330 	/* now we have exactly one full scan line: just replicate the buffer itself until we are done */
1331 	ow = buf;
1332 	while(dx--)
1333 		*w++ = *ow++;
1334 
1335 	return b;
1336 }
1337 #undef DBG
1338 
1339 #define DBG if(0)
1340 static void
1341 writenbit(Param *p, uchar *w, Buffer src)
1342 {
1343 	uchar *r;
1344 	ulong bits;
1345 	int i, sh, depth, npack, nbits, x, ex;
1346 
1347 	assert(src.grey != nil && src.delta == 1);
1348 
1349 	x = p->r.min.x;
1350 	ex = x+p->dx;
1351 	depth = p->img->depth;
1352 	npack = 8/depth;
1353 
1354 	i=x&(npack-1);
1355 	bits = i ? (*w >> (8-depth*i)) : 0;
1356 	nbits = depth*i;
1357 	sh = 8-depth;
1358 	r = src.grey;
1359 
1360 	for(; x<ex; x++){
1361 		bits <<= depth;
1362 DBG print(" %x", *r);
1363 		bits |= (*r++ >> sh);
1364 		nbits += depth;
1365 		if(nbits == 8){
1366 			*w++ = bits;
1367 			nbits = 0;
1368 		}
1369 	}
1370 
1371 	if(nbits){
1372 		sh = 8-nbits;
1373 		bits <<= sh;
1374 		bits |= *w & ((1<<sh)-1);
1375 		*w = bits;
1376 	}
1377 DBG print("\n");
1378 	return;
1379 }
1380 #undef DBG
1381 
1382 static Buffer
1383 readcmap(Param *p, uchar *buf, int y)
1384 {
1385 	Buffer b;
1386 	int a, convgrey, copyalpha, dx, i, m;
1387 	uchar *q, *cmap, *begin, *end, *r, *w;
1388 
1389 	begin = p->bytey0s + y*p->bwidth;
1390 	r = p->bytermin + y*p->bwidth;
1391 	end = p->bytey0e + y*p->bwidth;
1392 	cmap = p->img->cmap->cmap2rgb;
1393 	convgrey = p->convgrey;
1394 	copyalpha = (p->img->flags&Falpha) ? 1 : 0;
1395 
1396 	w = buf;
1397 	dx = p->dx;
1398 	if(copyalpha){
1399 		b.alpha = buf++;
1400 		a = p->img->shift[CAlpha]/8;
1401 		m = p->img->shift[CMap]/8;
1402 		for(i=0; i<dx; i++){
1403 			*w++ = r[a];
1404 			q = cmap+r[m]*3;
1405 			r += 2;
1406 			if(r == end)
1407 				r = begin;
1408 			if(convgrey){
1409 				*w++ = RGB2K(q[0], q[1], q[2]);
1410 			}else{
1411 				*w++ = q[2];	/* blue */
1412 				*w++ = q[1];	/* green */
1413 				*w++ = q[0];	/* red */
1414 			}
1415 		}
1416 	}else{
1417 		b.alpha = &ones;
1418 		for(i=0; i<dx; i++){
1419 			q = cmap+*r++*3;
1420 			if(r == end)
1421 				r = begin;
1422 			if(convgrey){
1423 				*w++ = RGB2K(q[0], q[1], q[2]);
1424 			}else{
1425 				*w++ = q[2];	/* blue */
1426 				*w++ = q[1];	/* green */
1427 				*w++ = q[0];	/* red */
1428 			}
1429 		}
1430 	}
1431 
1432 	b.rgba = (ulong*)(buf-copyalpha);
1433 
1434 	if(convgrey){
1435 		b.grey = buf;
1436 		b.red = b.blu = b.grn = buf;
1437 		b.delta = 1+copyalpha;
1438 	}else{
1439 		b.blu = buf;
1440 		b.grn = buf+1;
1441 		b.red = buf+2;
1442 		b.grey = nil;
1443 		b.delta = 3+copyalpha;
1444 	}
1445 	return b;
1446 }
1447 
1448 static void
1449 writecmap(Param *p, uchar *w, Buffer src)
1450 {
1451 	uchar *cmap, *red, *grn, *blu;
1452 	int i, dx, delta;
1453 
1454 	cmap = p->img->cmap->rgb2cmap;
1455 
1456 	delta = src.delta;
1457 	red= src.red;
1458 	grn = src.grn;
1459 	blu = src.blu;
1460 
1461 	dx = p->dx;
1462 	for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta)
1463 		*w++ = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1464 }
1465 
1466 #define DBG if(0)
1467 static Buffer
1468 readbyte(Param *p, uchar *buf, int y)
1469 {
1470 	Buffer b;
1471 	Memimage *img;
1472 	int dx, isgrey, convgrey, alphaonly, copyalpha, i, nb;
1473 	uchar *begin, *end, *r, *w, *rrepl, *grepl, *brepl, *arepl, *krepl;
1474 	uchar ured, ugrn, ublu;
1475 	ulong u;
1476 
1477 	img = p->img;
1478 	begin = p->bytey0s + y*p->bwidth;
1479 	r = p->bytermin + y*p->bwidth;
1480 	end = p->bytey0e + y*p->bwidth;
1481 
1482 	w = buf;
1483 	dx = p->dx;
1484 	nb = img->depth/8;
1485 
1486 	convgrey = p->convgrey;	/* convert rgb to grey */
1487 	isgrey = img->flags&Fgrey;
1488 	alphaonly = p->alphaonly;
1489 	copyalpha = (img->flags&Falpha) ? 1 : 0;
1490 
1491 DBG print("copyalpha %d alphaonly %d convgrey %d isgrey %d\n", copyalpha, alphaonly, convgrey, isgrey);
1492 	/* if we can, avoid processing everything */
1493 	if(!(img->flags&Frepl) && !convgrey && (img->flags&Fbytes)){
1494 		memset(&b, 0, sizeof b);
1495 		if(p->needbuf){
1496 			memmove(buf, r, dx*nb);
1497 			r = buf;
1498 		}
1499 		b.rgba = (ulong*)r;
1500 		if(copyalpha)
1501 			b.alpha = r+img->shift[CAlpha]/8;
1502 		else
1503 			b.alpha = &ones;
1504 		if(isgrey){
1505 			b.grey = r+img->shift[CGrey]/8;
1506 			b.red = b.grn = b.blu = b.grey;
1507 		}else{
1508 			b.red = r+img->shift[CRed]/8;
1509 			b.grn = r+img->shift[CGreen]/8;
1510 			b.blu = r+img->shift[CBlue]/8;
1511 		}
1512 		b.delta = nb;
1513 		return b;
1514 	}
1515 
1516 DBG print("2\n");
1517 	rrepl = replbit[img->nbits[CRed]];
1518 	grepl = replbit[img->nbits[CGreen]];
1519 	brepl = replbit[img->nbits[CBlue]];
1520 	arepl = replbit[img->nbits[CAlpha]];
1521 	krepl = replbit[img->nbits[CGrey]];
1522 
1523 	for(i=0; i<dx; i++){
1524 		u = r[0] | (r[1]<<8) | (r[2]<<16) | (r[3]<<24);
1525 		if(copyalpha) {
1526 			*w++ = arepl[(u>>img->shift[CAlpha]) & img->mask[CAlpha]];
1527 DBG print("a %x\n", w[-1]);
1528 		}
1529 
1530 		if(isgrey)
1531 			*w++ = krepl[(u >> img->shift[CGrey]) & img->mask[CGrey]];
1532 		else if(!alphaonly){
1533 			ured = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1534 			ugrn = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1535 			ublu = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1536 			if(convgrey){
1537 DBG print("g %x %x %x\n", ured, ugrn, ublu);
1538 				*w++ = RGB2K(ured, ugrn, ublu);
1539 DBG print("%x\n", w[-1]);
1540 			}else{
1541 				*w++ = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1542 				*w++ = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1543 				*w++ = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1544 			}
1545 		}
1546 		r += nb;
1547 		if(r == end)
1548 			r = begin;
1549 	}
1550 
1551 	b.alpha = copyalpha ? buf : &ones;
1552 	b.rgba = (ulong*)buf;
1553 	if(alphaonly){
1554 		b.red = b.grn = b.blu = b.grey = nil;
1555 		if(!copyalpha)
1556 			b.rgba = nil;
1557 		b.delta = 1;
1558 	}else if(isgrey || convgrey){
1559 		b.grey = buf+copyalpha;
1560 		b.red = b.grn = b.blu = buf+copyalpha;
1561 		b.delta = copyalpha+1;
1562 DBG print("alpha %x grey %x\n", b.alpha ? *b.alpha : 0xFF, *b.grey);
1563 	}else{
1564 		b.blu = buf+copyalpha;
1565 		b.grn = buf+copyalpha+1;
1566 		b.grey = nil;
1567 		b.red = buf+copyalpha+2;
1568 		b.delta = copyalpha+3;
1569 	}
1570 	return b;
1571 }
1572 #undef DBG
1573 
1574 #define DBG if(0)
1575 static void
1576 writebyte(Param *p, uchar *w, Buffer src)
1577 {
1578 	Memimage *img;
1579 	int i, isalpha, isgrey, nb, delta, dx, adelta;
1580 	uchar ff, *red, *grn, *blu, *grey, *alpha;
1581 	ulong u, mask;
1582 
1583 	img = p->img;
1584 
1585 	red = src.red;
1586 	grn = src.grn;
1587 	blu = src.blu;
1588 	alpha = src.alpha;
1589 	delta = src.delta;
1590 	grey = src.grey;
1591 	dx = p->dx;
1592 
1593 	nb = img->depth/8;
1594 	mask = (nb==4) ? 0 : ~((1<<img->depth)-1);
1595 
1596 	isalpha = img->flags&Falpha;
1597 	isgrey = img->flags&Fgrey;
1598 	adelta = src.delta;
1599 
1600 	if(isalpha && (alpha == nil || alpha == &ones)){
1601 		ff = 0xFF;
1602 		alpha = &ff;
1603 		adelta = 0;
1604 	}
1605 
1606 	for(i=0; i<dx; i++){
1607 		u = w[0] | (w[1]<<8) | (w[2]<<16) | (w[3]<<24);
1608 DBG print("u %.8lux...", u);
1609 		u &= mask;
1610 DBG print("&mask %.8lux...", u);
1611 		if(isgrey){
1612 			u |= ((*grey >> (8-img->nbits[CGrey])) & img->mask[CGrey]) << img->shift[CGrey];
1613 DBG print("|grey %.8lux...", u);
1614 			grey += delta;
1615 		}else{
1616 			u |= ((*red >> (8-img->nbits[CRed])) & img->mask[CRed]) << img->shift[CRed];
1617 			u |= ((*grn >> (8-img->nbits[CGreen])) & img->mask[CGreen]) << img->shift[CGreen];
1618 			u |= ((*blu >> (8-img->nbits[CBlue])) & img->mask[CBlue]) << img->shift[CBlue];
1619 			red += delta;
1620 			grn += delta;
1621 			blu += delta;
1622 DBG print("|rgb %.8lux...", u);
1623 		}
1624 
1625 		if(isalpha){
1626 			u |= ((*alpha >> (8-img->nbits[CAlpha])) & img->mask[CAlpha]) << img->shift[CAlpha];
1627 			alpha += adelta;
1628 DBG print("|alpha %.8lux...", u);
1629 		}
1630 
1631 		w[0] = u;
1632 		w[1] = u>>8;
1633 		w[2] = u>>16;
1634 		w[3] = u>>24;
1635 		w += nb;
1636 	}
1637 }
1638 #undef DBG
1639 
1640 static Readfn*
1641 readfn(Memimage *img)
1642 {
1643 	if(img->depth < 8)
1644 		return readnbit;
1645 	if(img->nbits[CMap] == 8)
1646 		return readcmap;
1647 	return readbyte;
1648 }
1649 
1650 static Readfn*
1651 readalphafn(Memimage *m)
1652 {
1653 	USED(m);
1654 	return readbyte;
1655 }
1656 
1657 static Writefn*
1658 writefn(Memimage *img)
1659 {
1660 	if(img->depth < 8)
1661 		return writenbit;
1662 	if(img->chan == CMAP8)
1663 		return writecmap;
1664 	return writebyte;
1665 }
1666 
1667 static void
1668 nullwrite(Param *p, uchar *s, Buffer b)
1669 {
1670 	USED(p);
1671 	USED(s);
1672 }
1673 
1674 static Buffer
1675 readptr(Param *p, uchar *s, int y)
1676 {
1677 	Buffer b;
1678 	uchar *q;
1679 
1680 	USED(s);
1681 	q = p->bytermin + y*p->bwidth;
1682 	b.red = q;	/* ptr to data */
1683 	b.grn = b.blu = b.grey = b.alpha = nil;
1684 	b.rgba = (ulong*)q;
1685 	b.delta = p->img->depth/8;
1686 	return b;
1687 }
1688 
1689 static Buffer
1690 boolmemmove(Buffer bdst, Buffer bsrc, Buffer b1, int dx, int i, int o)
1691 {
1692 	USED(i);
1693 	USED(o);
1694 	memmove(bdst.red, bsrc.red, dx*bdst.delta);
1695 	return bdst;
1696 }
1697 
1698 static Buffer
1699 boolcopy8(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1700 {
1701 	uchar *m, *r, *w, *ew;
1702 
1703 	USED(i);
1704 	USED(o);
1705 	m = bmask.grey;
1706 	w = bdst.red;
1707 	r = bsrc.red;
1708 	ew = w+dx;
1709 	for(; w < ew; w++,r++)
1710 		if(*m++)
1711 			*w = *r;
1712 	return bdst;	/* not used */
1713 }
1714 
1715 static Buffer
1716 boolcopy16(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1717 {
1718 	uchar *m;
1719 	ushort *r, *w, *ew;
1720 
1721 	USED(i);
1722 	USED(o);
1723 	m = bmask.grey;
1724 	w = (ushort*)bdst.red;
1725 	r = (ushort*)bsrc.red;
1726 	ew = w+dx;
1727 	for(; w < ew; w++,r++)
1728 		if(*m++)
1729 			*w = *r;
1730 	return bdst;	/* not used */
1731 }
1732 
1733 static Buffer
1734 boolcopy24(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1735 {
1736 	uchar *m;
1737 	uchar *r, *w, *ew;
1738 
1739 	USED(i);
1740 	USED(o);
1741 	m = bmask.grey;
1742 	w = bdst.red;
1743 	r = bsrc.red;
1744 	ew = w+dx*3;
1745 	while(w < ew){
1746 		if(*m++){
1747 			*w++ = *r++;
1748 			*w++ = *r++;
1749 			*w++ = *r++;
1750 		}else{
1751 			w += 3;
1752 			r += 3;
1753 		}
1754 	}
1755 	return bdst;	/* not used */
1756 }
1757 
1758 static Buffer
1759 boolcopy32(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1760 {
1761 	uchar *m;
1762 	ulong *r, *w, *ew;
1763 
1764 	USED(i);
1765 	USED(o);
1766 	m = bmask.grey;
1767 	w = (ulong*)bdst.red;
1768 	r = (ulong*)bsrc.red;
1769 	ew = w+dx;
1770 	for(; w < ew; w++,r++)
1771 		if(*m++)
1772 			*w = *r;
1773 	return bdst;	/* not used */
1774 }
1775 
1776 static Buffer
1777 genconv(Param *p, uchar *buf, int y)
1778 {
1779 	Buffer b;
1780 	int nb;
1781 	uchar *r, *w, *ew;
1782 
1783 	/* read from source into RGB format in convbuf */
1784 	b = p->convreadcall(p, p->convbuf, y);
1785 
1786 	/* write RGB format into dst format in buf */
1787 	p->convwritecall(p->convdpar, buf, b);
1788 
1789 	if(p->convdx){
1790 		nb = p->convdpar->img->depth/8;
1791 		r = buf;
1792 		w = buf+nb*p->dx;
1793 		ew = buf+nb*p->convdx;
1794 		while(w<ew)
1795 			*w++ = *r++;
1796 	}
1797 
1798 	b.red = buf;
1799 	b.blu = b.grn = b.grey = b.alpha = nil;
1800 	b.rgba = (ulong*)buf;
1801 	b.delta = 0;
1802 
1803 	return b;
1804 }
1805 
1806 static Readfn*
1807 convfn(Memimage *dst, Param *dpar, Memimage *src, Param *spar)
1808 {
1809 	if(dst->chan == src->chan && !(src->flags&Frepl)){
1810 //if(drawdebug) iprint("readptr...");
1811 		return readptr;
1812 	}
1813 
1814 	if(dst->chan==CMAP8 && (src->chan==GREY1||src->chan==GREY2||src->chan==GREY4)){
1815 		/* cheat because we know the replicated value is exactly the color map entry. */
1816 //if(drawdebug) iprint("Readnbit...");
1817 		return readnbit;
1818 	}
1819 
1820 	spar->convreadcall = readfn(src);
1821 	spar->convwritecall = writefn(dst);
1822 	spar->convdpar = dpar;
1823 
1824 	/* allocate a conversion buffer */
1825 	spar->convbufoff = ndrawbuf;
1826 	ndrawbuf += spar->dx*4;
1827 
1828 	if(spar->dx > Dx(spar->img->r)){
1829 		spar->convdx = spar->dx;
1830 		spar->dx = Dx(spar->img->r);
1831 	}
1832 
1833 //if(drawdebug) iprint("genconv...");
1834 	return genconv;
1835 }
1836 
1837 ulong
1838 _pixelbits(Memimage *i, Point pt)
1839 {
1840 	uchar *p;
1841 	ulong val;
1842 	int off, bpp, npack;
1843 
1844 	val = 0;
1845 	p = byteaddr(i, pt);
1846 	switch(bpp=i->depth){
1847 	case 1:
1848 	case 2:
1849 	case 4:
1850 		npack = 8/bpp;
1851 		off = pt.x%npack;
1852 		val = p[0] >> bpp*(npack-1-off);
1853 		val &= (1<<bpp)-1;
1854 		break;
1855 	case 8:
1856 		val = p[0];
1857 		break;
1858 	case 16:
1859 		val = p[0]|(p[1]<<8);
1860 		break;
1861 	case 24:
1862 		val = p[0]|(p[1]<<8)|(p[2]<<16);
1863 		break;
1864 	case 32:
1865 		val = p[0]|(p[1]<<8)|(p[2]<<16)|(p[3]<<24);
1866 		break;
1867 	}
1868 	while(bpp<32){
1869 		val |= val<<bpp;
1870 		bpp *= 2;
1871 	}
1872 	return val;
1873 }
1874 
1875 static Calcfn*
1876 boolcopyfn(Memimage *img, Memimage *mask)
1877 {
1878 	if(mask->flags&Frepl && Dx(mask->r)==1 && Dy(mask->r)==1 && pixelbits(mask, mask->r.min)==~0)
1879 		return boolmemmove;
1880 
1881 	switch(img->depth){
1882 	case 8:
1883 		return boolcopy8;
1884 	case 16:
1885 		return boolcopy16;
1886 	case 24:
1887 		return boolcopy24;
1888 	case 32:
1889 		return boolcopy32;
1890 	default:
1891 		assert(0 /* boolcopyfn */);
1892 	}
1893 	return 0;
1894 }
1895 
1896 /*
1897  * Optimized draw for filling and scrolling; uses memset and memmove.
1898  *
1899 static void
1900 memsetb(void *vp, uchar val, int n)
1901 {
1902 	uchar *p, *ep;
1903 
1904 	p = vp;
1905 	ep = p+n;
1906 	while(p<ep)
1907 		*p++ = val;
1908 }
1909 */
1910 
1911 static void
1912 memsets(void *vp, ushort val, int n)
1913 {
1914 	ushort *p, *ep;
1915 
1916 	p = vp;
1917 	ep = p+n;
1918 	while(p<ep)
1919 		*p++ = val;
1920 }
1921 
1922 static void
1923 memsetl(void *vp, ulong val, int n)
1924 {
1925 	ulong *p, *ep;
1926 
1927 	p = vp;
1928 	ep = p+n;
1929 	while(p<ep)
1930 		*p++ = val;
1931 }
1932 
1933 static void
1934 memset24(void *vp, ulong val, int n)
1935 {
1936 	uchar *p, *ep;
1937 	uchar a,b,c;
1938 
1939 	p = vp;
1940 	ep = p+3*n;
1941 	a = val;
1942 	b = val>>8;
1943 	c = val>>16;
1944 	while(p<ep){
1945 		*p++ = a;
1946 		*p++ = b;
1947 		*p++ = c;
1948 	}
1949 }
1950 
1951 ulong
1952 _imgtorgba(Memimage *img, ulong val)
1953 {
1954 	uchar r, g, b, a;
1955 	int nb, ov, v;
1956 	ulong chan;
1957 	uchar *p;
1958 
1959 	a = 0xFF;
1960 	r = g = b = 0xAA;	/* garbage */
1961 	for(chan=img->chan; chan; chan>>=8){
1962 		nb = NBITS(chan);
1963 		ov = v = val&((1<<nb)-1);
1964 		val >>= nb;
1965 
1966 		while(nb < 8){
1967 			v |= v<<nb;
1968 			nb *= 2;
1969 		}
1970 		v >>= (nb-8);
1971 
1972 		switch(TYPE(chan)){
1973 		case CRed:
1974 			r = v;
1975 			break;
1976 		case CGreen:
1977 			g = v;
1978 			break;
1979 		case CBlue:
1980 			b = v;
1981 			break;
1982 		case CAlpha:
1983 			a = v;
1984 			break;
1985 		case CGrey:
1986 			r = g = b = v;
1987 			break;
1988 		case CMap:
1989 			p = img->cmap->cmap2rgb+3*ov;
1990 			r = *p++;
1991 			g = *p++;
1992 			b = *p;
1993 			break;
1994 		}
1995 	}
1996 	return (r<<24)|(g<<16)|(b<<8)|a;
1997 }
1998 
1999 ulong
2000 _rgbatoimg(Memimage *img, ulong rgba)
2001 {
2002 	ulong chan;
2003 	int d, nb;
2004 	ulong v;
2005 	uchar *p, r, g, b, a, m;
2006 
2007 	v = 0;
2008 	r = rgba>>24;
2009 	g = rgba>>16;
2010 	b = rgba>>8;
2011 	a = rgba;
2012 	d = 0;
2013 	for(chan=img->chan; chan; chan>>=8){
2014 		nb = NBITS(chan);
2015 		switch(TYPE(chan)){
2016 		case CRed:
2017 			v |= (r>>(8-nb))<<d;
2018 			break;
2019 		case CGreen:
2020 			v |= (g>>(8-nb))<<d;
2021 			break;
2022 		case CBlue:
2023 			v |= (b>>(8-nb))<<d;
2024 			break;
2025 		case CAlpha:
2026 			v |= (a>>(8-nb))<<d;
2027 			break;
2028 		case CMap:
2029 			p = img->cmap->rgb2cmap;
2030 			m = p[(r>>4)*256+(g>>4)*16+(b>>4)];
2031 			v |= (m>>(8-nb))<<d;
2032 			break;
2033 		case CGrey:
2034 			m = RGB2K(r,g,b);
2035 			v |= (m>>(8-nb))<<d;
2036 			break;
2037 		}
2038 		d += nb;
2039 	}
2040 //	print("rgba2img %.8lux = %.*lux\n", rgba, 2*d/8, v);
2041 	return v;
2042 }
2043 
2044 #define DBG if(0)
2045 static int
2046 memoptdraw(Memdrawparam *par)
2047 {
2048 	int m, y, dy, dx, op;
2049 	ulong v;
2050 	Memimage *src;
2051 	Memimage *dst;
2052 
2053 	dx = Dx(par->r);
2054 	dy = Dy(par->r);
2055 	src = par->src;
2056 	dst = par->dst;
2057 	op = par->op;
2058 
2059 DBG print("state %lux mval %lux dd %d\n", par->state, par->mval, dst->depth);
2060 	/*
2061 	 * If we have an opaque mask and source is one opaque pixel we can convert to the
2062 	 * destination format and just replicate with memset.
2063 	 */
2064 	m = Simplesrc|Simplemask|Fullmask;
2065 	if((par->state&m)==m && (par->srgba&0xFF) == 0xFF && (op ==S || op == SoverD)){
2066 		uchar *dp, p[4];
2067 		int d, dwid, ppb, np, nb;
2068 		uchar lm, rm;
2069 
2070 DBG print("memopt, dst %p, dst->data->bdata %p\n", dst, dst->data->bdata);
2071 		dwid = dst->width*sizeof(ulong);
2072 		dp = byteaddr(dst, par->r.min);
2073 		v = par->sdval;
2074 DBG print("sdval %lud, depth %d\n", v, dst->depth);
2075 		switch(dst->depth){
2076 		case 1:
2077 		case 2:
2078 		case 4:
2079 			for(d=dst->depth; d<8; d*=2)
2080 				v |= (v<<d);
2081 			ppb = 8/dst->depth;	/* pixels per byte */
2082 			m = ppb-1;
2083 			/* left edge */
2084 			np = par->r.min.x&m;		/* no. pixels unused on left side of word */
2085 			dx -= (ppb-np);
2086 			nb = 8 - np * dst->depth;		/* no. bits used on right side of word */
2087 			lm = (1<<nb)-1;
2088 DBG print("np %d x %d nb %d lm %ux ppb %d m %ux\n", np, par->r.min.x, nb, lm, ppb, m);
2089 
2090 			/* right edge */
2091 			np = par->r.max.x&m;	/* no. pixels used on left side of word */
2092 			dx -= np;
2093 			nb = 8 - np * dst->depth;		/* no. bits unused on right side of word */
2094 			rm = ~((1<<nb)-1);
2095 DBG print("np %d x %d nb %d rm %ux ppb %d m %ux\n", np, par->r.max.x, nb, rm, ppb, m);
2096 
2097 DBG print("dx %d Dx %d\n", dx, Dx(par->r));
2098 			/* lm, rm are masks that are 1 where we should touch the bits */
2099 			if(dx < 0){	/* just one byte */
2100 				lm &= rm;
2101 				for(y=0; y<dy; y++, dp+=dwid)
2102 					*dp ^= (v ^ *dp) & lm;
2103 			}else if(dx == 0){	/* no full bytes */
2104 				if(lm)
2105 					dwid--;
2106 
2107 				for(y=0; y<dy; y++, dp+=dwid){
2108 					if(lm){
2109 DBG print("dp %p v %lux lm %ux (v ^ *dp) & lm %lux\n", dp, v, lm, (v^*dp)&lm);
2110 						*dp ^= (v ^ *dp) & lm;
2111 						dp++;
2112 					}
2113 					*dp ^= (v ^ *dp) & rm;
2114 				}
2115 			}else{		/* full bytes in middle */
2116 				dx /= ppb;
2117 				if(lm)
2118 					dwid--;
2119 				dwid -= dx;
2120 
2121 				for(y=0; y<dy; y++, dp+=dwid){
2122 					if(lm){
2123 						*dp ^= (v ^ *dp) & lm;
2124 						dp++;
2125 					}
2126 					memset(dp, v, dx);
2127 					dp += dx;
2128 					*dp ^= (v ^ *dp) & rm;
2129 				}
2130 			}
2131 			return 1;
2132 		case 8:
2133 			for(y=0; y<dy; y++, dp+=dwid)
2134 				memset(dp, v, dx);
2135 			return 1;
2136 		case 16:
2137 			p[0] = v;		/* make little endian */
2138 			p[1] = v>>8;
2139 			v = *(ushort*)p;
2140 DBG print("dp=%p; dx=%d; for(y=0; y<%d; y++, dp+=%d)\nmemsets(dp, v, dx);\n",
2141 	dp, dx, dy, dwid);
2142 			for(y=0; y<dy; y++, dp+=dwid)
2143 				memsets(dp, v, dx);
2144 			return 1;
2145 		case 24:
2146 			for(y=0; y<dy; y++, dp+=dwid)
2147 				memset24(dp, v, dx);
2148 			return 1;
2149 		case 32:
2150 			p[0] = v;		/* make little endian */
2151 			p[1] = v>>8;
2152 			p[2] = v>>16;
2153 			p[3] = v>>24;
2154 			v = *(ulong*)p;
2155 			for(y=0; y<dy; y++, dp+=dwid)
2156 				memsetl(dp, v, dx);
2157 			return 1;
2158 		default:
2159 			assert(0 /* bad dest depth in memoptdraw */);
2160 		}
2161 	}
2162 
2163 	/*
2164 	 * If no source alpha, an opaque mask, we can just copy the
2165 	 * source onto the destination.  If the channels are the same and
2166 	 * the source is not replicated, memmove suffices.
2167 	 */
2168 	m = Simplemask|Fullmask;
2169 	if((par->state&(m|Replsrc))==m && src->depth >= 8
2170 	&& src->chan == dst->chan && !(src->flags&Falpha) && (op == S || op == SoverD)){
2171 		uchar *sp, *dp;
2172 		long swid, dwid, nb;
2173 		int dir;
2174 
2175 		if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min))
2176 			dir = -1;
2177 		else
2178 			dir = 1;
2179 
2180 		swid = src->width*sizeof(ulong);
2181 		dwid = dst->width*sizeof(ulong);
2182 		sp = byteaddr(src, par->sr.min);
2183 		dp = byteaddr(dst, par->r.min);
2184 		if(dir == -1){
2185 			sp += (dy-1)*swid;
2186 			dp += (dy-1)*dwid;
2187 			swid = -swid;
2188 			dwid = -dwid;
2189 		}
2190 		nb = (dx*src->depth)/8;
2191 		for(y=0; y<dy; y++, sp+=swid, dp+=dwid)
2192 			memmove(dp, sp, nb);
2193 		return 1;
2194 	}
2195 
2196 	/*
2197 	 * If we have a 1-bit mask, 1-bit source, and 1-bit destination, and
2198 	 * they're all bit aligned, we can just use bit operators.  This happens
2199 	 * when we're manipulating boolean masks, e.g. in the arc code.
2200 	 */
2201 	if((par->state&(Simplemask|Simplesrc|Replmask|Replsrc))==0
2202 	&& dst->chan==GREY1 && src->chan==GREY1 && par->mask->chan==GREY1
2203 	&& (par->r.min.x&7)==(par->sr.min.x&7) && (par->r.min.x&7)==(par->mr.min.x&7)){
2204 		uchar *sp, *dp, *mp;
2205 		uchar lm, rm;
2206 		long swid, dwid, mwid;
2207 		int i, x, dir;
2208 
2209 		sp = byteaddr(src, par->sr.min);
2210 		dp = byteaddr(dst, par->r.min);
2211 		mp = byteaddr(par->mask, par->mr.min);
2212 		swid = src->width*sizeof(ulong);
2213 		dwid = dst->width*sizeof(ulong);
2214 		mwid = par->mask->width*sizeof(ulong);
2215 
2216 		if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min)){
2217 			dir = -1;
2218 		}else
2219 			dir = 1;
2220 
2221 		lm = 0xFF>>(par->r.min.x&7);
2222 		rm = 0xFF<<(8-(par->r.max.x&7));
2223 		dx -= (8-(par->r.min.x&7)) + (par->r.max.x&7);
2224 
2225 		if(dx < 0){	/* one byte wide */
2226 			lm &= rm;
2227 			if(dir == -1){
2228 				dp += dwid*(dy-1);
2229 				sp += swid*(dy-1);
2230 				mp += mwid*(dy-1);
2231 				dwid = -dwid;
2232 				swid = -swid;
2233 				mwid = -mwid;
2234 			}
2235 			for(y=0; y<dy; y++){
2236 				*dp ^= (*dp ^ *sp) & *mp & lm;
2237 				dp += dwid;
2238 				sp += swid;
2239 				mp += mwid;
2240 			}
2241 			return 1;
2242 		}
2243 
2244 		dx /= 8;
2245 		if(dir == 1){
2246 			i = (lm!=0)+dx+(rm!=0);
2247 			mwid -= i;
2248 			swid -= i;
2249 			dwid -= i;
2250 			for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2251 				if(lm){
2252 					*dp ^= (*dp ^ *sp++) & *mp++ & lm;
2253 					dp++;
2254 				}
2255 				for(x=0; x<dx; x++){
2256 					*dp ^= (*dp ^ *sp++) & *mp++;
2257 					dp++;
2258 				}
2259 				if(rm){
2260 					*dp ^= (*dp ^ *sp++) & *mp++ & rm;
2261 					dp++;
2262 				}
2263 			}
2264 			return 1;
2265 		}else{
2266 		/* dir == -1 */
2267 			i = (lm!=0)+dx+(rm!=0);
2268 			dp += dwid*(dy-1)+i-1;
2269 			sp += swid*(dy-1)+i-1;
2270 			mp += mwid*(dy-1)+i-1;
2271 			dwid = -dwid+i;
2272 			swid = -swid+i;
2273 			mwid = -mwid+i;
2274 			for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2275 				if(rm){
2276 					*dp ^= (*dp ^ *sp--) & *mp-- & rm;
2277 					dp--;
2278 				}
2279 				for(x=0; x<dx; x++){
2280 					*dp ^= (*dp ^ *sp--) & *mp--;
2281 					dp--;
2282 				}
2283 				if(lm){
2284 					*dp ^= (*dp ^ *sp--) & *mp-- & lm;
2285 					dp--;
2286 				}
2287 			}
2288 		}
2289 		return 1;
2290 	}
2291 	return 0;
2292 }
2293 #undef DBG
2294 
2295 /*
2296  * Boolean character drawing.
2297  * Solid opaque color through a 1-bit greyscale mask.
2298  */
2299 #define DBG if(0)
2300 static int
2301 chardraw(Memdrawparam *par)
2302 {
2303 	ulong bits;
2304 	int i, ddepth, dy, dx, x, bx, ex, y, npack, bsh, depth, op;
2305 	ulong v, maskwid, dstwid;
2306 	uchar *wp, *rp, *q, *wc;
2307 	ushort *ws;
2308 	ulong *wl;
2309 	uchar sp[4];
2310 	Rectangle r, mr;
2311 	Memimage *mask, *src, *dst;
2312 
2313 if(0) if(drawdebug) iprint("chardraw? mf %lux md %d sf %lux dxs %d dys %d dd %d ddat %p sdat %p\n",
2314 		par->mask->flags, par->mask->depth, par->src->flags,
2315 		Dx(par->src->r), Dy(par->src->r), par->dst->depth, par->dst->data, par->src->data);
2316 
2317 	mask = par->mask;
2318 	src = par->src;
2319 	dst = par->dst;
2320 	r = par->r;
2321 	mr = par->mr;
2322 	op = par->op;
2323 
2324 	if((par->state&(Replsrc|Simplesrc|Replmask)) != (Replsrc|Simplesrc)
2325 	|| mask->depth != 1 || src->flags&Falpha || dst->depth<8 || dst->data==src->data
2326 	|| op != SoverD)
2327 		return 0;
2328 
2329 //if(drawdebug) iprint("chardraw...");
2330 
2331 	depth = mask->depth;
2332 	maskwid = mask->width*sizeof(ulong);
2333 	rp = byteaddr(mask, mr.min);
2334 	npack = 8/depth;
2335 	bsh = (mr.min.x % npack) * depth;
2336 
2337 	wp = byteaddr(dst, r.min);
2338 	dstwid = dst->width*sizeof(ulong);
2339 DBG print("bsh %d\n", bsh);
2340 	dy = Dy(r);
2341 	dx = Dx(r);
2342 
2343 	ddepth = dst->depth;
2344 
2345 	/*
2346 	 * for loop counts from bsh to bsh+dx
2347 	 *
2348 	 * we want the bottom bits to be the amount
2349 	 * to shift the pixels down, so for n≡0 (mod 8) we want
2350 	 * bottom bits 7.  for n≡1, 6, etc.
2351 	 * the bits come from -n-1.
2352 	 */
2353 
2354 	bx = -bsh-1;
2355 	ex = -bsh-1-dx;
2356 	bits = 0;
2357 	v = par->sdval;
2358 
2359 	/* make little endian */
2360 	sp[0] = v;
2361 	sp[1] = v>>8;
2362 	sp[2] = v>>16;
2363 	sp[3] = v>>24;
2364 
2365 //print("sp %x %x %x %x\n", sp[0], sp[1], sp[2], sp[3]);
2366 	for(y=0; y<dy; y++, rp+=maskwid, wp+=dstwid){
2367 		q = rp;
2368 		if(bsh)
2369 			bits = *q++;
2370 		switch(ddepth){
2371 		case 8:
2372 //if(drawdebug) iprint("8loop...");
2373 			wc = wp;
2374 			for(x=bx; x>ex; x--, wc++){
2375 				i = x&7;
2376 				if(i == 8-1)
2377 					bits = *q++;
2378 DBG print("bits %lux sh %d...", bits, i);
2379 				if((bits>>i)&1)
2380 					*wc = v;
2381 			}
2382 			break;
2383 		case 16:
2384 			ws = (ushort*)wp;
2385 			v = *(ushort*)sp;
2386 			for(x=bx; x>ex; x--, ws++){
2387 				i = x&7;
2388 				if(i == 8-1)
2389 					bits = *q++;
2390 DBG print("bits %lux sh %d...", bits, i);
2391 				if((bits>>i)&1)
2392 					*ws = v;
2393 			}
2394 			break;
2395 		case 24:
2396 			wc = wp;
2397 			for(x=bx; x>ex; x--, wc+=3){
2398 				i = x&7;
2399 				if(i == 8-1)
2400 					bits = *q++;
2401 DBG print("bits %lux sh %d...", bits, i);
2402 				if((bits>>i)&1){
2403 					wc[0] = sp[0];
2404 					wc[1] = sp[1];
2405 					wc[2] = sp[2];
2406 				}
2407 			}
2408 			break;
2409 		case 32:
2410 			wl = (ulong*)wp;
2411 			v = *(ulong*)sp;
2412 			for(x=bx; x>ex; x--, wl++){
2413 				i = x&7;
2414 				if(i == 8-1)
2415 					bits = *q++;
2416 DBG iprint("bits %lux sh %d...", bits, i);
2417 				if((bits>>i)&1)
2418 					*wl = v;
2419 			}
2420 			break;
2421 		}
2422 	}
2423 
2424 DBG print("\n");
2425 	return 1;
2426 }
2427 #undef DBG
2428 
2429 
2430 /*
2431  * Fill entire byte with replicated (if necessary) copy of source pixel,
2432  * assuming destination ldepth is >= source ldepth.
2433  *
2434  * This code is just plain wrong for >8bpp.
2435  *
2436 ulong
2437 membyteval(Memimage *src)
2438 {
2439 	int i, val, bpp;
2440 	uchar uc;
2441 
2442 	unloadmemimage(src, src->r, &uc, 1);
2443 	bpp = src->depth;
2444 	uc <<= (src->r.min.x&(7/src->depth))*src->depth;
2445 	uc &= ~(0xFF>>bpp);
2446 	// pixel value is now in high part of byte. repeat throughout byte
2447 	val = uc;
2448 	for(i=bpp; i<8; i<<=1)
2449 		val |= val>>i;
2450 	return val;
2451 }
2452  *
2453  */
2454 
2455 void
2456 _memfillcolor(Memimage *i, ulong val)
2457 {
2458 	ulong bits;
2459 	int d, y;
2460 	uchar p[4];
2461 
2462 	if(val == DNofill)
2463 		return;
2464 
2465 	bits = _rgbatoimg(i, val);
2466 	switch(i->depth){
2467 	case 24:	/* 24-bit images suck */
2468 		for(y=i->r.min.y; y<i->r.max.y; y++)
2469 			memset24(byteaddr(i, Pt(i->r.min.x, y)), bits, Dx(i->r));
2470 		break;
2471 	default:	/* 1, 2, 4, 8, 16, 32 */
2472 		for(d=i->depth; d<32; d*=2)
2473 			bits = (bits << d) | bits;
2474 		p[0] = bits;		/* make little endian */
2475 		p[1] = bits>>8;
2476 		p[2] = bits>>16;
2477 		p[3] = bits>>24;
2478 		bits = *(ulong*)p;
2479 		memsetl(wordaddr(i, i->r.min), bits, i->width*Dy(i->r));
2480 		break;
2481 	}
2482 }
2483 
2484