xref: /netbsd-src/sys/arch/luna68k/dev/omrasops.c (revision cf9938ad3aa07cb249c1e1681fee771a5b65d17d)
1 /* $NetBSD: omrasops.c,v 1.27 2024/09/20 03:24:05 isaki Exp $ */
2 
3 /*-
4  * Copyright (c) 2000 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Tohru Nishimura.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
33 
34 __KERNEL_RCSID(0, "$NetBSD: omrasops.c,v 1.27 2024/09/20 03:24:05 isaki Exp $");
35 
36 /*
37  * Designed specifically for 'm68k bitorder';
38  *	- most significant byte is stored at lower address,
39  *	- most significant bit is displayed at left most on screen.
40  * Implementation relies on;
41  *	- first column is at 32bit aligned address,
42  *	- font glyphs are stored in 32bit padded.
43  */
44 /*
45  * BMSEL affects both of
46  * 1) which plane a write to the common bitmap plane is reflected on and
47  * 2) which plane's ROP a write to the common ROP is reflected on.
48  *
 * The common ROP is not a ROP that is applied to writes to the common
 * bitmap plane.  Writing to it is equivalent to setting the ROP of each
 * plane selected by the plane mask, one by one.
52  */
53 
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/device.h>
57 
58 #include <dev/wscons/wsconsio.h>
59 #include <dev/wscons/wsdisplayvar.h>
60 #include <dev/rasops/rasops.h>
61 
62 #include <arch/luna68k/dev/omrasopsvar.h>
63 
64 #ifdef luna68k
65 #define USE_M68K_ASM	1
66 #endif
67 
/*
 * ASSUME(cond): give the compiler an optimization hint that cond holds.
 *
 * Every variant is used as a single statement followed by a semicolon.
 * The GCC variant is wrapped in do { } while (0) so that it is safe as the
 * body of an unbraced if/else; a bare "if (!(cond)) ..." expansion would
 * capture a following "else".
 */
#if defined(__GNUC__)
#define ASSUME(cond)	do { if (!(cond)) __unreachable(); } while (0)
#elif defined(__clang__) && __has_builtin(__builtin_assume)
#define ASSUME(cond)	__builtin_assume(cond)
#else
#define ASSUME(cond)	(void)(cond)
#endif
76 
/* XXX it should be redesigned, including making the attributes support 8bpp */
/*
 * Per-row color summary; one element of rowattr[] per text row.
 *
 * om_set_rowattr() records the fg/bg pair used on a row and sets ismulti
 * once more than one pair has been seen; om_reset_rowattr() marks a row as
 * erased by storing fg == bg.
 */
typedef struct {
	union {
		int32_t all;	/* the four byte fields below as one word */
		struct {
			int8_t ismulti; /* is multi color used */
			uint8_t fg;	/* foreground color of the row */
			uint8_t bg;	/* background color of the row */
			uint8_t reserved;
		};
	};
} rowattr_t;
89 
90 /* wscons emulator operations */
91 static void	om_cursor(void *, int, int, int);
92 static int	om_mapchar(void *, int, u_int *);
93 static void	om_putchar(void *, int, int, u_int, long);
94 static void	om1_copycols(void *, int, int, int, int);
95 static void	om4_copycols(void *, int, int, int, int);
96 static void	om1_copyrows(void *, int, int, int num);
97 static void	om4_copyrows(void *, int, int, int num);
98 static void	om_erasecols(void *, int, int, int, long);
99 static void	om_eraserows(void *, int, int, long);
100 static int	om_allocattr(void *, int, int, int, long *);
101 
102 static void	om_fill(int, int, uint8_t *, int, int, uint32_t, int, int);
103 static void	om_fill_color(int, int, uint8_t *, int, int, int, int);
104 static void	om_rascopy_single(int, uint8_t *, uint8_t *, int16_t, int16_t,
105     uint8_t[]);
106 static void	om4_rascopy_multi(uint8_t *, uint8_t *, int16_t, int16_t);
107 static void	om_unpack_attr(long, uint8_t *, uint8_t *, int *);
108 
109 static int	omrasops_init(struct rasops_info *, int, int);
110 
111 /*
112  * XXX should be fixed...
113  * This number of elements is derived from howmany(1024, fontheight = 24).
114  * But it is currently initialized with row = 34, so it is used only up to 34.
115  */
116 #define OMRASOPS_MAX_ROWS	43
117 static rowattr_t rowattr[OMRASOPS_MAX_ROWS];
118 
119 #define	ALL1BITS	(~0U)
120 #define	ALL0BITS	(0U)
121 #define	BLITWIDTH	(32)
122 #define	ALIGNMASK	(0x1f)
123 #define	BYTESDONE	(4)
124 
125 #if 0 /* XXX not used yet */
126 /*
127  * internal attributes. see om_allocattr().
128  */
129 #define OMFB_ATTR_MULTICOLOR		(1U << 31)
130 #define OMFB_ATTR_UNDERLINE		(1U << 17)
131 #define OMFB_ATTR_BOLD			(1U << 16)
132 #endif
133 
134 /*
135  * XXX deprecated.
136  * This way cannot be extended to 8bpp, so don't use it in new code.
137  */
138 #define P0(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 1))
139 #define P1(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 2))
140 #define P2(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 3))
141 #define P3(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 4))
142 
143 /*
144  * macros to handle unaligned bit copy ops.
145  * See src/sys/dev/rasops/rasops_masks.h for MI version.
146  * Also refer src/sys/arch/hp300/dev/maskbits.h.
147  * (which was implemented for ancient src/sys/arch/hp300/dev/grf_hy.c)
148  */
149 
150 /* luna68k version GETBITS() that gets w bits from bit x at psrc memory */
151 #define	FASTGETBITS(psrc, x, w, dst)					\
152 	asm("bfextu %3{%1:%2},%0"					\
153 	    : "=d" (dst)						\
154 	    : "di" (x), "di" (w), "o" (*(uint32_t *)(psrc)))
155 
156 /* luna68k version PUTBITS() that puts w bits from bit x at pdst memory */
157 /* XXX this macro assumes (x + w) <= 32 to handle unaligned residual bits */
158 #define	FASTPUTBITS(src, x, w, pdst)					\
159 	asm("bfins %3,%0{%1:%2}"					\
160 	    : "+o" (*(uint32_t *)(pdst))				\
161 	    : "di" (x), "di" (w), "d" (src)				\
162 	    : "memory" )
163 
164 #define	GETBITS(psrc, x, w, dst)	FASTGETBITS(psrc, x, w, dst)
165 #define	PUTBITS(src, x, w, pdst)	FASTPUTBITS(src, x, w, pdst)
166 
/*
 * Clear the lower w bits of x (x is modified in place).
 * The lower w bits of x must all be 1 on entry.
 *
 * Both variants clear bit w and then add 1: the carry runs through the
 * w one-bits below it, turning them into zeros, and ends up setting bit w.
 * Bits above bit w are left as they were.
 *
 * In the C variant w must be less than 32 because it is used as a shift
 * count of a 32-bit value.
 */
#if USE_M68K_ASM
#define CLEAR_LOWER_BITS(x, w)						\
	asm volatile(							\
	"	bclr	%[width],%[data]	;\n"			\
	"	addq.l	#1,%[data]		;\n"			\
	    : [data] "+&d" (x)						\
	    : [width] "d" (w)						\
	    :								\
	)
#else
#define CLEAR_LOWER_BITS(x, w)	x = ((x) & ~(1U << (w))) + 1
#endif
183 
184 /* Set planemask for the common plane and the common ROP */
185 static inline void
186 om_set_planemask(int planemask)
187 {
188 
189 	*(volatile uint32_t *)OMFB_PLANEMASK = planemask;
190 }
191 
192 /* Get a ROP address */
193 static inline volatile uint32_t *
194 om_rop_addr(int plane, int rop)
195 {
196 
197 	return (volatile uint32_t *)
198 	    (OMFB_ROP_P0 + OMFB_PLANEOFFS * plane + rop * 4);
199 }
200 
/*
 * Select ROP `rop` for one individual plane by writing `mask` (the ROP's
 * bit mask) to that ROP's register.
 */
static inline void
om_set_rop(int plane, int rop, uint32_t mask)
{
	volatile uint32_t *ropreg = om_rop_addr(plane, rop);

	*ropreg = mask;
}
208 
209 /* Set ROP and ROP's mask for current setplanemask-ed plane(s) */
210 static inline void
211 om_set_rop_curplane(int rop, uint32_t mask)
212 {
213 
214 	((volatile uint32_t *)(OMFB_ROP_COMMON))[rop] = mask;
215 }
216 
217 /* Reset planemask and ROP */
218 static inline void
219 om_reset_planemask_and_rop(void)
220 {
221 
222 	om_set_planemask(hwplanemask);
223 	om_set_rop_curplane(ROP_THROUGH, ~0U);
224 }
225 
226 static inline void
227 om_set_rowattr(int row, uint8_t fg, uint8_t bg)
228 {
229 
230 	if (rowattr[row].fg == fg && rowattr[row].bg == bg)
231 		return;
232 	if (rowattr[row].ismulti)
233 		return;
234 
235 	if (rowattr[row].fg == rowattr[row].bg) {
236 		/* From the initial (erased) state, */
237 		if (rowattr[row].fg != fg && rowattr[row].bg != bg) {
238 			/* if both are changed at once, it's multi color */
239 			rowattr[row].ismulti = true;
240 		} else {
241 			/* otherwise, it's single color */
242 			rowattr[row].fg = fg;
243 			rowattr[row].bg = bg;
244 		}
245 	} else {
246 		rowattr[row].ismulti = true;
247 	}
248 }
249 
250 static inline void
251 om_reset_rowattr(int row, uint8_t bg)
252 {
253 
254 	/* Setting fg equal to bg means 'reset' or 'erased'. */
255 	rowattr[row].ismulti = false;
256 	rowattr[row].bg = bg;
257 	rowattr[row].fg = bg;
258 }
259 
260 /*
261  * Fill rectangle.
262  * val is assumed only ALL0BITS or ALL1BITS, because all bits are used as is
263  * regardless of bit offset of the destination.
264  */
265 static void
266 om_fill(int planemask, int rop, uint8_t *dstptr, int dstbitoffs, int dstspan,
267     uint32_t val, int width, int height)
268 {
269 	uint32_t mask;
270 	uint32_t prev_mask;
271 	int32_t height_m1;
272 	int dw;		/* 1 pass width bits */
273 
274 	ASSUME(width > 0);
275 	ASSUME(height > 0);
276 	ASSUME(0 <= dstbitoffs && dstbitoffs < 32);
277 
278 	om_set_planemask(planemask);
279 
280 	height_m1 = height - 1;
281 	mask = ALL1BITS >> dstbitoffs;
282 	prev_mask = ~mask;
283 	dw = 32 - dstbitoffs;
284 
285 	/* do-while loop seems slightly faster than a for loop */
286 	do {
287 		uint8_t *d;
288 		int32_t h;
289 
290 		width -= dw;
291 		if (width < 0) {
292 			CLEAR_LOWER_BITS(mask, -width);
293 			/* To exit this loop. */
294 			width = 0;
295 		}
296 
297 		if (prev_mask != mask) {
298 			om_set_rop_curplane(rop, mask);
299 			prev_mask = mask;
300 		}
301 
302 		d = dstptr;
303 		dstptr += 4;
304 		h = height_m1;
305 
306 #if USE_M68K_ASM
307 		asm volatile("\n"
308 		"om_fill_loop_h:\n"
309 		"	move.l	%[val],(%[d])			;\n"
310 		"	add.l	%[dstspan],%[d]			;\n"
311 		"	dbra	%[h],om_fill_loop_h		;\n"
312 		    : /* output */
313 		      [d] "+&a" (d),
314 		      [h] "+&d" (h)
315 		    : /* input */
316 		      [val] "d" (val),
317 		      [dstspan] "r" (dstspan)
318 		    : /* clobbers */
319 		      "memory"
320 		);
321 #else
322 		do {
323 			*(uint32_t *)d = val;
324 			d += dstspan;
325 		} while (--h >= 0);
326 #endif
327 		mask = ALL1BITS;
328 		dw = 32;
329 	} while (width > 0);
330 }
331 
332 static void
333 om_fill_color(int planecount, int color, uint8_t *dstptr, int dstbitoffs,
334     int dstspan, int width, int height)
335 {
336 	uint32_t mask;
337 	uint32_t prev_mask;
338 	int32_t height_m1;
339 	int dw;		/* 1 pass width bits */
340 
341 	ASSUME(width > 0);
342 	ASSUME(height > 0);
343 	ASSUME(planecount > 0);
344 
345 	/* select all planes */
346 	om_set_planemask(hwplanemask);
347 
348 	mask = ALL1BITS >> dstbitoffs;
349 	prev_mask = ~mask;
350 	dw = 32 - dstbitoffs;
351 	height_m1 = height - 1;
352 
353 	do {
354 		uint8_t *d;
355 		int32_t plane;
356 		int32_t h;
357 		int16_t rop;
358 
359 		width -= dw;
360 		if (width < 0) {
361 			CLEAR_LOWER_BITS(mask, -width);
362 			/* To exit this loop. */
363 			width = 0;
364 		}
365 
366 		if (prev_mask != mask) {
367 			for (plane = 0; plane < planecount; plane++) {
368 				if ((color & (1U << plane)) != 0)
369 					rop = ROP_ONE;
370 				else
371 					rop = ROP_ZERO;
372 				om_set_rop(plane, rop, mask);
373 			}
374 			prev_mask = mask;
375 		}
376 
377 		d = dstptr;
378 		dstptr += 4;
379 		h = height_m1;
380 
381 #if USE_M68K_ASM
382 		asm volatile("\n"
383 		"om_fill_color_loop_h:\n"
384 		"	clr.l	(%[d])				;\n"
385 		"	add.l	%[dstspan],%[d]			;\n"
386 		"	dbra	%[h],om_fill_color_loop_h	;\n"
387 		    : /* output */
388 		      [d] "+&a" (d),
389 		      [h] "+&d" (h)
390 		    : /* input */
391 		      [dstspan] "r" (dstspan)
392 		    : /* clobbers */
393 		      "memory"
394 		);
395 #else
396 		do {
397 			/*
398 			 * ROP is either ONE or ZERO,
399 			 * so don't care what you write to *d.
400 			 */
401 			*(uint32_t *)d = 0;
402 			d += dstspan;
403 		} while (--h >= 0);
404 #endif
405 		mask = ALL1BITS;
406 		dw = 32;
407 	} while (width > 0);
408 }
409 
/*
 * Pick the ROP for one plane from the fg/bg color bits of that plane.
 * The caller shifts fg and bg plane by plane, so only their LSBs are
 * looked at here.
 *
 * The four ROP values needed are all multiples of 5, so the ROP is simply
 * the 2-bit number "bg fg" times 5:
 *
 *  bg fg  rop               result
 *  -- --  ----------------  ------
 *   0  0  ROP_ZERO    =  0   0
 *   0  1  ROP_THROUGH =  5   D
 *   1  0  ROP_INV1    = 10  ~D
 *   1  1  ROP_ONE     = 15   1
 *
 * With these ROPs a single write of the glyph bits to the common plane
 * draws the character in the requested fg/bg colors.
 */
static inline int
om_fgbg2rop(uint8_t fg, uint8_t bg)
{
	const int fgbit = fg & 1;
	const int bgbit = bg & 1;

	return ((bgbit << 1) | fgbit) * 5;
}
435 
436 /*
437  * Blit a character at the specified co-ordinates.
438  * This function modifies(breaks) the planemask and ROPs.
439  */
440 static void
441 om_putchar(void *cookie, int row, int startcol, u_int uc, long attr)
442 {
443 	struct rasops_info *ri = cookie;
444 	uint8_t *fontptr;
445 	uint8_t *dstcmn;
446 	uint32_t mask;
447 	int width;
448 	int height;
449 	int planecount;
450 	int x, y;
451 	int fontstride;
452 	int fontx;
453 	int plane;
454 	int dw;		/* 1 pass width bits */
455 	int xh, xl;
456 	uint8_t fg, bg;
457 	/* ROP address cache */
458 	static volatile uint32_t *ropaddr[OMFB_MAX_PLANECOUNT];
459 	static uint8_t last_fg, last_bg;
460 
461 	if (uc >= 0x80)
462 		return;
463 
464 	width = ri->ri_font->fontwidth;
465 	height = ri->ri_font->fontheight;
466 	planecount = ri->ri_depth;
467 	fontstride = ri->ri_font->stride;
468 	y = height * row;
469 	x = width * startcol;
470 	fontptr = (uint8_t *)ri->ri_font->data +
471 	    (uc - ri->ri_font->firstchar) * ri->ri_fontscale;
472 
473 	om_unpack_attr(attr, &fg, &bg, NULL);
474 	om_set_rowattr(row, fg, bg);
475 
476 	if (last_fg != fg || last_bg != bg) {
477 		last_fg = fg;
478 		last_bg = bg;
479 		/* calculate ROP */
480 		for (plane = 0; plane < planecount; plane++) {
481 			int t = om_fgbg2rop(fg, bg);
482 			ropaddr[plane] = om_rop_addr(plane, t);
483 			fg >>= 1;
484 			bg >>= 1;
485 		}
486 	}
487 
488 	/* divide x into the lower 5 bits and the rest. */
489 	xh = x >> 5;
490 	xl = x & 0x1f;
491 
492 	/* write to common plane */
493 	dstcmn = (uint8_t *)ri->ri_bits + xh * 4 + y * OMFB_STRIDE;
494 
495 	/* select all plane */
496 	om_set_planemask(hwplanemask);
497 
498 	fontx = 0;
499 	mask = ALL1BITS >> xl;
500 	dw = 32 - xl;
501 
502 	ASSUME(planecount == 1 ||
503 	       planecount == 4 ||
504 	       planecount == 8);
505 
506 	do {
507 		uint8_t *d;
508 		uint8_t *f;
509 		int32_t h;
510 
511 		width -= dw;
512 		if (width < 0) {
513 			CLEAR_LOWER_BITS(mask, -width);
514 			/* To exit this loop. */
515 			width = 0;
516 		}
517 
518 		switch (planecount) {
519 		 case 8:
520 			*(ropaddr[7]) = mask;
521 			*(ropaddr[6]) = mask;
522 			*(ropaddr[5]) = mask;
523 			*(ropaddr[4]) = mask;
524 			/* FALLTHROUGH */
525 		 case 4:
526 			*(ropaddr[3]) = mask;
527 			*(ropaddr[2]) = mask;
528 			*(ropaddr[1]) = mask;
529 			/* FALLTHROUGH */
530 		 case 1:
531 			*(ropaddr[0]) = mask;
532 			break;
533 		}
534 
535 		d = dstcmn;
536 		f = fontptr;
537 		h = height - 1;
538 		do {
539 			uint32_t v;
540 			GETBITS(f, fontx, dw, v);
541 			/* no need to shift v because it's masked by ROP */
542 			*(uint32_t *)d = v;
543 			d += OMFB_STRIDE;
544 			f += fontstride;
545 		} while (--h >= 0);
546 
547 		dstcmn += 4;
548 		fontx += dw;
549 		mask = ALL1BITS;
550 		dw = 32;
551 	} while (width > 0);
552 
553 	om_reset_planemask_and_rop();
554 }
555 
556 static void
557 om_erasecols(void *cookie, int row, int startcol, int ncols, long attr)
558 {
559 	struct rasops_info *ri = cookie;
560 	int startx;
561 	int width;
562 	int height;
563 	int planecount;
564 	int sh, sl;
565 	int y;
566 	int scanspan;
567 	uint8_t *p;
568 	uint8_t fg, bg;
569 
570 	scanspan = ri->ri_stride;
571 	y = ri->ri_font->fontheight * row;
572 	startx = ri->ri_font->fontwidth * startcol;
573 	width = ri->ri_font->fontwidth * ncols;
574 	height = ri->ri_font->fontheight;
575 	planecount = ri->ri_depth;
576 	om_unpack_attr(attr, &fg, &bg, NULL);
577 	sh = startx >> 5;
578 	sl = startx & 0x1f;
579 	p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4;
580 
581 	/* I'm not sure */
582 	om_set_rowattr(row, fg, bg);
583 
584 	if (bg == 0) {
585 		/* om_fill seems slightly efficient */
586 		om_fill(hwplanemask, ROP_ZERO,
587 		    p, sl, scanspan, 0, width, height);
588 	} else {
589 		om_fill_color(planecount, bg, p, sl, scanspan, width, height);
590 	}
591 
592 	/* reset mask value */
593 	om_reset_planemask_and_rop();
594 }
595 
596 static void
597 om_eraserows(void *cookie, int startrow, int nrows, long attr)
598 {
599 	struct rasops_info *ri = cookie;
600 	int startx;
601 	int width;
602 	int height;
603 	int planecount;
604 	int sh, sl;
605 	int y;
606 	int scanspan;
607 	int row;
608 	uint8_t *p;
609 	uint8_t fg, bg;
610 
611 	scanspan = ri->ri_stride;
612 	y = ri->ri_font->fontheight * startrow;
613 	startx = 0;
614 	width = ri->ri_emuwidth;
615 	height = ri->ri_font->fontheight * nrows;
616 	planecount = ri->ri_depth;
617 	om_unpack_attr(attr, &fg, &bg, NULL);
618 	sh = startx >> 5;
619 	sl = startx & 0x1f;
620 	p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4;
621 
622 	for (row = startrow; row < startrow + nrows; row++) {
623 		om_reset_rowattr(row, bg);
624 	}
625 
626 	if (bg == 0) {
627 		/* om_fill seems slightly efficient */
628 		om_fill(hwplanemask, ROP_ZERO,
629 		    p, sl, scanspan, 0, width, height);
630 	} else {
631 		om_fill_color(planecount, bg, p, sl, scanspan, width, height);
632 	}
633 	/* reset mask value */
634 	om_reset_planemask_and_rop();
635 }
636 
637 /*
638  * Single plane raster copy.
639  *  dst: destination plane pointer.
640  *  src: source plane pointer.
641  *       if y-forward, src > dst, point to left-top.
642  *       if y-backward, src < dst, point to left-bottom.
643  *  width: pixel width (must > 0)
644  *  height: pixel height (> 0 if forward, < 0 if backward)
645  *  rop: ROP array with planecount elements.
646  *
647  * This function modifies(breaks) the planemask and ROPs.
648  */
649 static void
650 om_rascopy_single(int planecount, uint8_t *dst, uint8_t *src,
651     int16_t width, int16_t height, uint8_t rop[])
652 {
653 	uint32_t mask;
654 	int wh;
655 	int wl;
656 	int step;
657 	int plane;
658 	int16_t height_m1;
659 	int16_t w, h;
660 
661 	step = OMFB_STRIDE;
662 
663 	/*
664 	 * X direction is always forward (or ascend order) to use (An)+
665 	 * addressing mode in asm.
666 	 */
667 
668 	/* Reverse order Y if backward copy */
669 	if (height < 0) {
670 		/* The sign is managed by step, height is always positive */
671 		step = -step;
672 		height = -height;
673 	}
674 	height_m1 = height - 1;
675 
676 	/*
677 	 * On single, it's not necessary to process two longwords at a time,
678 	 * but we do so for symmetry and speedup.
679 	 */
680 
681 	/* First, transfer a rectangle consist of two longwords */
682 	wh = (width >> 6);
683 	if (wh > 0) {
684 		int step8 = step - wh * 8;
685 
686 #if USE_M68K_ASM
687 		wh--;	/* for dbra */
688 		h = height_m1;
689 		asm volatile("\n"
690 		"om_rascopy_single_LL:\n"
691 		"	move.w	%[wh],%[w]			;\n"
692 		"1:\n"
693 		"	move.l	(%[src])+,(%[dst])+		;\n"
694 		"	move.l	(%[src])+,(%[dst])+		;\n"
695 		"	dbra	%[w],1b				;\n"
696 
697 		"	adda.l	%[step8],%[src]			;\n"
698 		"	adda.l	%[step8],%[dst]			;\n"
699 		"	dbra	%[h],om_rascopy_single_LL	;\n"
700 		    : /* output */
701 		      [src] "+&a" (src),
702 		      [dst] "+&a" (dst),
703 		      [h] "+&d" (h),
704 		      [w] "=&d" (w)
705 		    : /* input */
706 		      [wh] "r" (wh),
707 		      [step8] "r" (step8)
708 		    : /* clobbers */
709 		      "memory"
710 		);
711 #else
712 		wh--;	/* to match to asm side */
713 		for (h = height_m1; h >= 0; h--) {
714 			uint32_t *s32 = (uint32_t *)src;
715 			uint32_t *d32 = (uint32_t *)dst;
716 			for (w = wh; w >= 0; w--) {
717 				*d32++ = *s32++;
718 				*d32++ = *s32++;
719 			}
720 			src = (uint8_t *)s32 + step8;
721 			dst = (uint8_t *)d32 + step8;
722 		}
723 #endif
724 
725 		if ((width & 0x3f) == 0) {
726 			/* transfer completed */
727 			return;
728 		}
729 
730 		/* rewind y for the next transfer */
731 		src -= height * step;
732 		dst -= height * step;
733 	}
734 
735 	if ((width & 32) != 0) {
736 		/* Transfer one longword since an odd longword */
737 #if USE_M68K_ASM
738 		h = height_m1;
739 		asm volatile("\n"
740 		"om_rascopy_single_L:\n"
741 		"	move.l	(%[src]),(%[dst])		;\n"
742 		"	adda.l	%[step],%[src]			;\n"
743 		"	adda.l	%[step],%[dst]			;\n"
744 		"	dbra	%[h],om_rascopy_single_L	;\n"
745 		    : /* output */
746 		      [src] "+&a" (src),
747 		      [dst] "+&a" (dst),
748 		      [h] "+&d" (h)
749 		    : /* input */
750 		      [step] "r" (step)
751 		    : /* clobbers */
752 		      "memory"
753 		);
754 #else
755 		for (h = height_m1; h >= 0; h--) {
756 			*(uint32_t *)dst = *(uint32_t *)src;
757 			dst += step;
758 			src += step;
759 		}
760 #endif
761 
762 		if ((width & 0x1f) == 0) {
763 			/* transfer completed */
764 			return;
765 		}
766 
767 		/* rewind y for the next transfer */
768 		src += 4 - height * step;
769 		dst += 4 - height * step;
770 	}
771 
772 	wl = width & 0x1f;
773 	/* wl > 0 at this point */
774 
775 	/* Then, transfer residual bits */
776 
777 	mask = ALL1BITS << (32 - wl);
778 	/*
779 	 * The common ROP cannot be used here.  Because the hardware doesn't
780 	 * allow you to set the mask while keeping the ROP states.
781 	 */
782 	for (plane = 0; plane < planecount; plane++) {
783 		om_set_rop(plane, rop[plane], mask);
784 	}
785 
786 #if USE_M68K_ASM
787 	h = height_m1;
788 	asm volatile("\n"
789 	"om_rascopy_single_bit:\n"
790 	"	move.l	(%[src]),(%[dst])			;\n"
791 	"	adda.l	%[step],%[src]				;\n"
792 	"	adda.l	%[step],%[dst]				;\n"
793 	"	dbra	%[h],om_rascopy_single_bit		;\n"
794 	    : /* output */
795 	      [src] "+&a" (src),
796 	      [dst] "+&a" (dst),
797 	      [h] "+&d" (h)
798 	    : /* input */
799 	      [step] "r" (step)
800 	    : /* clobbers */
801 	      "memory"
802 	);
803 #else
804 	for (h = height_m1; h >= 0; h--) {
805 		*(uint32_t *)dst = *(uint32_t *)src;
806 		dst += step;
807 		src += step;
808 	}
809 #endif
810 
811 	for (plane = 0; plane < planecount; plane++) {
812 		om_set_rop(plane, rop[plane], ALL1BITS);
813 	}
814 }
815 
816 /*
817  * Multiple plane raster copy.
818  *  dst0: destination pointer in Plane0.
819  *  src0: source pointer in Plane0.
820  *       if y-forward, src0 > dst0, point to left-top.
821  *       if y-backward, src0 < dst0, point to left-bottom.
822  *  width: pixel width (must > 0)
823  *  height: pixel height (> 0 if forward, < 0 if backward)
824  *
825  * This function modifies(breaks) the planemask and ROPs.
826  */
827 static void
828 om4_rascopy_multi(uint8_t *dst0, uint8_t *src0, int16_t width, int16_t height)
829 {
830 	uint8_t *dst1, *dst2, *dst3;
831 	int wh;
832 	int wl;
833 	int rewind;
834 	int step;
835 	uint32_t mask;
836 	int16_t height_m1;
837 	int16_t w, h;
838 
839 	step = OMFB_STRIDE;
840 
841 	/*
842 	 * X direction is always forward (or ascend order) to use (An)+
843 	 * addressing mode in asm.
844 	 */
845 
846 	/* Reverse order Y if backward copy */
847 	if (height < 0) {
848 		/* The sign is managed by step, height is always positive */
849 		step = -step;
850 		height = -height;
851 	}
852 	height_m1 = height - 1;
853 
854 	dst1 = dst0 + OMFB_PLANEOFFS;
855 	dst2 = dst1 + OMFB_PLANEOFFS;
856 	dst3 = dst2 + OMFB_PLANEOFFS;
857 
858 	/* First, transfer a rectangle consist of two longwords */
859 	wh = width >> 6;
860 	if (wh > 0) {
861 		int step8 = step - wh * 8;
862 
863 #if USE_M68K_ASM
864 		wh--;	/* for dbra */
865 		h = height_m1;
866 		asm volatile("\n"
867 		"om4_rascopy_multi_LL:\n"
868 		"	move.w	%[wh],%[w]		;\n"
869 		"1:\n"
870 			/*
871 			 * Optimized for 68030.
872 			 *
873 			 * On LUNA, the following is faster than any of
874 			 * "MOVE.L (An)+,(An)+", "MOVE.L (An,Dn),(An,Dn)", or
875 			 * "MOVEM.L", due to the relationship of instruction
876 			 *  overlaps and access waits.
877 			 *
878 			 * The head time of (An)+ as source operand is 0 and
879 			 * the head time of ADDA instruction is 2.  If the
880 			 * previous instruction has some write wait cycles,
881 			 * i.e., tail cycles, (An)+ as source operand cannot
882 			 * overlap it but ADDA instruction can.
883 			 */
884 		"	move.l	(%[src0]),(%[dst0])+	;\n"	/* P0 */
885 		"	adda.l	%[PLANEOFFS],%[src0]	;\n"
886 		"	move.l	(%[src0]),(%[dst1])+	;\n"	/* P1 */
887 		"	adda.l	%[PLANEOFFS],%[src0]	;\n"
888 		"	move.l	(%[src0]),(%[dst2])+	;\n"	/* P2 */
889 		"	adda.l	%[PLANEOFFS],%[src0]	;\n"
890 		"	move.l	(%[src0]),(%[dst3])+	;\n"	/* P3 */
891 			/* Expect an overlap, so don't use (An)+ */
892 		"	addq.l	#4,%[src0]		;\n"
893 
894 		"	move.l	(%[src0]),(%[dst3])+	;\n"	/* P3 */
895 		"	suba.l	%[PLANEOFFS],%[src0]	;\n"
896 		"	move.l	(%[src0]),(%[dst2])+	;\n"	/* P2 */
897 		"	suba.l	%[PLANEOFFS],%[src0]	;\n"
898 		"	move.l	(%[src0]),(%[dst1])+	;\n"	/* P1 */
899 		"	suba.l	%[PLANEOFFS],%[src0]	;\n"
900 		"	move.l	(%[src0])+,(%[dst0])+	;\n"	/* P0 */
901 		"	dbra	%[w],1b			;\n"
902 
903 		"	adda.l	%[step8],%[src0]	;\n"
904 		"	adda.l	%[step8],%[dst0]	;\n"
905 		"	adda.l	%[step8],%[dst1]	;\n"
906 		"	adda.l	%[step8],%[dst2]	;\n"
907 		"	adda.l	%[step8],%[dst3]	;\n"
908 		"	dbra	%[h],om4_rascopy_multi_LL	;\n"
909 		    : /* output */
910 		      [src0] "+&a" (src0),
911 		      [dst0] "+&a" (dst0),
912 		      [dst1] "+&a" (dst1),
913 		      [dst2] "+&a" (dst2),
914 		      [dst3] "+&a" (dst3),
915 		      [h] "+&d" (h),
916 		      [w] "=&d" (w)
917 		    : /* input */
918 		      [wh] "r" (wh),
919 		      [PLANEOFFS] "r" (OMFB_PLANEOFFS),
920 		      [step8] "r" (step8)
921 		    : /* clobbers */
922 		      "memory"
923 		);
924 #else
925 		wh--;	/* to match to asm side */
926 		for (h = height_m1; h >= 0; h--) {
927 			for (w = wh; w >= 0; w--) {
928 				*(uint32_t *)dst0 = *(uint32_t *)src0;
929 				dst0 += 4;
930 				src0 += OMFB_PLANEOFFS;
931 				*(uint32_t *)dst1 = *(uint32_t *)src0;
932 				dst1 += 4;
933 				src0 += OMFB_PLANEOFFS;
934 				*(uint32_t *)dst2 = *(uint32_t *)src0;
935 				dst2 += 4;
936 				src0 += OMFB_PLANEOFFS;
937 				*(uint32_t *)dst3 = *(uint32_t *)src0;
938 				dst3 += 4;
939 				src0 += 4;
940 
941 				*(uint32_t *)dst3 = *(uint32_t *)src0;
942 				dst3 += 4;
943 				src0 -= OMFB_PLANEOFFS;
944 				*(uint32_t *)dst2 = *(uint32_t *)src0;
945 				dst2 += 4;
946 				src0 -= OMFB_PLANEOFFS;
947 				*(uint32_t *)dst1 = *(uint32_t *)src0;
948 				dst1 += 4;
949 				src0 -= OMFB_PLANEOFFS;
950 				*(uint32_t *)dst0 = *(uint32_t *)src0;
951 				dst0 += 4;
952 				src0 += 4;
953 			}
954 			src0 += step8;
955 			dst0 += step8;
956 			dst1 += step8;
957 			dst2 += step8;
958 			dst3 += step8;
959 		}
960 #endif
961 
962 		if ((width & 0x3f) == 0) {
963 			/* transfer completed */
964 			return;
965 		}
966 
967 		/* rewind y for the next transfer */
968 		src0 -= height * step;
969 		dst0 -= height * step;
970 		dst1 -= height * step;
971 		dst2 -= height * step;
972 		dst3 -= height * step;
973 	}
974 
975 	/* This rewind rewinds the plane, so Y order is irrelevant */
976 	rewind = OMFB_STRIDE - OMFB_PLANEOFFS * 3;
977 
978 	if ((width & 32) != 0) {
979 		/* Transfer one longword since an odd longword */
980 #if USE_M68K_ASM
981 		h = height_m1;
982 		asm volatile("\n"
983 		"om4_rascopy_multi_L:\n"
984 		"	move.l	(%[src0]),(%[dst0])		;\n"
985 		"	adda.l	%[PLANEOFFS],%[src0]		;\n"
986 		"	move.l	(%[src0]),(%[dst1])		;\n"
987 		"	adda.l	%[PLANEOFFS],%[src0]		;\n"
988 		"	move.l	(%[src0]),(%[dst2])		;\n"
989 		"	adda.l	%[PLANEOFFS],%[src0]		;\n"
990 		"	move.l	(%[src0]),(%[dst3])		;\n"
991 		"	adda.l	%[rewind],%[src0]		;\n"
992 
993 		"	adda.l	%[step],%[dst0]			;\n"
994 		"	adda.l	%[step],%[dst1]			;\n"
995 		"	adda.l	%[step],%[dst2]			;\n"
996 		"	adda.l	%[step],%[dst3]			;\n"
997 		"	dbra	%[h],om4_rascopy_multi_L	;\n"
998 		    : /* output */
999 		      [src0] "+&a" (src0),
1000 		      [dst0] "+&a" (dst0),
1001 		      [dst1] "+&a" (dst1),
1002 		      [dst2] "+&a" (dst2),
1003 		      [dst3] "+&a" (dst3),
1004 		      [h] "+&d" (h)
1005 		    : /* input */
1006 		      [PLANEOFFS] "r" (OMFB_PLANEOFFS),
1007 		      [rewind] "r" (rewind),
1008 		      [step] "r" (step)
1009 		    : /* clobbers */
1010 		      "memory"
1011 		);
1012 #else
1013 		for (h = height_m1; h >= 0; h--) {
1014 			*(uint32_t *)dst0 = *(uint32_t *)src0;
1015 			src0 += OMFB_PLANEOFFS;
1016 			*(uint32_t *)dst1 = *(uint32_t *)src0;
1017 			src0 += OMFB_PLANEOFFS;
1018 			*(uint32_t *)dst2 = *(uint32_t *)src0;
1019 			src0 += OMFB_PLANEOFFS;
1020 			*(uint32_t *)dst3 = *(uint32_t *)src0;
1021 			src0 += rewind;
1022 
1023 			dst0 += step;
1024 			dst1 += step;
1025 			dst2 += step;
1026 			dst3 += step;
1027 		}
1028 #endif
1029 
1030 		if ((width & 0x1f) == 0) {
1031 			/* transfer completed */
1032 			return;
1033 		}
1034 
1035 		/* rewind y for the next transfer */
1036 		src0 += 4 - height * step;
1037 		dst0 += 4 - height * step;
1038 		dst1 += 4 - height * step;
1039 		dst2 += 4 - height * step;
1040 		dst3 += 4 - height * step;
1041 	}
1042 
1043 	wl = width & 0x1f;
1044 	/* wl > 0 at this point */
1045 
1046 	/* Then, transfer residual bits */
1047 
1048 	mask = ALL1BITS << (32 - wl);
1049 	om_set_planemask(hwplanemask);
1050 	om_set_rop_curplane(ROP_THROUGH, mask);
1051 
1052 #if USE_M68K_ASM
1053 	h = height_m1;
1054 	asm volatile("\n"
1055 	"om4_rascopy_multi_bit:\n"
1056 	"	move.l	(%[src0]),(%[dst0])			;\n"
1057 	"	adda.l	%[PLANEOFFS],%[src0]			;\n"
1058 	"	move.l	(%[src0]),(%[dst1])			;\n"
1059 	"	adda.l	%[PLANEOFFS],%[src0]			;\n"
1060 	"	move.l	(%[src0]),(%[dst2])			;\n"
1061 	"	adda.l	%[PLANEOFFS],%[src0]			;\n"
1062 	"	move.l	(%[src0]),(%[dst3])			;\n"
1063 	"	adda.l	%[rewind],%[src0]			;\n"
1064 
1065 	"	adda.l	%[step],%[dst0]				;\n"
1066 	"	adda.l	%[step],%[dst1]				;\n"
1067 	"	adda.l	%[step],%[dst2]				;\n"
1068 	"	adda.l	%[step],%[dst3]				;\n"
1069 	"	dbra	%[h],om4_rascopy_multi_bit		;\n"
1070 	    : /* output */
1071 	      [src0] "+&a" (src0),
1072 	      [dst0] "+&a" (dst0),
1073 	      [dst1] "+&a" (dst1),
1074 	      [dst2] "+&a" (dst2),
1075 	      [dst3] "+&a" (dst3),
1076 	      [h] "+&d" (h)
1077 	    : /* input */
1078 	      [PLANEOFFS] "r" (OMFB_PLANEOFFS),
1079 	      [rewind] "r" (rewind),
1080 	      [step] "r" (step)
1081 	    : /* clobbers */
1082 	      "memory"
1083 	);
1084 #else
1085 	for (h = height_m1; h >= 0; h--) {
1086 		*(uint32_t *)dst0 = *(uint32_t *)src0;
1087 		src0 += OMFB_PLANEOFFS;
1088 		*(uint32_t *)dst1 = *(uint32_t *)src0;
1089 		src0 += OMFB_PLANEOFFS;
1090 		*(uint32_t *)dst2 = *(uint32_t *)src0;
1091 		src0 += OMFB_PLANEOFFS;
1092 		*(uint32_t *)dst3 = *(uint32_t *)src0;
1093 		src0 += rewind;
1094 
1095 		dst0 += step;
1096 		dst1 += step;
1097 		dst2 += step;
1098 		dst3 += step;
1099 	}
1100 #endif
1101 	om_reset_planemask_and_rop();
1102 }
1103 
1104 static void
1105 om1_copyrows(void *cookie, int srcrow, int dstrow, int nrows)
1106 {
1107 	struct rasops_info *ri = cookie;
1108 	uint8_t *p, *q;
1109 	int scanspan, offset, srcy, height, width, w;
1110 	uint32_t rmask;
1111 
1112 	scanspan = ri->ri_stride;
1113 	height = ri->ri_font->fontheight * nrows;
1114 	offset = (dstrow - srcrow) * scanspan * ri->ri_font->fontheight;
1115 	srcy = ri->ri_font->fontheight * srcrow;
1116 	if (srcrow < dstrow && srcrow + nrows > dstrow) {
1117 		scanspan = -scanspan;
1118 		srcy = srcy + height - 1;
1119 	}
1120 
1121 	p = (uint8_t *)ri->ri_bits + srcy * ri->ri_stride;
1122 	w = ri->ri_emuwidth;
1123 	width = w;
1124 	rmask = ALL1BITS << (-width & ALIGNMASK);
1125 	q = p;
1126 	while (height > 0) {
1127 		*P0(p + offset) = *P0(p);		/* always aligned */
1128 		width -= 2 * BLITWIDTH;
1129 		while (width > 0) {
1130 			p += BYTESDONE;
1131 			*P0(p + offset) = *P0(p);
1132 			width -= BLITWIDTH;
1133 		}
1134 		p += BYTESDONE;
1135 		*P0(p + offset) = (*P0(p) & rmask) | (*P0(p + offset) & ~rmask);
1136 
1137 		p = (q += scanspan);
1138 		width = w;
1139 		height--;
1140 	}
1141 }
1142 
1143 static void
1144 om4_copyrows(void *cookie, int srcrow, int dstrow, int nrows)
1145 {
1146 	struct rasops_info *ri = cookie;
1147 	uint8_t *src, *dst;
1148 	int width, rowheight;
1149 	int planecount;
1150 	int ptrstep, rowstep;
1151 	int srcplane;
1152 	int i;
1153 	int r;
1154 	uint8_t rop[OMFB_MAX_PLANECOUNT];
1155 
1156 	width = ri->ri_emuwidth;
1157 	rowheight = ri->ri_font->fontheight;
1158 	planecount = ri->ri_depth;
1159 	src = (uint8_t *)ri->ri_bits + srcrow * rowheight * ri->ri_stride;
1160 	dst = (uint8_t *)ri->ri_bits + dstrow * rowheight * ri->ri_stride;
1161 
1162 	if (nrows <= 0 || srcrow == dstrow) {
1163 		return;
1164 	} else if (srcrow < dstrow) {
1165 		/* y-backward */
1166 
1167 		/* select the bottom raster of the bottom row */
1168 		srcrow += nrows - 1;
1169 		dstrow += nrows - 1;
1170 		src += nrows * rowheight * ri->ri_stride - ri->ri_stride;
1171 		dst += nrows * rowheight * ri->ri_stride - ri->ri_stride;
1172 		rowstep = -1;
1173 		rowheight = -rowheight;
1174 	} else {
1175 		/* y-forward */
1176 		rowstep = 1;
1177 	}
1178 	ptrstep = ri->ri_stride * rowheight;
1179 
1180 	om_set_planemask(hwplanemask);
1181 
1182 	srcplane = 0;
1183 	while (nrows > 0) {
1184 		r = 1;
1185 		if (rowattr[srcrow].ismulti == false &&
1186 		    rowattr[srcrow].fg == rowattr[srcrow].bg &&
1187 		    rowattr[srcrow].all == rowattr[dstrow].all) {
1188 			goto skip;
1189 		}
1190 
1191 		/* count the number of rows with the same attributes */
1192 		for (; r < nrows; r++) {
1193 			if (rowattr[srcrow + r * rowstep].all !=
1194 			    rowattr[srcrow].all) {
1195 				break;
1196 			}
1197 		}
1198 		/* r is the number of rows including srcrow itself */
1199 
1200 		if (rowattr[srcrow].ismulti) {
1201 			/*
1202 			 * src,dst point to the common plane.  src0,dst0 will
1203 			 * point to the same offset in plane0 because plane0
1204 			 * is placed just after the common plane.
1205 			 */
1206 			uint8_t *src0 = src + OMFB_PLANEOFFS;
1207 			uint8_t *dst0 = dst + OMFB_PLANEOFFS;
1208 			om_set_rop_curplane(ROP_THROUGH, ALL1BITS);
1209 			om4_rascopy_multi(dst0, src0, width, rowheight * r);
1210 		} else {
1211 			uint8_t *srcp;
1212 			uint8_t fg;
1213 			uint8_t bg;
1214 			uint8_t set;
1215 
1216 			fg = rowattr[srcrow].fg;
1217 			bg = rowattr[srcrow].bg;
1218 			set = fg ^ bg;
1219 			if (set == 0) {
1220 				/* use fg since both can be acceptable */
1221 				set = fg;
1222 			} else if ((set & fg) != 0) {
1223 				/*
1224 				 * set is the set of bits that set in fg and
1225 				 * cleared in bg.
1226 				 */
1227 				set &= fg;
1228 			} else {
1229 				/*
1230 				 * otherwise, set is the set of bits that
1231 				 * (probably) set in bg and cleared in fg.
1232 				 */
1233 				uint8_t tmp;
1234 
1235 				set &= bg;
1236 				/* and swap fg and bg */
1237 				tmp = fg;
1238 				fg = bg;
1239 				bg = tmp;
1240 			}
1241 
1242 			for (i = 0; i < planecount; i++) {
1243 				int t = om_fgbg2rop(fg, bg);
1244 				rop[i] = t;
1245 				om_set_rop(i, rop[i], ALL1BITS);
1246 				fg >>= 1;
1247 				bg >>= 1;
1248 			}
1249 
1250 			/*
1251 			 * If any bit in 'set' is set, any of them can be used.
1252 			 * If all bits in 'set' are cleared, use plane 0.
1253 			 * srcplane is the plane that fg is set and bg is
1254 			 * cleared.
1255 			 */
1256 			srcplane = (set != 0) ? (31 - __builtin_clz(set)) : 0;
1257 
1258 			srcp = src + OMFB_PLANEOFFS + srcplane * OMFB_PLANEOFFS;
1259 			om_rascopy_single(planecount, dst, srcp,
1260 			    width, rowheight * r, rop);
1261 		}
1262 
1263 skip:
1264 		for (i = 0; i < r; i++) {
1265 			rowattr[dstrow] = rowattr[srcrow];
1266 
1267 			srcrow += rowstep;
1268 			dstrow += rowstep;
1269 			src += ptrstep;
1270 			dst += ptrstep;
1271 			nrows--;
1272 		}
1273 	}
1274 }
1275 
1276 /*
1277  * XXX om{1,4}_copycols can be merged, but these are not frequently executed
1278  * and have low execution costs.  So I'm putting it off for now.
1279  */
1280 
1281 static void
1282 om1_copycols(void *cookie, int startrow, int srccol, int dstcol, int ncols)
1283 {
1284 	struct rasops_info *ri = cookie;
1285 	uint8_t *sp, *dp, *sq, *dq, *basep;
1286 	int scanspan, height, w, y, srcx, dstx;
1287 	int sb, eb, db, sboff, full, cnt, lnum, rnum;
1288 	uint32_t lmask, rmask, tmp;
1289 	bool sbover;
1290 
1291 	scanspan = ri->ri_stride;
1292 	y = ri->ri_font->fontheight * startrow;
1293 	srcx = ri->ri_font->fontwidth * srccol;
1294 	dstx = ri->ri_font->fontwidth * dstcol;
1295 	height = ri->ri_font->fontheight;
1296 	w = ri->ri_font->fontwidth * ncols;
1297 	basep = (uint8_t *)ri->ri_bits + y * scanspan;
1298 
1299 	sb = srcx & ALIGNMASK;
1300 	db = dstx & ALIGNMASK;
1301 
1302 	om_reset_planemask_and_rop();
1303 
1304 	if (db + w <= BLITWIDTH) {
1305 		/* Destination is contained within a single word */
1306 		sp = basep + (srcx / 32) * 4;
1307 		dp = basep + (dstx / 32) * 4;
1308 
1309 		while (height > 0) {
1310 			GETBITS(P0(sp), sb, w, tmp);
1311 			PUTBITS(tmp, db, w, P0(dp));
1312 			dp += scanspan;
1313 			sp += scanspan;
1314 			height--;
1315 		}
1316 		return;
1317 	}
1318 
1319 	lmask = (db == 0) ? 0 : ALL1BITS >> db;
1320 	eb = (db + w) & ALIGNMASK;
1321 	rmask = (eb == 0) ? 0 : ALL1BITS << (32 - eb);
1322 	lnum = (32 - db) & ALIGNMASK;
1323 	rnum = (dstx + w) & ALIGNMASK;
1324 
1325 	if (lmask != 0)
1326 		full = (w - (32 - db)) / 32;
1327 	else
1328 		full = w / 32;
1329 
1330 	sbover = (sb + lnum) >= 32;
1331 
1332 	if (dstcol < srccol || srccol + ncols < dstcol) {
1333 		/* copy forward (left-to-right) */
1334 		sp = basep + (srcx / 32) * 4;
1335 		dp = basep + (dstx / 32) * 4;
1336 
1337 		if (lmask != 0) {
1338 			sboff = sb + lnum;
1339 			if (sboff >= 32)
1340 				sboff -= 32;
1341 		} else {
1342 			sboff = sb;
1343 		}
1344 
1345 		sq = sp;
1346 		dq = dp;
1347 		while (height > 0) {
1348 			if (lmask != 0) {
1349 				GETBITS(P0(sp), sb, lnum, tmp);
1350 				PUTBITS(tmp, db, lnum, P0(dp));
1351 				dp += BYTESDONE;
1352 				if (sbover)
1353 					sp += BYTESDONE;
1354 			}
1355 
1356 			for (cnt = full; cnt; cnt--) {
1357 				GETBITS(P0(sp), sboff, 32, tmp);
1358 				*P0(dp) = tmp;
1359 				sp += BYTESDONE;
1360 				dp += BYTESDONE;
1361 			}
1362 
1363 			if (rmask != 0) {
1364 				GETBITS(P0(sp), sboff, rnum, tmp);
1365 				PUTBITS(tmp, 0, rnum, P0(dp));
1366 			}
1367 
1368 			sp = (sq += scanspan);
1369 			dp = (dq += scanspan);
1370 			height--;
1371 		}
1372 	} else {
1373 		/* copy backward (right-to-left) */
1374 		sp = basep + ((srcx + w) / 32) * 4;
1375 		dp = basep + ((dstx + w) / 32) * 4;
1376 
1377 		sboff = (srcx + w) & ALIGNMASK;
1378 		sboff -= rnum;
1379 		if (sboff < 0) {
1380 			sp -= BYTESDONE;
1381 			sboff += 32;
1382 		}
1383 
1384 		sq = sp;
1385 		dq = dp;
1386 		while (height > 0) {
1387 			if (rnum != 0) {
1388 				GETBITS(P0(sp), sboff, rnum, tmp);
1389 				PUTBITS(tmp, 0, rnum, P0(dp));
1390 			}
1391 
1392 			for (cnt = full; cnt; cnt--) {
1393 				sp -= BYTESDONE;
1394 				dp -= BYTESDONE;
1395 				GETBITS(P0(sp), sboff, 32, tmp);
1396 				*P0(dp) = tmp;
1397 			}
1398 
1399 			if (lmask != 0) {
1400 				if (sbover)
1401 					sp -= BYTESDONE;
1402 				dp -= BYTESDONE;
1403 				GETBITS(P0(sp), sb, lnum, tmp);
1404 				PUTBITS(tmp, db, lnum, P0(dp));
1405 			}
1406 
1407 			sp = (sq += scanspan);
1408 			dp = (dq += scanspan);
1409 			height--;
1410 		}
1411 	}
1412 }
1413 
1414 static void
1415 om4_copycols(void *cookie, int startrow, int srccol, int dstcol, int ncols)
1416 {
1417 	struct rasops_info *ri = cookie;
1418 	uint8_t *sp, *dp, *sq, *dq, *basep;
1419 	int scanspan, height, w, y, srcx, dstx;
1420 	int sb, eb, db, sboff, full, cnt, lnum, rnum;
1421 	uint32_t lmask, rmask, tmp;
1422 	bool sbover;
1423 
1424 	scanspan = ri->ri_stride;
1425 	y = ri->ri_font->fontheight * startrow;
1426 	srcx = ri->ri_font->fontwidth * srccol;
1427 	dstx = ri->ri_font->fontwidth * dstcol;
1428 	height = ri->ri_font->fontheight;
1429 	w = ri->ri_font->fontwidth * ncols;
1430 	basep = (uint8_t *)ri->ri_bits + y * scanspan;
1431 
1432 	sb = srcx & ALIGNMASK;
1433 	db = dstx & ALIGNMASK;
1434 
1435 	om_reset_planemask_and_rop();
1436 
1437 	if (db + w <= BLITWIDTH) {
1438 		/* Destination is contained within a single word */
1439 		sp = basep + (srcx / 32) * 4;
1440 		dp = basep + (dstx / 32) * 4;
1441 
1442 		while (height > 0) {
1443 			GETBITS(P0(sp), sb, w, tmp);
1444 			PUTBITS(tmp, db, w, P0(dp));
1445 			GETBITS(P1(sp), sb, w, tmp);
1446 			PUTBITS(tmp, db, w, P1(dp));
1447 			GETBITS(P2(sp), sb, w, tmp);
1448 			PUTBITS(tmp, db, w, P2(dp));
1449 			GETBITS(P3(sp), sb, w, tmp);
1450 			PUTBITS(tmp, db, w, P3(dp));
1451 			dp += scanspan;
1452 			sp += scanspan;
1453 			height--;
1454 		}
1455 		return;
1456 	}
1457 
1458 	lmask = (db == 0) ? 0 : ALL1BITS >> db;
1459 	eb = (db + w) & ALIGNMASK;
1460 	rmask = (eb == 0) ? 0 : ALL1BITS << (32 - eb);
1461 	lnum = (32 - db) & ALIGNMASK;
1462 	rnum = (dstx + w) & ALIGNMASK;
1463 
1464 	if (lmask != 0)
1465 		full = (w - (32 - db)) / 32;
1466 	else
1467 		full = w / 32;
1468 
1469 	sbover = (sb + lnum) >= 32;
1470 
1471 	if (dstcol < srccol || srccol + ncols < dstcol) {
1472 		/* copy forward (left-to-right) */
1473 		sp = basep + (srcx / 32) * 4;
1474 		dp = basep + (dstx / 32) * 4;
1475 
1476 		if (lmask != 0) {
1477 			sboff = sb + lnum;
1478 			if (sboff >= 32)
1479 				sboff -= 32;
1480 		} else {
1481 			sboff = sb;
1482 		}
1483 
1484 		sq = sp;
1485 		dq = dp;
1486 		while (height > 0) {
1487 			if (lmask != 0) {
1488 				GETBITS(P0(sp), sb, lnum, tmp);
1489 				PUTBITS(tmp, db, lnum, P0(dp));
1490 				GETBITS(P1(sp), sb, lnum, tmp);
1491 				PUTBITS(tmp, db, lnum, P1(dp));
1492 				GETBITS(P2(sp), sb, lnum, tmp);
1493 				PUTBITS(tmp, db, lnum, P2(dp));
1494 				GETBITS(P3(sp), sb, lnum, tmp);
1495 				PUTBITS(tmp, db, lnum, P3(dp));
1496 				dp += BYTESDONE;
1497 				if (sbover)
1498 					sp += BYTESDONE;
1499 			}
1500 
1501 			for (cnt = full; cnt; cnt--) {
1502 				GETBITS(P0(sp), sboff, 32, tmp);
1503 				*P0(dp) = tmp;
1504 				GETBITS(P1(sp), sboff, 32, tmp);
1505 				*P1(dp) = tmp;
1506 				GETBITS(P2(sp), sboff, 32, tmp);
1507 				*P2(dp) = tmp;
1508 				GETBITS(P3(sp), sboff, 32, tmp);
1509 				*P3(dp) = tmp;
1510 				sp += BYTESDONE;
1511 				dp += BYTESDONE;
1512 			}
1513 
1514 			if (rmask != 0) {
1515 				GETBITS(P0(sp), sboff, rnum, tmp);
1516 				PUTBITS(tmp, 0, rnum, P0(dp));
1517 				GETBITS(P1(sp), sboff, rnum, tmp);
1518 				PUTBITS(tmp, 0, rnum, P1(dp));
1519 				GETBITS(P2(sp), sboff, rnum, tmp);
1520 				PUTBITS(tmp, 0, rnum, P2(dp));
1521 				GETBITS(P3(sp), sboff, rnum, tmp);
1522 				PUTBITS(tmp, 0, rnum, P3(dp));
1523 			}
1524 
1525 			sp = (sq += scanspan);
1526 			dp = (dq += scanspan);
1527 			height--;
1528 		}
1529 	} else {
1530 		/* copy backward (right-to-left) */
1531 		sp = basep + ((srcx + w) / 32) * 4;
1532 		dp = basep + ((dstx + w) / 32) * 4;
1533 
1534 		sboff = (srcx + w) & ALIGNMASK;
1535 		sboff -= rnum;
1536 		if (sboff < 0) {
1537 			sp -= BYTESDONE;
1538 			sboff += 32;
1539 		}
1540 
1541 		sq = sp;
1542 		dq = dp;
1543 		while (height > 0) {
1544 			if (rnum != 0) {
1545 				GETBITS(P0(sp), sboff, rnum, tmp);
1546 				PUTBITS(tmp, 0, rnum, P0(dp));
1547 				GETBITS(P1(sp), sboff, rnum, tmp);
1548 				PUTBITS(tmp, 0, rnum, P1(dp));
1549 				GETBITS(P2(sp), sboff, rnum, tmp);
1550 				PUTBITS(tmp, 0, rnum, P2(dp));
1551 				GETBITS(P3(sp), sboff, rnum, tmp);
1552 				PUTBITS(tmp, 0, rnum, P3(dp));
1553 			}
1554 
1555 			for (cnt = full; cnt; cnt--) {
1556 				sp -= BYTESDONE;
1557 				dp -= BYTESDONE;
1558 				GETBITS(P0(sp), sboff, 32, tmp);
1559 				*P0(dp) = tmp;
1560 				GETBITS(P1(sp), sboff, 32, tmp);
1561 				*P1(dp) = tmp;
1562 				GETBITS(P2(sp), sboff, 32, tmp);
1563 				*P2(dp) = tmp;
1564 				GETBITS(P3(sp), sboff, 32, tmp);
1565 				*P3(dp) = tmp;
1566 			}
1567 
1568 			if (lmask != 0) {
1569 				if (sbover)
1570 					sp -= BYTESDONE;
1571 				dp -= BYTESDONE;
1572 				GETBITS(P0(sp), sb, lnum, tmp);
1573 				PUTBITS(tmp, db, lnum, P0(dp));
1574 				GETBITS(P1(sp), sb, lnum, tmp);
1575 				PUTBITS(tmp, db, lnum, P1(dp));
1576 				GETBITS(P2(sp), sb, lnum, tmp);
1577 				PUTBITS(tmp, db, lnum, P2(dp));
1578 				GETBITS(P3(sp), sb, lnum, tmp);
1579 				PUTBITS(tmp, db, lnum, P3(dp));
1580 			}
1581 
1582 			sp = (sq += scanspan);
1583 			dp = (dq += scanspan);
1584 			height--;
1585 		}
1586 	}
1587 }
1588 
1589 /*
1590  * Map a character.
1591  */
1592 static int
1593 om_mapchar(void *cookie, int c, u_int *cp)
1594 {
1595 	struct rasops_info *ri = cookie;
1596 	struct wsdisplay_font *wf = ri->ri_font;
1597 
1598 	if (wf->encoding != WSDISPLAY_FONTENC_ISO) {
1599 		c = wsfont_map_unichar(wf, c);
1600 
1601 		if (c < 0)
1602 			goto fail;
1603 	}
1604 	if (c < wf->firstchar || c >= (wf->firstchar + wf->numchars))
1605 		goto fail;
1606 
1607 	*cp = c;
1608 	return 5;
1609 
1610  fail:
1611 	*cp = ' ';
1612 	return 0;
1613 }
1614 
1615 /*
1616  * Position|{enable|disable} the cursor at the specified location.
1617  */
1618 static void
1619 om_cursor(void *cookie, int on, int row, int col)
1620 {
1621 	struct rasops_info *ri = cookie;
1622 	int startx;
1623 	int width;
1624 	int height;
1625 	int sh, sl;
1626 	int y;
1627 	int scanspan;
1628 	uint8_t *p;
1629 
1630 	if (!on) {
1631 		/* make sure it's on */
1632 		if ((ri->ri_flg & RI_CURSOR) == 0)
1633 			return;
1634 
1635 		row = ri->ri_crow;
1636 		col = ri->ri_ccol;
1637 	} else {
1638 		/* unpaint the old copy. */
1639 		ri->ri_crow = row;
1640 		ri->ri_ccol = col;
1641 	}
1642 
1643 	scanspan = ri->ri_stride;
1644 	y = ri->ri_font->fontheight * row;
1645 	startx = ri->ri_font->fontwidth * col;
1646 	width = ri->ri_font->fontwidth;
1647 	height = ri->ri_font->fontheight;
1648 	sh = startx >> 5;
1649 	sl = startx & 0x1f;
1650 	p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4;
1651 
1652 	/* ROP_INV2 ignores data from MPU and inverts the current VRAM data */
1653 	om_fill(hwplanemask, ROP_INV2, p, sl, scanspan, 0, width, height);
1654 
1655 	ri->ri_flg ^= RI_CURSOR;
1656 
1657 	/* reset mask value */
1658 	om_reset_planemask_and_rop();
1659 }
1660 
1661 /*
1662  * Allocate attribute. We just pack these into an integer.
1663  *
1664  * Attribute bitmap:
1665  *  b31:    Multi color (used by copyrows)
1666  *  b30-18: 0 (reserved)
1667  *  b17:    Underline (not supported yet)
1668  *  b16:    Bold (or HILIT if 1bpp; not supported yet)
1669  *  b15-8:  fg color code
1670  *  b7-0:   bg color code
1671  */
1672 #if 0
1673 /*
1674  * Future plan:
1675  * Place fg and bg side by side in advance to reduce the computation cost
1676  * at the time of ROP setting.
1677  *
1678  * bit: 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
1679  *      f7 b7 f6 b6 f5 b5 f4 b4 f3 b3 f2 b2 f1 b1 f0 b0
1680  *
1681  * In this form, use bit1..0 if 1bpp, use bit7..0 if 4bpp.
1682  */
1683 #endif
1684 static int
1685 om_allocattr(void *cookie, int fg, int bg, int flags, long *attrp)
1686 {
1687 	struct rasops_info *ri = cookie;
1688 	int planecount = ri->ri_depth;
1689 	uint32_t a;
1690 	uint16_t c;
1691 
1692 	a = 0;
1693 	c = 0;
1694 
1695 	if ((flags & WSATTR_BLINK) != 0)
1696 		return EINVAL;
1697 
1698 	if ((flags & WSATTR_WSCOLORS) == 0) {
1699 		fg = WSCOL_WHITE;	/* maybe 7 or 1 */
1700 		bg = WSCOL_BLACK;	/* maybe 0 */
1701 	}
1702 
1703 	if ((flags & WSATTR_REVERSE) != 0) {
1704 		int tmp;
1705 		tmp = fg;
1706 		fg = bg;
1707 		bg = tmp;
1708 	}
1709 
1710 	if ((flags & WSATTR_HILIT) != 0) {
1711 		if (planecount == 1) {
1712 #if 0
1713 			a |= OMFB_ATTR_BOLD;
1714 #else
1715 			return EINVAL;
1716 #endif
1717 		} else if (fg < 8) {
1718 			fg += 8;
1719 		}
1720 	}
1721 
1722 	if ((flags & WSATTR_UNDERLINE) != 0) {
1723 #if 0
1724 		a |= OMFB_ATTR_UNDERLINE;
1725 #else
1726 		return EINVAL;
1727 #endif
1728 	}
1729 
1730 	fg &= hwplanemask;
1731 	bg &= hwplanemask;
1732 
1733 #if 0
1734 	int i;
1735 	for (i = 0; i < planecount; i++) {
1736 		c += c;
1737 		c += ((fg & 1) << 1) | (bg & 1);
1738 		fg >>= 1;
1739 		bg >>= 1;
1740 	}
1741 #else
1742 	c = (fg  << 8) | bg;
1743 #endif
1744 	a |= c;
1745 
1746 	*attrp = a;
1747 	return 0;
1748 }
1749 
1750 static void
1751 om_unpack_attr(long attr, uint8_t *fg, uint8_t *bg, int *underline)
1752 {
1753 	uint8_t f, b;
1754 
1755 	f = (attr >> 8) & hwplanemask;
1756 	b = attr & hwplanemask;
1757 
1758 	if (fg)
1759 		*fg = f;
1760 	if (bg)
1761 		*bg = b;
1762 }
1763 
1764 /*
1765  * Init subset of rasops(9) for omrasops.
1766  */
1767 int
1768 omrasops1_init(struct rasops_info *ri, int wantrows, int wantcols)
1769 {
1770 
1771 	omrasops_init(ri, wantrows, wantcols);
1772 
1773 	/* fill our own emulops */
1774 	ri->ri_ops.cursor    = om_cursor;
1775 	ri->ri_ops.mapchar   = om_mapchar;
1776 	ri->ri_ops.putchar   = om_putchar;
1777 	ri->ri_ops.copycols  = om1_copycols;
1778 	ri->ri_ops.erasecols = om_erasecols;
1779 	ri->ri_ops.copyrows  = om1_copyrows;
1780 	ri->ri_ops.eraserows = om_eraserows;
1781 	ri->ri_ops.allocattr = om_allocattr;
1782 	ri->ri_caps = WSSCREEN_REVERSE;
1783 
1784 	ri->ri_flg |= RI_CFGDONE;
1785 
1786 	return 0;
1787 }
1788 
1789 int
1790 omrasops4_init(struct rasops_info *ri, int wantrows, int wantcols)
1791 {
1792 
1793 	omrasops_init(ri, wantrows, wantcols);
1794 
1795 	/* fill our own emulops */
1796 	ri->ri_ops.cursor    = om_cursor;
1797 	ri->ri_ops.mapchar   = om_mapchar;
1798 	ri->ri_ops.putchar   = om_putchar;
1799 	ri->ri_ops.copycols  = om4_copycols;
1800 	ri->ri_ops.erasecols = om_erasecols;
1801 	ri->ri_ops.copyrows  = om4_copyrows;
1802 	ri->ri_ops.eraserows = om_eraserows;
1803 	ri->ri_ops.allocattr = om_allocattr;
1804 	ri->ri_caps = WSSCREEN_HILIT | WSSCREEN_WSCOLORS | WSSCREEN_REVERSE;
1805 
1806 	ri->ri_flg |= RI_CFGDONE;
1807 
1808 	return 0;
1809 }
1810 
1811 static int
1812 omrasops_init(struct rasops_info *ri, int wantrows, int wantcols)
1813 {
1814 	int wsfcookie, bpp;
1815 
1816 	if (wantrows > OMRASOPS_MAX_ROWS)
1817 		wantrows = OMRASOPS_MAX_ROWS;
1818 	if (wantrows == 0)
1819 		wantrows = 34;
1820 	if (wantrows < 10)
1821 		wantrows = 10;
1822 	if (wantcols == 0)
1823 		wantcols = 80;
1824 	if (wantcols < 20)
1825 		wantcols = 20;
1826 
1827 	/* Use default font */
1828 	wsfont_init();
1829 	wsfcookie = wsfont_find(NULL, 0, 0, 0, WSDISPLAY_FONTORDER_L2R,
1830 	    WSDISPLAY_FONTORDER_L2R, WSFONT_FIND_BITMAP);
1831 	if (wsfcookie < 0)
1832 		panic("%s: no font available", __func__);
1833 	if (wsfont_lock(wsfcookie, &ri->ri_font))
1834 		panic("%s: unable to lock font", __func__);
1835 	ri->ri_wsfcookie = wsfcookie;
1836 
1837 	KASSERT(ri->ri_font->fontwidth > 4 && ri->ri_font->fontwidth <= 32);
1838 
1839 	/* all planes are independently addressed */
1840 	bpp = 1;
1841 
1842 	/* Now constrain what they get */
1843 	ri->ri_emuwidth = ri->ri_font->fontwidth * wantcols;
1844 	ri->ri_emuheight = ri->ri_font->fontheight * wantrows;
1845 	if (ri->ri_emuwidth > ri->ri_width)
1846 		ri->ri_emuwidth = ri->ri_width;
1847 	if (ri->ri_emuheight > ri->ri_height)
1848 		ri->ri_emuheight = ri->ri_height;
1849 
1850 	/* Reduce width until aligned on a 32-bit boundary */
1851 	while ((ri->ri_emuwidth * bpp & 31) != 0)
1852 		ri->ri_emuwidth--;
1853 
1854 	ri->ri_cols = ri->ri_emuwidth / ri->ri_font->fontwidth;
1855 	ri->ri_rows = ri->ri_emuheight / ri->ri_font->fontheight;
1856 	ri->ri_emustride = ri->ri_emuwidth * bpp >> 3;
1857 	ri->ri_ccol = 0;
1858 	ri->ri_crow = 0;
1859 	ri->ri_pelbytes = bpp >> 3;
1860 
1861 	ri->ri_xscale = (ri->ri_font->fontwidth * bpp) >> 3;
1862 	ri->ri_yscale = ri->ri_font->fontheight * ri->ri_stride;
1863 	ri->ri_fontscale = ri->ri_font->fontheight * ri->ri_font->stride;
1864 
1865 	/* Clear the entire display */
1866 	if ((ri->ri_flg & RI_CLEAR) != 0)
1867 		memset(ri->ri_bits, 0, ri->ri_stride * ri->ri_height);
1868 
1869 	/* Now centre our window if needs be */
1870 	ri->ri_origbits = ri->ri_bits;
1871 
1872 	if ((ri->ri_flg & RI_CENTER) != 0) {
1873 		ri->ri_bits += (((ri->ri_width * bpp >> 3) -
1874 		    ri->ri_emustride) >> 1) & ~3;
1875 		ri->ri_bits += ((ri->ri_height - ri->ri_emuheight) >> 1) *
1876 		    ri->ri_stride;
1877 		ri->ri_yorigin = (int)(ri->ri_bits - ri->ri_origbits)
1878 		   / ri->ri_stride;
1879 		ri->ri_xorigin = (((int)(ri->ri_bits - ri->ri_origbits)
1880 		   % ri->ri_stride) * 8 / bpp);
1881 	} else
1882 		ri->ri_xorigin = ri->ri_yorigin = 0;
1883 
1884 	return 0;
1885 }
1886