xref: /openbsd-src/usr.bin/mandoc/preconv.c (revision e5157e49389faebcb42b7237d55fbf096d9c2523)
1 /*	$OpenBSD: preconv.c,v 1.3 2014/11/14 04:23:08 schwarze Exp $ */
2 /*
3  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include "mandoc.h"
24 #include "libmandoc.h"
25 
26 int
27 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
28     int *filenc)
29 {
30 	size_t		 i;
31 	int		 state;
32 	unsigned int	 accum;
33 	unsigned char	 cu;
34 
35 	if ( ! (*filenc & MPARSE_UTF8))
36 		goto latin;
37 
38 	state = 0;
39 	accum = 0U;
40 
41 	for (i = *ii; i < ib->sz; i++) {
42 		cu = ib->buf[i];
43 		if (state) {
44 			if ( ! (cu & 128) || (cu & 64)) {
45 				/* Bad sequence header. */
46 				break;
47 			}
48 
49 			/* Accept only legitimate bit patterns. */
50 
51 			if (cu > 191 || cu < 128) {
52 				/* Bad in-sequence bits. */
53 				break;
54 			}
55 
56 			accum |= (cu & 63) << --state * 6;
57 
58 			if (state)
59 				continue;
60 
61 			if (accum < 0x80)
62 				ob->buf[(*oi)++] = accum;
63 			else
64 				*oi += snprintf(ob->buf + *oi,
65 				    11, "\\[u%.4X]", accum);
66 			*ii = i + 1;
67 			*filenc &= ~MPARSE_LATIN1;
68 			return(1);
69 		} else {
70 			/*
71 			 * Entering a UTF-8 state:  if we encounter a
72 			 * UTF-8 bitmask, calculate the expected UTF-8
73 			 * state from it.
74 			 */
75 			for (state = 0; state < 7; state++)
76 				if ( ! (cu & (1 << (7 - state))))
77 					break;
78 
79 			/* Accept only legitimate bit patterns. */
80 
81 			switch (state--) {
82 			case (4):
83 				if (cu <= 244 && cu >= 240) {
84 					accum = (cu & 7) << 18;
85 					continue;
86 				}
87 				/* Bad 4-sequence start bits. */
88 				break;
89 			case (3):
90 				if (cu <= 239 && cu >= 224) {
91 					accum = (cu & 15) << 12;
92 					continue;
93 				}
94 				/* Bad 3-sequence start bits. */
95 				break;
96 			case (2):
97 				if (cu <= 223 && cu >= 194) {
98 					accum = (cu & 31) << 6;
99 					continue;
100 				}
101 				/* Bad 2-sequence start bits. */
102 				break;
103 			default:
104 				/* Bad sequence bit mask. */
105 				break;
106 			}
107 			break;
108 		}
109 	}
110 
111 	/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
112 
113 latin:
114 	if ( ! (*filenc & MPARSE_LATIN1))
115 		return(0);
116 
117 	*oi += snprintf(ob->buf + *oi, 11,
118 	    "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
119 
120 	*filenc &= ~MPARSE_UTF8;
121 	return(1);
122 }
123 
124 int
125 preconv_cue(const struct buf *b, size_t offset)
126 {
127 	const char	*ln, *eoln, *eoph;
128 	size_t		 sz, phsz;
129 
130 	ln = b->buf + offset;
131 	sz = b->sz - offset;
132 
133 	/* Look for the end-of-line. */
134 
135 	if (NULL == (eoln = memchr(ln, '\n', sz)))
136 		eoln = ln + sz;
137 
138 	/* Check if we have the correct header/trailer. */
139 
140 	if ((sz = (size_t)(eoln - ln)) < 10 ||
141 	    memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
142 		return(MPARSE_UTF8 | MPARSE_LATIN1);
143 
144 	/* Move after the header and adjust for the trailer. */
145 
146 	ln += 7;
147 	sz -= 10;
148 
149 	while (sz > 0) {
150 		while (sz > 0 && ' ' == *ln) {
151 			ln++;
152 			sz--;
153 		}
154 		if (0 == sz)
155 			break;
156 
157 		/* Find the end-of-phrase marker (or eoln). */
158 
159 		if (NULL == (eoph = memchr(ln, ';', sz)))
160 			eoph = eoln - 3;
161 		else
162 			eoph++;
163 
164 		/* Only account for the "coding" phrase. */
165 
166 		if ((phsz = eoph - ln) < 7 ||
167 		    strncasecmp(ln, "coding:", 7)) {
168 			sz -= phsz;
169 			ln += phsz;
170 			continue;
171 		}
172 
173 		sz -= 7;
174 		ln += 7;
175 
176 		while (sz > 0 && ' ' == *ln) {
177 			ln++;
178 			sz--;
179 		}
180 		if (0 == sz)
181 			return(0);
182 
183 		/* Check us against known encodings. */
184 
185 		if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
186 			return(MPARSE_UTF8);
187 		if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
188 			return(MPARSE_LATIN1);
189 		return(0);
190 	}
191 	return(MPARSE_UTF8 | MPARSE_LATIN1);
192 }
193