1 /* $OpenBSD: preconv.c,v 1.3 2014/11/14 04:23:08 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <stdio.h> 22 #include <string.h> 23 #include "mandoc.h" 24 #include "libmandoc.h" 25 26 int 27 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 28 int *filenc) 29 { 30 size_t i; 31 int state; 32 unsigned int accum; 33 unsigned char cu; 34 35 if ( ! (*filenc & MPARSE_UTF8)) 36 goto latin; 37 38 state = 0; 39 accum = 0U; 40 41 for (i = *ii; i < ib->sz; i++) { 42 cu = ib->buf[i]; 43 if (state) { 44 if ( ! (cu & 128) || (cu & 64)) { 45 /* Bad sequence header. */ 46 break; 47 } 48 49 /* Accept only legitimate bit patterns. */ 50 51 if (cu > 191 || cu < 128) { 52 /* Bad in-sequence bits. */ 53 break; 54 } 55 56 accum |= (cu & 63) << --state * 6; 57 58 if (state) 59 continue; 60 61 if (accum < 0x80) 62 ob->buf[(*oi)++] = accum; 63 else 64 *oi += snprintf(ob->buf + *oi, 65 11, "\\[u%.4X]", accum); 66 *ii = i + 1; 67 *filenc &= ~MPARSE_LATIN1; 68 return(1); 69 } else { 70 /* 71 * Entering a UTF-8 state: if we encounter a 72 * UTF-8 bitmask, calculate the expected UTF-8 73 * state from it. 74 */ 75 for (state = 0; state < 7; state++) 76 if ( ! (cu & (1 << (7 - state)))) 77 break; 78 79 /* Accept only legitimate bit patterns. */ 80 81 switch (state--) { 82 case (4): 83 if (cu <= 244 && cu >= 240) { 84 accum = (cu & 7) << 18; 85 continue; 86 } 87 /* Bad 4-sequence start bits. */ 88 break; 89 case (3): 90 if (cu <= 239 && cu >= 224) { 91 accum = (cu & 15) << 12; 92 continue; 93 } 94 /* Bad 3-sequence start bits. */ 95 break; 96 case (2): 97 if (cu <= 223 && cu >= 194) { 98 accum = (cu & 31) << 6; 99 continue; 100 } 101 /* Bad 2-sequence start bits. */ 102 break; 103 default: 104 /* Bad sequence bit mask. */ 105 break; 106 } 107 break; 108 } 109 } 110 111 /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */ 112 113 latin: 114 if ( ! (*filenc & MPARSE_LATIN1)) 115 return(0); 116 117 *oi += snprintf(ob->buf + *oi, 11, 118 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 119 120 *filenc &= ~MPARSE_UTF8; 121 return(1); 122 } 123 124 int 125 preconv_cue(const struct buf *b, size_t offset) 126 { 127 const char *ln, *eoln, *eoph; 128 size_t sz, phsz; 129 130 ln = b->buf + offset; 131 sz = b->sz - offset; 132 133 /* Look for the end-of-line. */ 134 135 if (NULL == (eoln = memchr(ln, '\n', sz))) 136 eoln = ln + sz; 137 138 /* Check if we have the correct header/trailer. */ 139 140 if ((sz = (size_t)(eoln - ln)) < 10 || 141 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 142 return(MPARSE_UTF8 | MPARSE_LATIN1); 143 144 /* Move after the header and adjust for the trailer. */ 145 146 ln += 7; 147 sz -= 10; 148 149 while (sz > 0) { 150 while (sz > 0 && ' ' == *ln) { 151 ln++; 152 sz--; 153 } 154 if (0 == sz) 155 break; 156 157 /* Find the end-of-phrase marker (or eoln). */ 158 159 if (NULL == (eoph = memchr(ln, ';', sz))) 160 eoph = eoln - 3; 161 else 162 eoph++; 163 164 /* Only account for the "coding" phrase. */ 165 166 if ((phsz = eoph - ln) < 7 || 167 strncasecmp(ln, "coding:", 7)) { 168 sz -= phsz; 169 ln += phsz; 170 continue; 171 } 172 173 sz -= 7; 174 ln += 7; 175 176 while (sz > 0 && ' ' == *ln) { 177 ln++; 178 sz--; 179 } 180 if (0 == sz) 181 return(0); 182 183 /* Check us against known encodings. */ 184 185 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 186 return(MPARSE_UTF8); 187 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 188 return(MPARSE_LATIN1); 189 return(0); 190 } 191 return(MPARSE_UTF8 | MPARSE_LATIN1); 192 } 193