1 /* $OpenBSD: preconv.c,v 1.7 2015/10/06 18:30:44 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <stdio.h> 22 #include <string.h> 23 #include "mandoc.h" 24 #include "libmandoc.h" 25 26 int 27 preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 28 int *filenc) 29 { 30 unsigned char *cu; 31 int nby; 32 unsigned int accum; 33 34 cu = (unsigned char *)ib->buf + *ii; 35 assert(*cu & 0x80); 36 37 if ( ! (*filenc & MPARSE_UTF8)) 38 goto latin; 39 40 nby = 1; 41 while (nby < 5 && *cu & (1 << (7 - nby))) 42 nby++; 43 44 switch (nby) { 45 case 2: 46 accum = *cu & 0x1f; 47 if (accum < 0x02) /* Obfuscated ASCII. */ 48 goto latin; 49 break; 50 case 3: 51 accum = *cu & 0x0f; 52 break; 53 case 4: 54 accum = *cu & 0x07; 55 if (accum > 0x04) /* Beyond Unicode. */ 56 goto latin; 57 break; 58 default: /* Bad sequence header. */ 59 goto latin; 60 } 61 62 cu++; 63 switch (nby) { 64 case 3: 65 if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ 66 (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ 67 goto latin; 68 break; 69 case 4: 70 if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ 71 (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ 72 goto latin; 73 break; 74 default: 75 break; 76 } 77 78 while (--nby) { 79 if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ 80 goto latin; 81 accum <<= 6; 82 accum += *cu & 0x3f; 83 cu++; 84 } 85 86 assert(accum > 0x7f); 87 assert(accum < 0x110000); 88 assert(accum < 0xd800 || accum > 0xdfff); 89 90 *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); 91 *ii = (char *)cu - ib->buf; 92 *filenc &= ~MPARSE_LATIN1; 93 return 1; 94 95 latin: 96 if ( ! (*filenc & MPARSE_LATIN1)) 97 return 0; 98 99 *oi += snprintf(ob->buf + *oi, 11, 100 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 101 102 *filenc &= ~MPARSE_UTF8; 103 return 1; 104 } 105 106 int 107 preconv_cue(const struct buf *b, size_t offset) 108 { 109 const char *ln, *eoln, *eoph; 110 size_t sz, phsz; 111 112 ln = b->buf + offset; 113 sz = b->sz - offset; 114 115 /* Look for the end-of-line. */ 116 117 if (NULL == (eoln = memchr(ln, '\n', sz))) 118 eoln = ln + sz; 119 120 /* Check if we have the correct header/trailer. */ 121 122 if ((sz = (size_t)(eoln - ln)) < 10 || 123 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 124 return MPARSE_UTF8 | MPARSE_LATIN1; 125 126 /* Move after the header and adjust for the trailer. */ 127 128 ln += 7; 129 sz -= 10; 130 131 while (sz > 0) { 132 while (sz > 0 && ' ' == *ln) { 133 ln++; 134 sz--; 135 } 136 if (0 == sz) 137 break; 138 139 /* Find the end-of-phrase marker (or eoln). */ 140 141 if (NULL == (eoph = memchr(ln, ';', sz))) 142 eoph = eoln - 3; 143 else 144 eoph++; 145 146 /* Only account for the "coding" phrase. */ 147 148 if ((phsz = eoph - ln) < 7 || 149 strncasecmp(ln, "coding:", 7)) { 150 sz -= phsz; 151 ln += phsz; 152 continue; 153 } 154 155 sz -= 7; 156 ln += 7; 157 158 while (sz > 0 && ' ' == *ln) { 159 ln++; 160 sz--; 161 } 162 if (0 == sz) 163 return 0; 164 165 /* Check us against known encodings. */ 166 167 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 168 return MPARSE_UTF8; 169 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 170 return MPARSE_LATIN1; 171 return 0; 172 } 173 return MPARSE_UTF8 | MPARSE_LATIN1; 174 } 175