1 /* $OpenBSD: text.c,v 1.3 2017/04/18 14:16:48 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <string.h> 23 24 #include "file.h" 25 #include "magic.h" 26 #include "xmalloc.h" 27 28 static const char *text_words[][3] = { 29 { "msgid", "PO (gettext message catalogue)", "text/x-po" }, 30 { "dnl", "M4 macro language pre-processor", "text/x-m4" }, 31 { "import", "Java program", "text/x-java" }, 32 { "\"libhdr\"", "BCPL program", "text/x-bcpl" }, 33 { "\"LIBHDR\"", "BCPL program", "text/x-bcpl" }, 34 { "//", "C++ program", "text/x-c++" }, 35 { "virtual", "C++ program", "text/x-c++" }, 36 { "class", "C++ program", "text/x-c++" }, 37 { "public:", "C++ program", "text/x-c++" }, 38 { "private:", "C++ program", "text/x-c++" }, 39 { "/*", "C program", "text/x-c" }, 40 { "#include", "C program", "text/x-c" }, 41 { "char", "C program", "text/x-c" }, 42 { "The", "English", "text/plain" }, 43 { "the", "English", "text/plain" }, 44 { "double", "C program", "text/x-c" }, 45 { "extern", "C program", "text/x-c" }, 46 { "float", "C program", "text/x-c" }, 47 { "struct", "C program", "text/x-c" }, 48 { "union", "C program", "text/x-c" }, 49 { "CFLAGS", "make commands", "text/x-makefile" }, 50 { "LDFLAGS", "make commands", "text/x-makefile" }, 51 { "all:", "make commands", "text/x-makefile" }, 52 { ".PRECIOUS", "make commands", "text/x-makefile" }, 53 { ".ascii", "assembler program", "text/x-asm" }, 54 { ".asciiz", "assembler program", "text/x-asm" }, 55 { ".byte", "assembler program", "text/x-asm" }, 56 { ".even", "assembler program", "text/x-asm" }, 57 { ".globl", "assembler program", "text/x-asm" }, 58 { ".text", "assembler program", "text/x-asm" }, 59 { "clr", "assembler program", "text/x-asm" }, 60 { "(input", "Pascal program", "text/x-pascal" }, 61 { "program", "Pascal program", "text/x-pascal" }, 62 { "record", "Pascal program", "text/x-pascal" }, 63 { "dcl", "PL/1 program", "text/x-pl1" }, 64 { "Received:", "mail", "text/x-mail" }, 65 { ">From", "mail", "text/x-mail" }, 66 { "Return-Path:", "mail", "text/x-mail" }, 67 { "Cc:", "mail", "text/x-mail" }, 68 { "Newsgroups:", "news", "text/x-news" }, 69 { "Path:", "news", "text/x-news" }, 70 { "Organization:", "news", "text/x-news" }, 71 { "href=", "HTML document", "text/html" }, 72 { "HREF=", "HTML document", "text/html" }, 73 { "<body", "HTML document", "text/html" }, 74 { "<BODY", "HTML document", "text/html" }, 75 { "<html", "HTML document", "text/html" }, 76 { "<HTML", "HTML document", "text/html" }, 77 { "<!--", "HTML document", "text/html" }, 78 { NULL, NULL, NULL } 79 }; 80 81 static int 82 text_is_ascii(u_char c) 83 { 84 const char cc[] = "\007\010\011\012\014\015\033"; 85 86 if (c == '\0') 87 return (0); 88 if (strchr(cc, c) != NULL) 89 return (1); 90 return (c > 31 && c < 127); 91 } 92 93 static int 94 text_is_latin1(u_char c) 95 { 96 if (c >= 160) 97 return (1); 98 return (text_is_ascii(c)); 99 } 100 101 static int 102 text_is_extended(u_char c) 103 { 104 if (c >= 128) 105 return (1); 106 return (text_is_ascii(c)); 107 } 108 109 static int 110 text_try_test(const void *base, size_t size, int (*f)(u_char)) 111 { 112 const u_char *data = base; 113 size_t offset; 114 115 for (offset = 0; offset < size; offset++) { 116 if (!f(data[offset])) 117 return (0); 118 } 119 return (1); 120 } 121 122 const char * 123 text_get_type(const void *base, size_t size) 124 { 125 if (text_try_test(base, size, text_is_ascii)) 126 return ("ASCII"); 127 if (text_try_test(base, size, text_is_latin1)) 128 return ("ISO-8859"); 129 if (text_try_test(base, size, text_is_extended)) 130 return ("Non-ISO extended-ASCII"); 131 return (NULL); 132 } 133 134 const char * 135 text_try_words(const void *base, size_t size, int flags) 136 { 137 const char *cp, *end, *next, *word; 138 size_t wordlen; 139 u_int i; 140 141 end = (const char *)base + size; 142 for (cp = base; cp != end; /* nothing */) { 143 while (cp != end && isspace((u_char)*cp)) 144 cp++; 145 146 next = cp; 147 while (next != end && !isspace((u_char)*next)) 148 next++; 149 150 for (i = 0; /* nothing */; i++) { 151 word = text_words[i][0]; 152 if (word == NULL) 153 break; 154 wordlen = strlen(word); 155 156 if ((size_t)(next - cp) != wordlen) 157 continue; 158 if (memcmp(cp, word, wordlen) != 0) 159 continue; 160 if (flags & MAGIC_TEST_MIME) 161 return (text_words[i][2]); 162 return (text_words[i][1]); 163 } 164 165 cp = next; 166 } 167 return (NULL); 168 } 169