1 /* $OpenBSD: text.c,v 1.3 2017/04/18 14:16:48 nicm Exp $ */
2
3 /*
4 * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #include <sys/types.h>
20
21 #include <ctype.h>
22 #include <string.h>
23
24 #include "file.h"
25 #include "magic.h"
26 #include "xmalloc.h"
27
/*
 * Keywords that hint at the kind of text being examined.
 *
 * Each row holds three strings:
 *   [0] the keyword, compared against a whole whitespace-delimited token,
 *   [1] the human-readable description of the matching file type,
 *   [2] the MIME type of the matching file type.
 *
 * The table ends with a row whose entries are all NULL.  Both the strings
 * and the pointers to them are const: nothing writes to the table at run
 * time.
 */
static const char *const text_words[][3] = {
	/* Message catalogues and macro processors. */
	{ "msgid", "PO (gettext message catalogue)", "text/x-po" },
	{ "dnl", "M4 macro language pre-processor", "text/x-m4" },

	/* Programming languages. */
	{ "import", "Java program", "text/x-java" },
	{ "\"libhdr\"", "BCPL program", "text/x-bcpl" },
	{ "\"LIBHDR\"", "BCPL program", "text/x-bcpl" },
	{ "//", "C++ program", "text/x-c++" },
	{ "virtual", "C++ program", "text/x-c++" },
	{ "class", "C++ program", "text/x-c++" },
	{ "public:", "C++ program", "text/x-c++" },
	{ "private:", "C++ program", "text/x-c++" },
	{ "/*", "C program", "text/x-c" },
	{ "#include", "C program", "text/x-c" },
	{ "char", "C program", "text/x-c" },
	{ "The", "English", "text/plain" },
	{ "the", "English", "text/plain" },
	{ "double", "C program", "text/x-c" },
	{ "extern", "C program", "text/x-c" },
	{ "float", "C program", "text/x-c" },
	{ "struct", "C program", "text/x-c" },
	{ "union", "C program", "text/x-c" },

	/* Makefiles. */
	{ "CFLAGS", "make commands", "text/x-makefile" },
	{ "LDFLAGS", "make commands", "text/x-makefile" },
	{ "all:", "make commands", "text/x-makefile" },
	{ ".PRECIOUS", "make commands", "text/x-makefile" },

	/* Assembler. */
	{ ".ascii", "assembler program", "text/x-asm" },
	{ ".asciiz", "assembler program", "text/x-asm" },
	{ ".byte", "assembler program", "text/x-asm" },
	{ ".even", "assembler program", "text/x-asm" },
	{ ".globl", "assembler program", "text/x-asm" },
	{ ".text", "assembler program", "text/x-asm" },
	{ "clr", "assembler program", "text/x-asm" },

	/* Pascal and PL/1. */
	{ "(input", "Pascal program", "text/x-pascal" },
	{ "program", "Pascal program", "text/x-pascal" },
	{ "record", "Pascal program", "text/x-pascal" },
	{ "dcl", "PL/1 program", "text/x-pl1" },

	/* Mail and news headers. */
	{ "Received:", "mail", "text/x-mail" },
	{ ">From", "mail", "text/x-mail" },
	{ "Return-Path:", "mail", "text/x-mail" },
	{ "Cc:", "mail", "text/x-mail" },
	{ "Newsgroups:", "news", "text/x-news" },
	{ "Path:", "news", "text/x-news" },
	{ "Organization:", "news", "text/x-news" },

	/* HTML. */
	{ "href=", "HTML document", "text/html" },
	{ "HREF=", "HTML document", "text/html" },
	{ "<body", "HTML document", "text/html" },
	{ "<BODY", "HTML document", "text/html" },
	{ "<html", "HTML document", "text/html" },
	{ "<HTML", "HTML document", "text/html" },
	{ "<!--", "HTML document", "text/html" },

	/* Terminator. */
	{ NULL, NULL, NULL }
};
80
/*
 * Report whether a byte counts as ASCII text.
 *
 * Accepted bytes are the printable range 32-126 plus the control characters
 * BEL, BS, HT, LF, FF, CR and ESC.  Every other byte, including NUL, VT and
 * DEL, is rejected.  Returns 1 if accepted, 0 otherwise.
 */
static int
text_is_ascii(u_char c)
{
	switch (c) {
	case 007:	/* BEL */
	case 010:	/* BS */
	case 011:	/* HT */
	case 012:	/* LF */
	case 014:	/* FF */
	case 015:	/* CR */
	case 033:	/* ESC */
		return (1);
	default:
		break;
	}

	/* Printable characters: space through tilde. */
	return (c >= 32 && c <= 126);
}
92
/*
 * Report whether a byte is acceptable in the Latin-1 test: any byte in the
 * range 160-255, or any byte that text_is_ascii() accepts.  Returns 1 if
 * acceptable, 0 otherwise.
 */
static int
text_is_latin1(u_char c)
{
	return (c >= 160 || text_is_ascii(c));
}
100
/*
 * Report whether a byte is acceptable in the extended-ASCII test: every byte
 * with the high bit set (128-255) passes outright; bytes below 128 are
 * deferred to text_is_ascii().
 */
static int
text_is_extended(u_char c)
{
	return (c < 128 ? text_is_ascii(c) : 1);
}
108
/*
 * Apply the predicate f to each of the size bytes starting at base, in
 * order.  Returns 0 as soon as f rejects a byte (later bytes are not
 * examined) and 1 if f accepted every byte.  A zero-length buffer yields 1
 * without calling f.
 */
static int
text_try_test(const void *base, size_t size, int (*f)(u_char))
{
	const u_char	*p = base;

	while (size-- > 0) {
		if (f(*p++) == 0)
			return (0);
	}
	return (1);
}
121
122 const char *
text_get_type(const void * base,size_t size)123 text_get_type(const void *base, size_t size)
124 {
125 if (text_try_test(base, size, text_is_ascii))
126 return ("ASCII");
127 if (text_try_test(base, size, text_is_latin1))
128 return ("ISO-8859");
129 if (text_try_test(base, size, text_is_extended))
130 return ("Non-ISO extended-ASCII");
131 return (NULL);
132 }
133
134 const char *
text_try_words(const void * base,size_t size,int flags)135 text_try_words(const void *base, size_t size, int flags)
136 {
137 const char *cp, *end, *next, *word;
138 size_t wordlen;
139 u_int i;
140
141 end = (const char *)base + size;
142 for (cp = base; cp != end; /* nothing */) {
143 while (cp != end && isspace((u_char)*cp))
144 cp++;
145
146 next = cp;
147 while (next != end && !isspace((u_char)*next))
148 next++;
149
150 for (i = 0; /* nothing */; i++) {
151 word = text_words[i][0];
152 if (word == NULL)
153 break;
154 wordlen = strlen(word);
155
156 if ((size_t)(next - cp) != wordlen)
157 continue;
158 if (memcmp(cp, word, wordlen) != 0)
159 continue;
160 if (flags & MAGIC_TEST_MIME)
161 return (text_words[i][2]);
162 return (text_words[i][1]);
163 }
164
165 cp = next;
166 }
167 return (NULL);
168 }
169