/* $OpenBSD: text.c,v 1.3 2017/04/18 14:16:48 nicm Exp $ */

/*
 * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18ff772f70Snicm
19ff772f70Snicm #include <sys/types.h>
20ff772f70Snicm
21ff772f70Snicm #include <ctype.h>
22ff772f70Snicm #include <string.h>
23ff772f70Snicm
24ff772f70Snicm #include "file.h"
25ff772f70Snicm #include "magic.h"
26ff772f70Snicm #include "xmalloc.h"
27ff772f70Snicm
/*
 * Keyword table used to guess what kind of text a buffer holds.  Each row
 * is { word, description, MIME type }.  text_try_words() compares each
 * whitespace-delimited word of the input against column 0 (exact,
 * case-sensitive match) and returns column 1, or column 2 when
 * MAGIC_TEST_MIME is set.  The table is terminated by an all-NULL row.
 */
static const char *text_words[][3] = {
	{ "msgid", "PO (gettext message catalogue)", "text/x-po" },
	{ "dnl", "M4 macro language pre-processor", "text/x-m4" },
	{ "import", "Java program", "text/x-java" },
	{ "\"libhdr\"", "BCPL program", "text/x-bcpl" },
	{ "\"LIBHDR\"", "BCPL program", "text/x-bcpl" },
	{ "//", "C++ program", "text/x-c++" },
	{ "virtual", "C++ program", "text/x-c++" },
	{ "class", "C++ program", "text/x-c++" },
	{ "public:", "C++ program", "text/x-c++" },
	{ "private:", "C++ program", "text/x-c++" },
	{ "/*", "C program", "text/x-c" },
	{ "#include", "C program", "text/x-c" },
	{ "char", "C program", "text/x-c" },
	{ "The", "English", "text/plain" },
	{ "the", "English", "text/plain" },
	{ "double", "C program", "text/x-c" },
	{ "extern", "C program", "text/x-c" },
	{ "float", "C program", "text/x-c" },
	{ "struct", "C program", "text/x-c" },
	{ "union", "C program", "text/x-c" },
	{ "CFLAGS", "make commands", "text/x-makefile" },
	{ "LDFLAGS", "make commands", "text/x-makefile" },
	{ "all:", "make commands", "text/x-makefile" },
	{ ".PRECIOUS", "make commands", "text/x-makefile" },
	{ ".ascii", "assembler program", "text/x-asm" },
	{ ".asciiz", "assembler program", "text/x-asm" },
	{ ".byte", "assembler program", "text/x-asm" },
	{ ".even", "assembler program", "text/x-asm" },
	{ ".globl", "assembler program", "text/x-asm" },
	{ ".text", "assembler program", "text/x-asm" },
	{ "clr", "assembler program", "text/x-asm" },
	{ "(input", "Pascal program", "text/x-pascal" },
	{ "program", "Pascal program", "text/x-pascal" },
	{ "record", "Pascal program", "text/x-pascal" },
	{ "dcl", "PL/1 program", "text/x-pl1" },
	{ "Received:", "mail", "text/x-mail" },
	{ ">From", "mail", "text/x-mail" },
	{ "Return-Path:", "mail", "text/x-mail" },
	{ "Cc:", "mail", "text/x-mail" },
	{ "Newsgroups:", "news", "text/x-news" },
	{ "Path:", "news", "text/x-news" },
	{ "Organization:", "news", "text/x-news" },
	{ "href=", "HTML document", "text/html" },
	{ "HREF=", "HTML document", "text/html" },
	{ "<body", "HTML document", "text/html" },
	{ "<BODY", "HTML document", "text/html" },
	{ "<html", "HTML document", "text/html" },
	{ "<HTML", "HTML document", "text/html" },
	{ "<!--", "HTML document", "text/html" },
	{ NULL, NULL, NULL }
};
80ff772f70Snicm
/*
 * Return 1 if c is acceptable in ASCII text and 0 otherwise.  Accepted are
 * the printable characters 32..126 plus the control characters BEL, BS, HT,
 * LF, FF, CR and ESC.  NUL is rejected before the strchr() lookup because
 * strchr() would otherwise match the terminator of the control list.
 */
static int
text_is_ascii(u_char c)
{
	/* Control characters tolerated in text; static so it is built once. */
	static const char	cc[] = "\007\010\011\012\014\015\033";

	if (c == '\0')
		return (0);
	if (strchr(cc, c) != NULL)
		return (1);
	return (c > 31 && c < 127);
}
92ff772f70Snicm
/*
 * Return non-zero if c is acceptable in ISO-8859 text: any byte of 160 or
 * above, or anything text_is_ascii() accepts.  Bytes 128..159 are rejected.
 */
static int
text_is_latin1(u_char c)
{
	if (c >= 160)
		return (1);
	return (text_is_ascii(c));
}
100ff772f70Snicm
/*
 * Return non-zero if c is acceptable in extended-ASCII text: any byte with
 * the high bit set (128 or above), or anything text_is_ascii() accepts.
 */
static int
text_is_extended(u_char c)
{
	if (c >= 128)
		return (1);
	return (text_is_ascii(c));
}
108ff772f70Snicm
/*
 * Apply the predicate f to each of the size bytes starting at base.
 * Returns 1 if f accepted every byte and 0 as soon as one is rejected.
 * A zero-length buffer never calls f and yields 1.
 */
static int
text_try_test(const void *base, size_t size, int (*f)(u_char))
{
	const u_char	*data = base;
	size_t		 offset;

	for (offset = 0; offset < size; offset++) {
		if (!f(data[offset]))
			return (0);
	}
	return (1);
}
121ff772f70Snicm
122ff772f70Snicm const char *
text_get_type(const void * base,size_t size)123ff772f70Snicm text_get_type(const void *base, size_t size)
124ff772f70Snicm {
125ff772f70Snicm if (text_try_test(base, size, text_is_ascii))
126ff772f70Snicm return ("ASCII");
127ff772f70Snicm if (text_try_test(base, size, text_is_latin1))
128ff772f70Snicm return ("ISO-8859");
129ff772f70Snicm if (text_try_test(base, size, text_is_extended))
130ff772f70Snicm return ("Non-ISO extended-ASCII");
131ff772f70Snicm return (NULL);
132ff772f70Snicm }
133ff772f70Snicm
134ff772f70Snicm const char *
text_try_words(const void * base,size_t size,int flags)135ff772f70Snicm text_try_words(const void *base, size_t size, int flags)
136ff772f70Snicm {
137ff772f70Snicm const char *cp, *end, *next, *word;
138ff772f70Snicm size_t wordlen;
139ff772f70Snicm u_int i;
140ff772f70Snicm
141*d8e84ae2Snicm end = (const char *)base + size;
142ff772f70Snicm for (cp = base; cp != end; /* nothing */) {
143ff772f70Snicm while (cp != end && isspace((u_char)*cp))
144ff772f70Snicm cp++;
145ff772f70Snicm
146ff772f70Snicm next = cp;
147ff772f70Snicm while (next != end && !isspace((u_char)*next))
148ff772f70Snicm next++;
149ff772f70Snicm
150ff772f70Snicm for (i = 0; /* nothing */; i++) {
151ff772f70Snicm word = text_words[i][0];
152ff772f70Snicm if (word == NULL)
153ff772f70Snicm break;
154ff772f70Snicm wordlen = strlen(word);
155ff772f70Snicm
156ff772f70Snicm if ((size_t)(next - cp) != wordlen)
157ff772f70Snicm continue;
158ff772f70Snicm if (memcmp(cp, word, wordlen) != 0)
159ff772f70Snicm continue;
160ff772f70Snicm if (flags & MAGIC_TEST_MIME)
161ff772f70Snicm return (text_words[i][2]);
162ff772f70Snicm return (text_words[i][1]);
163ff772f70Snicm }
164ff772f70Snicm
165ff772f70Snicm cp = next;
166ff772f70Snicm }
167ff772f70Snicm return (NULL);
168ff772f70Snicm }
169