xref: /openbsd-src/usr.bin/file/text.c (revision d8e84ae2eb8c4ff3171e61da9eec6edbb227a066)
1 /* $OpenBSD: text.c,v 1.3 2017/04/18 14:16:48 nicm Exp $ */
2 
3 /*
4  * Copyright (c) 2015 Nicholas Marriott <nicm@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <string.h>
23 
24 #include "file.h"
25 #include "magic.h"
26 #include "xmalloc.h"
27 
/*
 * Keyword table consulted by text_try_words().  Each row holds three
 * strings: [0] the exact whitespace-delimited token to look for, [1] the
 * human-readable description and [2] the MIME type reported for a match.
 * The table ends with an all-NULL row.  Both the strings and the pointers
 * are const so the whole table is read-only.
 */
static const char *const text_words[][3] = {
	{ "msgid", "PO (gettext message catalogue)", "text/x-po" },
	{ "dnl", "M4 macro language pre-processor", "text/x-m4" },
	{ "import", "Java program", "text/x-java" },
	{ "\"libhdr\"", "BCPL program", "text/x-bcpl" },
	{ "\"LIBHDR\"", "BCPL program", "text/x-bcpl" },
	{ "//", "C++ program", "text/x-c++" },
	{ "virtual", "C++ program", "text/x-c++" },
	{ "class", "C++ program", "text/x-c++" },
	{ "public:", "C++ program", "text/x-c++" },
	{ "private:", "C++ program", "text/x-c++" },
	{ "/*", "C program", "text/x-c" },
	{ "#include", "C program", "text/x-c" },
	{ "char", "C program", "text/x-c" },
	{ "The", "English", "text/plain" },
	{ "the", "English", "text/plain" },
	{ "double", "C program", "text/x-c" },
	{ "extern", "C program", "text/x-c" },
	{ "float", "C program", "text/x-c" },
	{ "struct", "C program", "text/x-c" },
	{ "union", "C program", "text/x-c" },
	{ "CFLAGS", "make commands", "text/x-makefile" },
	{ "LDFLAGS", "make commands", "text/x-makefile" },
	{ "all:", "make commands", "text/x-makefile" },
	{ ".PRECIOUS", "make commands", "text/x-makefile" },
	{ ".ascii", "assembler program", "text/x-asm" },
	{ ".asciiz", "assembler program", "text/x-asm" },
	{ ".byte", "assembler program", "text/x-asm" },
	{ ".even", "assembler program", "text/x-asm" },
	{ ".globl", "assembler program", "text/x-asm" },
	{ ".text", "assembler program", "text/x-asm" },
	{ "clr", "assembler program", "text/x-asm" },
	{ "(input", "Pascal program", "text/x-pascal" },
	{ "program", "Pascal program", "text/x-pascal" },
	{ "record", "Pascal program", "text/x-pascal" },
	{ "dcl", "PL/1 program", "text/x-pl1" },
	{ "Received:", "mail", "text/x-mail" },
	{ ">From", "mail", "text/x-mail" },
	{ "Return-Path:", "mail", "text/x-mail" },
	{ "Cc:", "mail", "text/x-mail" },
	{ "Newsgroups:", "news", "text/x-news" },
	{ "Path:", "news", "text/x-news" },
	{ "Organization:", "news", "text/x-news" },
	{ "href=", "HTML document", "text/html" },
	{ "HREF=", "HTML document", "text/html" },
	{ "<body", "HTML document", "text/html" },
	{ "<BODY", "HTML document", "text/html" },
	{ "<html", "HTML document", "text/html" },
	{ "<HTML", "HTML document", "text/html" },
	{ "<!--", "HTML document", "text/html" },
	{ NULL, NULL, NULL }
};
80 
/*
 * Return 1 if the byte counts as ASCII text: a printable character
 * (32..126) or one of BEL, BS, TAB, LF, FF, CR or ESC.  Every other byte,
 * including NUL, VT, DEL and anything >= 128, yields 0.
 */
static int
text_is_ascii(u_char c)
{
	switch (c) {
	case 0x07:	/* BEL */
	case 0x08:	/* BS */
	case 0x09:	/* TAB */
	case 0x0a:	/* LF */
	case 0x0c:	/* FF */
	case 0x0d:	/* CR */
	case 0x1b:	/* ESC */
		return (1);
	default:
		break;
	}
	return (c >= 32 && c <= 126);
}
92 
/*
 * Return 1 if the byte is accepted by the "ISO-8859" test: either a byte
 * in the range 160..255 or anything text_is_ascii() accepts.  Bytes
 * 128..159 are therefore rejected.
 */
static int
text_is_latin1(u_char c)
{
	if (c < 160)
		return (text_is_ascii(c));
	return (1);
}
100 
/*
 * Return 1 if the byte is accepted by the extended-ASCII test: any byte
 * with the high bit set (128..255) or anything text_is_ascii() accepts.
 */
static int
text_is_extended(u_char c)
{
	if (c < 128)
		return (text_is_ascii(c));
	return (1);
}
108 
/*
 * Apply the predicate f to each of the size bytes starting at base.
 * Returns 0 as soon as f rejects a byte and 1 if every byte is accepted;
 * an empty buffer (size 0) is accepted without calling f.
 */
static int
text_try_test(const void *base, size_t size, int (*f)(u_char))
{
	const u_char	*p = base;
	size_t		 left;

	for (left = size; left > 0; left--, p++) {
		if (f(*p) == 0)
			return (0);
	}
	return (1);
}
121 
122 const char *
text_get_type(const void * base,size_t size)123 text_get_type(const void *base, size_t size)
124 {
125 	if (text_try_test(base, size, text_is_ascii))
126 		return ("ASCII");
127 	if (text_try_test(base, size, text_is_latin1))
128 		return ("ISO-8859");
129 	if (text_try_test(base, size, text_is_extended))
130 		return ("Non-ISO extended-ASCII");
131 	return (NULL);
132 }
133 
134 const char *
text_try_words(const void * base,size_t size,int flags)135 text_try_words(const void *base, size_t size, int flags)
136 {
137 	const char	*cp, *end, *next, *word;
138 	size_t		 wordlen;
139 	u_int		 i;
140 
141 	end = (const char *)base + size;
142 	for (cp = base; cp != end; /* nothing */) {
143 		while (cp != end && isspace((u_char)*cp))
144 			cp++;
145 
146 		next = cp;
147 		while (next != end && !isspace((u_char)*next))
148 			next++;
149 
150 		for (i = 0; /* nothing */; i++) {
151 			word = text_words[i][0];
152 			if (word == NULL)
153 				break;
154 			wordlen = strlen(word);
155 
156 			if ((size_t)(next - cp) != wordlen)
157 				continue;
158 			if (memcmp(cp, word, wordlen) != 0)
159 				continue;
160 			if (flags & MAGIC_TEST_MIME)
161 				return (text_words[i][2]);
162 			return (text_words[i][1]);
163 		}
164 
165 		cp = next;
166 	}
167 	return (NULL);
168 }
169