1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include "dfa.h"
6
/*
 * Regular expressions for matching.
 */
10
/*
 * Patterns for text to be ignored.  buildre joins them with '|'
 * into a single expression (re[2]) after running each one through
 * strcpycase, so lower-case letters outside [...] match either case.
 */
char *ignore[] =
{
	/*
	 * HTML tags other than A, IMG, or FONT.
	 * A space is required somewhere inside the tag so that
	 * <email@address> is not caught.
	 */
	"<[ \n\r]*("
		"[^aif]|"
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",
	"<[ \n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/* HTML comments */
	"<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",

	/* assorted mail header lines and MIME fragments */
	"^message-id:.*\n([ ].*\n)*",
	"^in-reply-to:.*\n([ ].*\n)*",
	"^references:.*\n([ ].*\n)*",
	"^date:.*\n([ ].*\n)*",
	"^delivery-date:.*\n([ ].*\n)*",
	"e?smtp id .*",
	"^ id.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ ]*$",

	/* lines of base64-encoded data */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* lines of uuencoded data */
	"^[!-Z]+$",

	/* catch-alls: any single character, and newline */
	".",
	"\n",
};
57
/*
 * Patterns for the tokens of interest.  buildre joins them with '|'
 * into a single expression (re[1]) after running each one through
 * strcpycase.  The one entry matches a run of letters and a few
 * punctuation characters, or digits separated by '.' or ','.
 */
char *keywords[] =
{
	"([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+",
};
62
/* set by the -d flag; makes dregcomp print each expression it compiles */
int debug;
64
65 Dreprog*
dregcomp(char * buf)66 dregcomp(char *buf)
67 {
68 Reprog *r;
69 Dreprog *d;
70
71 if(debug)
72 print(">>> '%s'\n", buf);
73
74 r = regcomp(buf);
75 if(r == nil)
76 sysfatal("regcomp");
77 d = dregcvt(r);
78 if(d == nil)
79 sysfatal("dregcomp");
80 free(r);
81 return d;
82 }
83
/*
 * Copy the regular expression s to d, making it case-insensitive:
 * each lower-case ASCII letter x found outside a [...] character
 * class is written as the class [xX].  Everything else, including
 * the contents of existing classes, is copied unchanged.
 *
 * A backslash quotes the character after it: both are copied
 * verbatim, so a quoted letter is not expanded and a quoted '[' or
 * ']' does not open or close a class.  (Previously the escape state
 * was tracked but never consulted, so "\a" became "\[aA]".)
 *
 * The output is NOT NUL-terminated; the return value points just
 * past the last byte written so callers can keep appending.  No
 * bounds are checked: d must have room for up to four bytes per
 * byte of s.
 */
char*
strcpycase(char *d, char *s)
{
	int cc, esc;

	cc = 0;		/* bracket nesting depth; non-zero while inside [...] */
	esc = 0;	/* set when the previous character was an unquoted backslash */
	for(; *s; s++){
		if(esc){
			/* quoted character: copy as is, no class bookkeeping */
			*d++ = *s;
			esc = 0;
			continue;
		}
		if(*s == '\\'){
			*d++ = *s;
			esc = 1;
			continue;
		}
		if(*s == '[')
			cc++;
		else if(*s == ']')
			cc--;
		if(!cc && 'a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}
111
/*
 * Report a regular-expression error and terminate: msg is passed to
 * sysfatal with a "regerror: " prefix.
 * NOTE(review): presumably this is the error hook the regexp library
 * calls from regcomp -- confirm against regexp(2).
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}
117
118 void
buildre(Dreprog * re[3])119 buildre(Dreprog *re[3])
120 {
121 int i;
122 static char buf[16384], *s;
123
124 re[0] = dregcomp("^From ");
125
126 s = buf;
127 for(i=0; i<nelem(keywords); i++){
128 if(i != 0)
129 *s++ = '|';
130 s = strcpycase(s, keywords[i]);
131 }
132 *s = 0;
133 re[1] = dregcomp(buf);
134
135 s = buf;
136 for(i=0; i<nelem(ignore); i++){
137 if(i != 0)
138 *s++ = '|';
139 s = strcpycase(s, ignore[i]);
140 }
141 *s = 0;
142 re[2] = dregcomp(buf);
143 }
144
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".
 */
void
usage(void)
{
	fprint(2, "usage: regen [-d]\n");
	exits("usage");
}
151
152 void
main(int argc,char ** argv)153 main(int argc, char **argv)
154 {
155 Dreprog *re[3];
156 Biobuf b;
157
158 ARGBEGIN{
159 default:
160 usage();
161 case 'd':
162 debug = 1;
163 }ARGEND
164
165 if(argc != 0)
166 usage();
167
168 buildre(re);
169 Binit(&b, 1, OWRITE);
170 Bprintdfa(&b, re[0]);
171 Bprintdfa(&b, re[1]);
172 Bprintdfa(&b, re[2]);
173 exits(0);
174 }
175
176