xref: /inferno-os/appl/lib/html.b (revision 37da2899f40661e3e9631e497da8dc59b971cbd0)
1*37da2899SCharles.Forsythimplement HTML;
2*37da2899SCharles.Forsyth
3*37da2899SCharles.Forsythinclude "sys.m";
4*37da2899SCharles.Forsythinclude "html.m";
5*37da2899SCharles.Forsythinclude "strinttab.m";
6*37da2899SCharles.Forsyth
7*37da2899SCharles.Forsythsys:	Sys;
8*37da2899SCharles.ForsythT:	StringIntTab;
9*37da2899SCharles.Forsyth
# Stringtab: a (name, value) pair.
# NOTE(review): nothing in the visible part of this file refers to this
# adt; the tables below are built from T->StringInt instead.
Stringtab: adt
{
	name:	string;
	val:		int;
};
15*37da2899SCharles.Forsyth
16*37da2899SCharles.Forsythchartab:= array[] of { T->StringInt
17*37da2899SCharles.Forsyth	("AElig", 'Æ'),
18*37da2899SCharles.Forsyth	("Aacute", 'Á'),
19*37da2899SCharles.Forsyth	("Acirc", 'Â'),
20*37da2899SCharles.Forsyth	("Agrave", 'À'),
21*37da2899SCharles.Forsyth	("Aring", 'Å'),
22*37da2899SCharles.Forsyth	("Atilde", 'Ã'),
23*37da2899SCharles.Forsyth	("Auml", 'Ä'),
24*37da2899SCharles.Forsyth	("Ccedil", 'Ç'),
25*37da2899SCharles.Forsyth	("ETH", 'Ð'),
26*37da2899SCharles.Forsyth	("Eacute", 'É'),
27*37da2899SCharles.Forsyth	("Ecirc", 'Ê'),
28*37da2899SCharles.Forsyth	("Egrave", 'È'),
29*37da2899SCharles.Forsyth	("Euml", 'Ë'),
30*37da2899SCharles.Forsyth	("Iacute", 'Í'),
31*37da2899SCharles.Forsyth	("Icirc", 'Î'),
32*37da2899SCharles.Forsyth	("Igrave", 'Ì'),
33*37da2899SCharles.Forsyth	("Iuml", 'Ï'),
34*37da2899SCharles.Forsyth	("Ntilde", 'Ñ'),
35*37da2899SCharles.Forsyth	("Oacute", 'Ó'),
36*37da2899SCharles.Forsyth	("Ocirc", 'Ô'),
37*37da2899SCharles.Forsyth	("Ograve", 'Ò'),
38*37da2899SCharles.Forsyth	("Oslash", 'Ø'),
39*37da2899SCharles.Forsyth	("Otilde", 'Õ'),
40*37da2899SCharles.Forsyth	("Ouml", 'Ö'),
41*37da2899SCharles.Forsyth	("THORN", 'Þ'),
42*37da2899SCharles.Forsyth	("Uacute", 'Ú'),
43*37da2899SCharles.Forsyth	("Ucirc", 'Û'),
44*37da2899SCharles.Forsyth	("Ugrave", 'Ù'),
45*37da2899SCharles.Forsyth	("Uuml", 'Ü'),
46*37da2899SCharles.Forsyth	("Yacute", 'Ý'),
47*37da2899SCharles.Forsyth	("aacute", 'á'),
48*37da2899SCharles.Forsyth	("acirc", 'â'),
49*37da2899SCharles.Forsyth	("acute", '´'),
50*37da2899SCharles.Forsyth	("aelig", 'æ'),
51*37da2899SCharles.Forsyth	("agrave", 'à'),
52*37da2899SCharles.Forsyth	("alpha", 'α'),
53*37da2899SCharles.Forsyth	("amp", '&'),
54*37da2899SCharles.Forsyth	("aring", 'å'),
55*37da2899SCharles.Forsyth	("atilde", 'ã'),
56*37da2899SCharles.Forsyth	("auml", 'ä'),
57*37da2899SCharles.Forsyth	("beta", 'β'),
58*37da2899SCharles.Forsyth	("brvbar", '¦'),
59*37da2899SCharles.Forsyth	("ccedil", 'ç'),
60*37da2899SCharles.Forsyth	("cdots", '⋯'),
61*37da2899SCharles.Forsyth	("cedil", '¸'),
62*37da2899SCharles.Forsyth	("cent", '¢'),
63*37da2899SCharles.Forsyth	("chi", 'χ'),
64*37da2899SCharles.Forsyth	("copy", '©'),
65*37da2899SCharles.Forsyth	("curren", '¤'),
66*37da2899SCharles.Forsyth	("ddots", '⋱'),
67*37da2899SCharles.Forsyth	("deg", '°'),
68*37da2899SCharles.Forsyth	("delta", 'δ'),
69*37da2899SCharles.Forsyth	("divide", '÷'),
70*37da2899SCharles.Forsyth	("eacute", 'é'),
71*37da2899SCharles.Forsyth	("ecirc", 'ê'),
72*37da2899SCharles.Forsyth	("egrave", 'è'),
73*37da2899SCharles.Forsyth	("emdash", '—'),
74*37da2899SCharles.Forsyth	("emsp", ' '),
75*37da2899SCharles.Forsyth	("endash", '–'),
76*37da2899SCharles.Forsyth	("ensp", ' '),
77*37da2899SCharles.Forsyth	("epsilon", 'ε'),
78*37da2899SCharles.Forsyth	("eta", 'η'),
79*37da2899SCharles.Forsyth	("eth", 'ð'),
80*37da2899SCharles.Forsyth	("euml", 'ë'),
81*37da2899SCharles.Forsyth	("frac12", '½'),
82*37da2899SCharles.Forsyth	("frac14", '¼'),
83*37da2899SCharles.Forsyth	("frac34", '¾'),
84*37da2899SCharles.Forsyth	("gamma", 'γ'),
85*37da2899SCharles.Forsyth	("gt", '>'),
86*37da2899SCharles.Forsyth	("iacute", 'í'),
87*37da2899SCharles.Forsyth	("icirc", 'î'),
88*37da2899SCharles.Forsyth	("iexcl", '¡'),
89*37da2899SCharles.Forsyth	("igrave", 'ì'),
90*37da2899SCharles.Forsyth	("iota", 'ι'),
91*37da2899SCharles.Forsyth	("iquest", '¿'),
92*37da2899SCharles.Forsyth	("iuml", 'ï'),
93*37da2899SCharles.Forsyth	("kappa", 'κ'),
94*37da2899SCharles.Forsyth	("lambda", 'λ'),
95*37da2899SCharles.Forsyth	("laquo", '«'),
96*37da2899SCharles.Forsyth	("ldots", '…'),
97*37da2899SCharles.Forsyth	("lt", '<'),
98*37da2899SCharles.Forsyth	("macr", '¯'),
99*37da2899SCharles.Forsyth	("micro", 'µ'),
100*37da2899SCharles.Forsyth	("middot", '·'),
101*37da2899SCharles.Forsyth	("mu", 'μ'),
102*37da2899SCharles.Forsyth	("nbsp", ' '),
103*37da2899SCharles.Forsyth	("not", '¬'),
104*37da2899SCharles.Forsyth	("ntilde", 'ñ'),
105*37da2899SCharles.Forsyth	("nu", 'ν'),
106*37da2899SCharles.Forsyth	("oacute", 'ó'),
107*37da2899SCharles.Forsyth	("ocirc", 'ô'),
108*37da2899SCharles.Forsyth	("ograve", 'ò'),
109*37da2899SCharles.Forsyth	("omega", 'ω'),
110*37da2899SCharles.Forsyth	("omicron", 'ο'),
111*37da2899SCharles.Forsyth	("ordf", 'ª'),
112*37da2899SCharles.Forsyth	("ordm", 'º'),
113*37da2899SCharles.Forsyth	("oslash", 'ø'),
114*37da2899SCharles.Forsyth	("otilde", 'õ'),
115*37da2899SCharles.Forsyth	("ouml", 'ö'),
116*37da2899SCharles.Forsyth	("para", '¶'),
117*37da2899SCharles.Forsyth	("phi", 'φ'),
118*37da2899SCharles.Forsyth	("pi", 'π'),
119*37da2899SCharles.Forsyth	("plusmn", '±'),
120*37da2899SCharles.Forsyth	("pound", '£'),
121*37da2899SCharles.Forsyth	("psi", 'ψ'),
122*37da2899SCharles.Forsyth	("quad", ' '),
123*37da2899SCharles.Forsyth	("quot", '"'),
124*37da2899SCharles.Forsyth	("raquo", '»'),
125*37da2899SCharles.Forsyth	("reg", '®'),
126*37da2899SCharles.Forsyth	("rho", 'ρ'),
127*37da2899SCharles.Forsyth	("sect", '§'),
128*37da2899SCharles.Forsyth	("shy", '­'),
129*37da2899SCharles.Forsyth	("sigma", 'σ'),
130*37da2899SCharles.Forsyth	("sp", ' '),
131*37da2899SCharles.Forsyth	("sup1", '¹'),
132*37da2899SCharles.Forsyth	("sup2", '²'),
133*37da2899SCharles.Forsyth	("sup3", '³'),
134*37da2899SCharles.Forsyth	("szlig", 'ß'),
135*37da2899SCharles.Forsyth	("tau", 'τ'),
136*37da2899SCharles.Forsyth	("theta", 'θ'),
137*37da2899SCharles.Forsyth	("thinsp", ' '),
138*37da2899SCharles.Forsyth	("thorn", 'þ'),
139*37da2899SCharles.Forsyth	("times", '×'),
140*37da2899SCharles.Forsyth	("trade", '™'),
141*37da2899SCharles.Forsyth	("uacute", 'ú'),
142*37da2899SCharles.Forsyth	("ucirc", 'û'),
143*37da2899SCharles.Forsyth	("ugrave", 'ù'),
144*37da2899SCharles.Forsyth	("uml", '¨'),
145*37da2899SCharles.Forsyth	("upsilon", 'υ'),
146*37da2899SCharles.Forsyth	("uuml", 'ü'),
147*37da2899SCharles.Forsyth	("varepsilon", '∈'),
148*37da2899SCharles.Forsyth	("varphi", 'ϕ'),
149*37da2899SCharles.Forsyth	("varpi", 'ϖ'),
150*37da2899SCharles.Forsyth	("varrho", 'ϱ'),
151*37da2899SCharles.Forsyth	("vdots", '⋮'),
152*37da2899SCharles.Forsyth	("vsigma", 'ς'),
153*37da2899SCharles.Forsyth	("vtheta", 'ϑ'),
154*37da2899SCharles.Forsyth	("xi", 'ξ'),
155*37da2899SCharles.Forsyth	("yacute", 'ý'),
156*37da2899SCharles.Forsyth	("yen", '¥'),
157*37da2899SCharles.Forsyth	("yuml", 'ÿ'),
158*37da2899SCharles.Forsyth	("zeta", 'ζ'),
159*37da2899SCharles.Forsyth};
160*37da2899SCharles.Forsyth
# htmlstringtab: maps lower-case tag names to their Txxx tag constants.
# gettag looks names up here with T->lookup, and lex2string maps tag
# values back to names with T->revlookup.
# The entries are listed in ascending string order; presumably T->lookup
# relies on that ordering, so keep new entries sorted (confirm against
# the StringIntTab implementation).
htmlstringtab := array[] of { T->StringInt
	("a", Ta),
	("address", Taddress),
	("applet", Tapplet),
	("area", Tarea),
	("att_footer", Tatt_footer),
	("b", Tb),
	("base", Tbase),
	("basefont", Tbasefont),
	("big", Tbig),
	("blink", Tblink),
	("blockquote", Tblockquote),
	("body", Tbody),
	("bq", Tbq),
	("br", Tbr),
	("caption", Tcaption),
	("center", Tcenter),
	("cite", Tcite),
	("code", Tcode),
	("col", Tcol),
	("colgroup", Tcolgroup),
	("dd", Tdd),
	("dfn", Tdfn),
	("dir", Tdir),
	("div", Tdiv),
	("dl", Tdl),
	("dt", Tdt),
	("em", Tem),
	("font", Tfont),
	("form", Tform),
	("frame", Tframe),
	("frameset", Tframeset),
	("h1", Th1),
	("h2", Th2),
	("h3", Th3),
	("h4", Th4),
	("h5", Th5),
	("h6", Th6),
	("head", Thead),
	("hr", Thr),
	("html", Thtml),
	("i", Ti),
	("img", Timg),
	("input", Tinput),
	("isindex", Tisindex),
	("item", Titem),
	("kbd", Tkbd),
	("li", Tli),
	("link", Tlink),
	("map", Tmap),
	("menu", Tmenu),
	("meta", Tmeta),
	("nobr", Tnobr),
	("noframes", Tnoframes),
	("ol", Tol),
	("option", Toption),
	("p", Tp),
	("param", Tparam),
	("pre", Tpre),
	("q", Tq),
	("samp", Tsamp),
	("script", Tscript),
	("select", Tselect),
	("small", Tsmall),
	("strike", Tstrike),
	("strong", Tstrong),
	("style", Tstyle),
	("sub", Tsub),
	("sup", Tsup),
	("t", Tt),
	("table", Ttable),
	("tbody", Ttbody),
	("td", Ttd),
	("textarea", Ttextarea),
	("textflow", Ttextflow),
	("tfoot", Ttfoot),
	("th", Tth),
	("thead", Tthead),
	("title", Ttitle),
	("tr", Ttr),
	("tt", Ttt),
	("u", Tu),
	("ul", Tul),
	("var", Tvar)
};
246*37da2899SCharles.Forsyth
# Character-class bits stored in ctype; each is a distinct power of two
# so that classes can be or'ed together in a mask:
#	W	white space (space, newline, tab, carriage return)
#	D	decimal digit
#	L	lower-case ASCII letter
#	U	upper-case ASCII letter
#	N	'.' and '-', accepted inside names after the first character
W, D, L, U, N: con byte (1<<iota);
# Number of entries in ctype.  Values taken from a byte always index it
# safely; code that handles wider character values compares against
# NCTYPE before indexing.
NCTYPE: con 256;

# ctype[c] is the class of character c; every character not listed
# below has class 0.
ctype := array[NCTYPE] of {
	'0'=>D, '1'=>D, '2'=>D, '3'=>D, '4'=>D,
	'5'=>D, '6'=>D, '7'=>D, '8'=>D, '9'=>D,
	'A'=>U, 'B'=>U, 'C'=>U, 'D'=>U, 'E'=>U, 'F'=>U,
	'G'=>U, 'H'=>U, 'I'=>U, 'J'=>U, 'K'=>U, 'L'=>U,
	'M'=>U, 'N'=>U, 'O'=>U, 'P'=>U, 'Q'=>U, 'R'=>U,
	'S'=>U, 'T'=>U, 'U'=>U, 'V'=>U, 'W'=>U, 'X'=>U,
	'Y'=>U, 'Z'=>U,
	'a'=>L, 'b'=>L, 'c'=>L, 'd'=>L, 'e'=>L, 'f'=>L,
	'g'=>L, 'h'=>L, 'i'=>L, 'j'=>L, 'k'=>L, 'l'=>L,
	'm'=>L, 'n'=>L, 'o'=>L, 'p'=>L, 'q'=>L, 'r'=>L,
	's'=>L, 't'=>L, 'u'=>L, 'v'=>L, 'w'=>L, 'x'=>L,
	'y'=>L, 'z'=>L,
	'.'=>N, '-'=>N,
	' '=>W, '\n'=>W, '\t'=>W, '\r'=>W,
	* => byte 0
};
267*37da2899SCharles.Forsyth
# lex: split the HTML source in b into an array of tokens.
#
#	b	the raw document bytes
#	charset	Latin1 or UTF8; passed through to gettag and getdata
#	keepwh	if zero, white space in front of each token is dropped
#		and getdata turns line breaks into spaces; if non-zero
#		white space is kept
#
# Returns one Lex per tag (built by gettag) and one Lex with tag Data
# per run of text (built by getdata), in document order.  NUL bytes and
# <!-- ... --> comments between tokens are discarded.  Returns nil if
# the StringIntTab module cannot be loaded.
lex(b: array of byte, charset: int, keepwh: int): array of ref Lex
{
	# the modules are loaded on first use and kept for later calls
	if(sys == nil)
		sys = load Sys Sys->PATH;
	if(T == nil)
		T = load StringIntTab StringIntTab->PATH;
	if(T == nil) {
		sys->print("HTML->lex: couldn't load %s\n", StringIntTab->PATH);
		return nil;
	}

	a: array of ref Lex;
	ai := 0;
	i := 0;
	nb := len b;
	for(;;){
   Whitespace:
		for(;;){
			# ignore nulls
			while(i<nb && (int b[i] == 0))
				i++;
			# skip white space
			if(!keepwh) {
				while(i<nb) {
					c := int b[i];
					# 16ra0 is the Latin-1 non-breaking space.  The
					# literal in the original source looked like a plain
					# space, which would make this test redundant (space
					# already has class W), so it is spelled as a code
					# point here.  NOTE(review): confirm against the
					# pristine source.
					if(!(int (ctype[c]&W)) && c != 16ra0)
						break;
					i++;
				}
			}
			# skip comments; i+3 < nb means all four bytes of "<!--" exist
			if(i+3 < nb && int b[i]=='<' && int b[i+1]=='!'
					&& int b[i+2]=='-' && int b[i+3]=='-') {
				i += 4;
				# i+2 < nb lets a "-->" that ends the buffer be matched
				while(i+2 < nb){
					if(int b[i]=='-' && int b[i+1]=='-' && int b[i+2]=='>'){
						i += 3;
						continue Whitespace;
					}
					i++;
				}
				# unterminated comment: it swallows the rest of the
				# input rather than leaking its tail out as text
				i = nb;
				break;
			}
			break;
		}
		if(i == nb)
			break;
		# grow the token array in steps of 500
		if(ai == len a){
			na := array[len a + 500] of ref Lex;
			if(a != nil)
				na[0:] = a;
			a = na;
		}
		if(int b[i] == '<'){
			lx : ref Lex;
			(lx, i) = gettag(b, i, charset);
			a[ai++] = lx;
		}
		else {
			s: string;
			(s, i) = getdata(b, i, keepwh, charset);
			a[ai++] = ref Lex (Data, s, nil);
		}
	}
	return a[0:ai];
}
334*37da2899SCharles.Forsyth
# getdata: collect character data from b starting at index i, stopping
# just before the next '<' or at the end of b.
#
#	keepnls	if non-zero, line breaks are kept as '\n'; otherwise each
#		line break becomes a single space
#	charset	UTF8 decodes multi-byte sequences with sys->byte2char;
#		any other value treats each byte as one character
#
# NUL and 16r1a characters are dropped, and '&' is handed to ampersand
# for entity translation.  A carriage return next to a newline (on
# either side) is dropped so that a CR LF or LF CR pair counts as one
# line break.
# Returns the text and the index of the first unconsumed byte.
getdata(b: array of byte, i: int, keepnls, charset: int): (string, int)
{
	s := "";
	j := 0;
	c: int;
	nb := len b;

loop:
	while(i < nb){
		oldi := i;
		case charset{
		UTF8 =>
			n: int;
			(c, n, nil) = sys->byte2char(b, i);
			# byte2char consumes nothing when the buffer ends in the
			# middle of a sequence; take one byte as an error
			# character so the loop always advances
			if(n <= 0) {
				c = Sys->UTFerror;
				n = 1;
			}
			i += n;
		* =>
			# Latin1, and any unrecognised charset: one byte per character
			c = int b[i++];
		}
		case c {
		0 or 16r1a =>
			continue loop;
		'<' =>
			# leave the '<' for the caller
			i = oldi;
			break loop;
		'&' =>
			(c, i) = ampersand(b, i);
		'\n' =>
			if(!keepnls)
				c = ' ';
		'\r' =>
			# LF CR: the newline has already been emitted
			if(oldi > 0 && int b[oldi-1] == '\n')
				continue loop;
			# CR LF: the newline that follows will be emitted
			if(i < nb && int b[i] == '\n')
				continue loop;
			if(keepnls)
				c = '\n';
			else
				c = ' ';
		}
		s[j++] = c;
	}
	return (s, i);
}
376*37da2899SCharles.Forsyth
# gettag: parse the tag that starts at b[i]; the caller has checked that
# b[i] is '<'.
#
#	charset	UTF8 decodes attribute values with sys->byte2char; any
#		other value treats each byte as one character
#
# Returns the token and the index of the first byte after the tag.
# A recognised tag name sets ans.tag to its htmlstringtab value, plus
# RBRA for a closing tag (</name>).  An unrecognised name leaves ans.tag
# as Notfound and stores the lower-cased name in ans.text.  If the text
# after '<' (or "</") does not start with a letter, everything up to and
# including the next '>' is stored in ans.text.
# Attributes are prepended to the list as they are parsed, so ans.attr
# holds them in the reverse of source order.  An attribute without a
# value is stored with the empty string as its value.
gettag(b: array of byte, i, charset: int): (ref Lex, int)
{
	rbra := 0;
	nb := len b;
	ans := ref Lex(Notfound, "", nil);
	al: list of Attr;
	if(++i == nb)
		return (ans, i);
	istart := i;
	c := int b[i];
	if(c == '/') {
		rbra = RBRA;
		if(++i == nb)
			return (ans, i);
		c = int b[i];
	}
	if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
		# not a tag name: keep the raw text through the closing '>'
		while(i < nb) {
			c = int b[i++];
			if(c == '>')
				break;
		}
		ans.text = string b[istart:i];
		return (ans, i);
	}
	namstart := i;
	while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
		if(++i == nb) {
			# input ended inside the tag name
			ans.text = string b[istart:i];
			return (ans, i);
		}
		c = int b[i];
	}
	name := lowercase(b, namstart, i);
	(fnd, tag) := T->lookup(htmlstringtab, name);
	if(fnd)
		ans.tag = tag+rbra;
	else
		ans.text = name;
attrloop:
	while(i < nb){
		# look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c == '>') {
			i++;
			break;
		}
		if(c == '<')
			break;	# error: unclosed tag
		if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
			# error, not the start of a name
			# skip to end of tag
			while(i < nb) {
				c = int b[i++];
				if(c == '>')
					break;
			}
			break attrloop;
		}
		# gather name
		namstart = i;
		while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		name = lowercase(b, namstart, i);
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c != '=') {
			# no value for this attr
			al = (name, "") :: al;
			continue attrloop;
		}
		# skip whitespace
		if(++i == nb)
			break attrloop;
		c = int b[i];
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		# gather value
		quote := 0;
		if(c == '\'' || c == '"') {
			quote = c;
			i++;
		}
		val := "";
		nv := 0;
	valloop:
		while(i < nb) {
			case charset{
			UTF8 =>
				n: int;
				(c, n, nil) = sys->byte2char(b, i);
				# byte2char consumes nothing when the buffer ends in
				# the middle of a sequence; take one byte as an error
				# character so the loop always advances
				if(n <= 0) {
					c = Sys->UTFerror;
					n = 1;
				}
				i += n;
			* =>
				# Latin1, and any unrecognised charset: one byte per character
				c = int b[i++];
			}
			if(c == '>') {
				if(quote) {
					# c might be part of string (though not good style)
					# but if line ends before close quote, assume
					# there was an unmatched quote
					for(k := i; k < nb; k++) {
						c = int b[k];
						if(c == quote) {
							val[nv++] = '>';
							continue valloop;
						}
						if(c == '\n') {
							i--;
							break valloop;
						}
					}
				}
				# back up so the attribute loop sees the '>' and ends the tag
				i--;
				break valloop;
			}
			if(quote) {
				if(c == quote)
					break valloop;
				# newlines inside a quoted value are dropped,
				# tabs and carriage returns become spaces
				if(c == '\n')
					continue valloop;
				if(c == '\t' || c == '\r')
					c = ' ';
			}
			else {
				# an unquoted value ends at the first white space
				if(c<NCTYPE && int (ctype[c]&W))
					break valloop;
			}
			if(c == '&')
				(c, i) = ampersand(b, i);
			val[nv++] = c;
		}
		al = (name, val) :: al;
		if(i < nb)
			c = int b[i];
	}
	ans.attr = al;
	return (ans, i);
}
530*37da2899SCharles.Forsyth
# ampersand: translate the character entity whose '&' has just been
# consumed; b[i] is the first byte after the '&'.
#
# Two forms are recognised:
#	&#ddd	decimal character code in the range 1..255
#		(160, the non-breaking space, is returned as ' ')
#	&name	a name found in chartab
# One trailing ';' or newline after the entity is consumed as well.
#
# Returns (character, index of the first unconsumed byte).  If nothing
# is recognised the result is ('&', i) with i unchanged, so the text
# after the '&' is read again as ordinary data.
ampersand(b: array of byte, i: int): (int, int)
{
	starti := i;
	c := 0;
	nb := len b;
	# a lone '&' at the very end of the input is ordinary text too
	if(i >= nb)
		return ('&', i);
	fnd := 0;
	ans := 0;
	if(int b[i] == '#'){
		i++;
		while(i<nb){
			d := int b[i];
			if(!(int (ctype[d]&D)))
				break;
			# once c has left the accepted range stop accumulating:
			# a long digit string must not overflow and wrap back
			# into 1..255
			if(c < 256)
				c = c*10 + d-'0';
			i++;
		}
		if(0<c && c<256) {
			if(c==160)
				c = ' ';   # non-breaking space
			ans = c;
			fnd = 1;
		}
	}
	else {
		s := "";
		k := 0;
		c = int b[i];
		# a name starts with a letter and continues with letters,
		# digits, '.' or '-'
		if(int (ctype[c]&(L|U))) {
			while(i<nb) {
				c = int b[i];
				if(!(int (ctype[c]&(L|U|D|N))))
					break;
				s[k++] = c;
				i++;
			}
		}
		(fnd, ans) = T->lookup(chartab, s);
	}
	if(!fnd)
		return ('&', starti);
	if(i<nb && (int b[i]==';' || int b[i]=='\n'))
		i++;
	return (ans, i);
}
577*37da2899SCharles.Forsyth
# lowercase: return bytes b[istart:iend] as a string, one character per
# byte, with the ASCII upper-case letters folded to lower case.
lowercase(b: array of byte, istart, iend: int): string
{
	out := "";
	n := 0;
	for(k := istart; k < iend; k++) {
		ch := int b[k];
		if(ch < NCTYPE && int (ctype[ch]&U))
			ch += 'a' - 'A';
		out[n++] = ch;
	}
	return out;
}
592*37da2899SCharles.Forsyth
# uppercase: return a copy of s with the ASCII lower-case letters folded
# to upper case; all other characters are copied unchanged.
uppercase(s: string): string
{
	out := "";
	n := len s;
	for(k := 0; k < n; k++) {
		ch := s[k];
		if(ch < NCTYPE && int (ctype[ch]&L))
			ch -= 'a' - 'A';
		out[k] = ch;
	}
	return out;
}
606*37da2899SCharles.Forsyth
# attrvalue: find the first attribute in attr whose name equals name.
# Returns (1, its value) when found and (0, "") otherwise.
attrvalue(attr: list of Attr, name: string): (int, string)
{
	for(rest := attr; rest != nil; rest = tl rest) {
		cur := hd rest;
		if(cur.name == name)
			return (1, cur.value);
	}
	return (0, "");
}
617*37da2899SCharles.Forsyth
# globalattr: look up attribute attr on the first token in html whose
# tag equals tag.  Only that first token is examined: the result is
# whatever attrvalue reports for it.  Returns (0, "") if no token has
# the tag.
globalattr(html: array of ref Lex, tag: int, attr: string): (int, string)
{
	n := len html;
	for(k := 0; k < n; k++) {
		tok := html[k];
		if(tok.tag != tag)
			continue;
		return attrvalue(tok.attr, attr);
	}
	return (0, "");
}
625*37da2899SCharles.Forsyth
# isbreak: scan the tokens of h from index i onward.  Returns 1 if one
# of the listed opening tags is reached before any Data token, and 0 if
# a Data token comes first or the array ends.  All other tokens are
# skipped.
isbreak(h: array of ref Lex, i: int): int
{
	n := len h;
	while(i < n) {
		case h[i].tag {
		Data =>
			return 0;
		Tbr or Tp or Thr or Tbody or Tform or
		Th1 or Th2 or Th3 or Th4 or Th5 or Th6 or
		Taddress or Tblockquote or Tpre or
		Tul or Tol or Tdl or Tdir or Tmenu =>
			return 1;
		}
		i++;
	}
	return 0;
}
640*37da2899SCharles.Forsyth
# for debugging
# lex2string: render token l as text.  A Data token becomes its text in
# single quotes.  Any other token becomes <NAME attr='value' ...>, with
# a '/' after the '<' when the tag value is RBRA or more, and with NAME
# left out when the tag value is not in htmlstringtab.
lex2string(l: ref Lex): string
{
	if(l.tag == HTML->Data)
		return "'" + l.text + "'";

	s := "<";
	t := l.tag;
	if(t >= RBRA) {
		s += "/";
		t -= RBRA;
	}
	nm := T->revlookup(htmlstringtab, t);
	if(nm != nil)
		s += uppercase(nm);
	for(rest := l.attr; rest != nil; rest = tl rest) {
		at := hd rest;
		s += " " + at.name + "='" + at.value + "'";
	}
	return s + ">";
}
665