xref: /inferno-os/appl/lib/html.b (revision 37da2899f40661e3e9631e497da8dc59b971cbd0)
1implement HTML;
2
3include "sys.m";
4include "html.m";
5include "strinttab.m";
6
7sys:	Sys;
8T:	StringIntTab;
9
# A name paired with an integer value.
# NOTE(review): nothing in the visible part of this file refers to Stringtab
# (the tables below are built from T->StringInt) -- confirm it is still needed.
Stringtab: adt {
	name: string;
	val: int;
};
15
# Named character entities: entity name -> character value.
# ampersand() hands this table to T->lookup.  The names are listed in ascending
# byte order (upper case before lower case); presumably lookup depends on that
# ordering, so keep it when adding entries (verify against strinttab).
# Rows are grouped by initial letter.  The values for emsp, ensp, nbsp, quad,
# sp, thinsp and shy are space-like or invisible characters: edit with care.
# NOTE(review): varepsilon maps to '∈' rather than a Greek epsilon -- confirm.
chartab := array[] of { T->StringInt
	("AElig", 'Æ'), ("Aacute", 'Á'), ("Acirc", 'Â'), ("Agrave", 'À'),
	("Aring", 'Å'), ("Atilde", 'Ã'), ("Auml", 'Ä'),
	("Ccedil", 'Ç'),
	("ETH", 'Ð'), ("Eacute", 'É'), ("Ecirc", 'Ê'), ("Egrave", 'È'), ("Euml", 'Ë'),
	("Iacute", 'Í'), ("Icirc", 'Î'), ("Igrave", 'Ì'), ("Iuml", 'Ï'),
	("Ntilde", 'Ñ'),
	("Oacute", 'Ó'), ("Ocirc", 'Ô'), ("Ograve", 'Ò'), ("Oslash", 'Ø'),
	("Otilde", 'Õ'), ("Ouml", 'Ö'),
	("THORN", 'Þ'),
	("Uacute", 'Ú'), ("Ucirc", 'Û'), ("Ugrave", 'Ù'), ("Uuml", 'Ü'),
	("Yacute", 'Ý'),
	("aacute", 'á'), ("acirc", 'â'), ("acute", '´'), ("aelig", 'æ'), ("agrave", 'à'),
	("alpha", 'α'), ("amp", '&'), ("aring", 'å'), ("atilde", 'ã'), ("auml", 'ä'),
	("beta", 'β'), ("brvbar", '¦'),
	("ccedil", 'ç'), ("cdots", '⋯'), ("cedil", '¸'), ("cent", '¢'),
	("chi", 'χ'), ("copy", '©'), ("curren", '¤'),
	("ddots", '⋱'), ("deg", '°'), ("delta", 'δ'), ("divide", '÷'),
	("eacute", 'é'), ("ecirc", 'ê'), ("egrave", 'è'), ("emdash", '—'),
	("emsp", ' '),
	("endash", '–'),
	("ensp", ' '),
	("epsilon", 'ε'), ("eta", 'η'), ("eth", 'ð'), ("euml", 'ë'),
	("frac12", '½'), ("frac14", '¼'), ("frac34", '¾'),
	("gamma", 'γ'), ("gt", '>'),
	("iacute", 'í'), ("icirc", 'î'), ("iexcl", '¡'), ("igrave", 'ì'),
	("iota", 'ι'), ("iquest", '¿'), ("iuml", 'ï'),
	("kappa", 'κ'),
	("lambda", 'λ'), ("laquo", '«'), ("ldots", '…'), ("lt", '<'),
	("macr", '¯'), ("micro", 'µ'), ("middot", '·'), ("mu", 'μ'),
	("nbsp", ' '),
	("not", '¬'), ("ntilde", 'ñ'), ("nu", 'ν'),
	("oacute", 'ó'), ("ocirc", 'ô'), ("ograve", 'ò'), ("omega", 'ω'), ("omicron", 'ο'),
	("ordf", 'ª'), ("ordm", 'º'), ("oslash", 'ø'), ("otilde", 'õ'), ("ouml", 'ö'),
	("para", '¶'), ("phi", 'φ'), ("pi", 'π'), ("plusmn", '±'), ("pound", '£'), ("psi", 'ψ'),
	("quad", ' '),
	("quot", '"'),
	("raquo", '»'), ("reg", '®'), ("rho", 'ρ'),
	("sect", '§'),
	("shy", '­'),
	("sigma", 'σ'),
	("sp", ' '),
	("sup1", '¹'), ("sup2", '²'), ("sup3", '³'), ("szlig", 'ß'),
	("tau", 'τ'), ("theta", 'θ'),
	("thinsp", ' '),
	("thorn", 'þ'), ("times", '×'), ("trade", '™'),
	("uacute", 'ú'), ("ucirc", 'û'), ("ugrave", 'ù'), ("uml", '¨'),
	("upsilon", 'υ'), ("uuml", 'ü'),
	("varepsilon", '∈'), ("varphi", 'ϕ'), ("varpi", 'ϖ'), ("varrho", 'ϱ'),
	("vdots", '⋮'), ("vsigma", 'ς'), ("vtheta", 'ϑ'),
	("xi", 'ξ'),
	("yacute", 'ý'), ("yen", '¥'), ("yuml", 'ÿ'),
	("zeta", 'ζ'),
};
160
# Tag names (lower case) -> tag constants.
# gettag() looks names up here with T->lookup and lex2string() maps constants
# back with T->revlookup.  Names are listed in ascending order; presumably
# lookup depends on that, so keep it when adding tags (verify against strinttab).
# Rows are grouped by initial letter.
htmlstringtab := array[] of { T->StringInt
	("a", Ta), ("address", Taddress), ("applet", Tapplet), ("area", Tarea),
	("att_footer", Tatt_footer),
	("b", Tb), ("base", Tbase), ("basefont", Tbasefont), ("big", Tbig),
	("blink", Tblink), ("blockquote", Tblockquote), ("body", Tbody),
	("bq", Tbq), ("br", Tbr),
	("caption", Tcaption), ("center", Tcenter), ("cite", Tcite), ("code", Tcode),
	("col", Tcol), ("colgroup", Tcolgroup),
	("dd", Tdd), ("dfn", Tdfn), ("dir", Tdir), ("div", Tdiv), ("dl", Tdl), ("dt", Tdt),
	("em", Tem),
	("font", Tfont), ("form", Tform), ("frame", Tframe), ("frameset", Tframeset),
	("h1", Th1), ("h2", Th2), ("h3", Th3), ("h4", Th4), ("h5", Th5), ("h6", Th6),
	("head", Thead), ("hr", Thr), ("html", Thtml),
	("i", Ti), ("img", Timg), ("input", Tinput), ("isindex", Tisindex), ("item", Titem),
	("kbd", Tkbd),
	("li", Tli), ("link", Tlink),
	("map", Tmap), ("menu", Tmenu), ("meta", Tmeta),
	("nobr", Tnobr), ("noframes", Tnoframes),
	("ol", Tol), ("option", Toption),
	("p", Tp), ("param", Tparam), ("pre", Tpre),
	("q", Tq),
	("samp", Tsamp), ("script", Tscript), ("select", Tselect), ("small", Tsmall),
	("strike", Tstrike), ("strong", Tstrong), ("style", Tstyle),
	("sub", Tsub), ("sup", Tsup),
	("t", Tt), ("table", Ttable), ("tbody", Ttbody), ("td", Ttd),
	("textarea", Ttextarea), ("textflow", Ttextflow), ("tfoot", Ttfoot),
	("th", Tth), ("thead", Tthead), ("title", Ttitle), ("tr", Ttr), ("tt", Ttt),
	("u", Tu), ("ul", Tul),
	("var", Tvar)
};
246
# Character-class bits held in ctype, one bit each:
#	W	white space (space, newline, tab, carriage return)
#	D	decimal digit
#	L	lower-case ASCII letter
#	U	upper-case ASCII letter
#	N	'.' and '-', accepted inside tag, attribute and entity names
W, D, L, U, N: con byte (1<<iota);

# number of entries in ctype: one for every possible byte value
NCTYPE: con 256;

# ctype[c] is the class of byte value c; anything not listed has no class bits.
ctype := array[NCTYPE] of {
	'0'=>D, '1'=>D, '2'=>D, '3'=>D, '4'=>D, '5'=>D, '6'=>D, '7'=>D, '8'=>D, '9'=>D,

	'A'=>U, 'B'=>U, 'C'=>U, 'D'=>U, 'E'=>U, 'F'=>U, 'G'=>U,
	'H'=>U, 'I'=>U, 'J'=>U, 'K'=>U, 'L'=>U, 'M'=>U, 'N'=>U,
	'O'=>U, 'P'=>U, 'Q'=>U, 'R'=>U, 'S'=>U, 'T'=>U, 'U'=>U,
	'V'=>U, 'W'=>U, 'X'=>U, 'Y'=>U, 'Z'=>U,

	'a'=>L, 'b'=>L, 'c'=>L, 'd'=>L, 'e'=>L, 'f'=>L, 'g'=>L,
	'h'=>L, 'i'=>L, 'j'=>L, 'k'=>L, 'l'=>L, 'm'=>L, 'n'=>L,
	'o'=>L, 'p'=>L, 'q'=>L, 'r'=>L, 's'=>L, 't'=>L, 'u'=>L,
	'v'=>L, 'w'=>L, 'x'=>L, 'y'=>L, 'z'=>L,

	'.'=>N, '-'=>N,

	' '=>W, '\n'=>W, '\t'=>W, '\r'=>W,

	* => byte 0
};
267
# lex splits the HTML text in b into an array of Lex items.
#	b	the document bytes
#	charset	Latin1 or UTF8; passed on to gettag and getdata for decoding
#	keepwh	if zero, white space in front of each item is discarded;
#		also passed to getdata as its keepnls argument
# Tags are produced by gettag; everything else becomes a Data item whose text
# comes from getdata.  Comments (<!-- ... -->) and NUL bytes between items are
# dropped.  Returns nil if the StringIntTab module cannot be loaded.
lex(b: array of byte, charset: int, keepwh: int): array of ref Lex
{
	if(sys == nil)
		sys = load Sys Sys->PATH;
	if(T == nil)
		T = load StringIntTab StringIntTab->PATH;
	if(T == nil) {
		sys->print("HTML->lex: couldn't load %s\n", StringIntTab->PATH);
		return nil;
	}

	a: array of ref Lex;
	ai := 0;
	i := 0;
	nb := len b;
	for(;;){
   Whitespace:
		for(;;){
			# ignore nulls
			while(i<nb && (int b[i] == 0))
				i++;
			# skip white space
			if(!keepwh) {
				while(i<nb) {
					c := int b[i];
					# NOTE(review): the character literal below is copied unchanged;
					# if it is an ASCII space the test repeats the W check, so it
					# may have been meant as a non-breaking space -- confirm.
					if(!(int (ctype[c]&W)) && c != ' ')
						break;
					i++;
				}
			}
			# skip comments; i+3 < nb means all four bytes of "<!--" are present
			if(i+3 < nb && int b[i]=='<' && int b[i+1]=='!'
					&& int b[i+2]=='-' && int b[i+3]=='-') {
				i += 4;
				# i+2 < nb lets a "-->" that ends the buffer be recognised
				while(i+2 < nb){
					if(int b[i]=='-' && int b[i+1]=='-' && int b[i+2]=='>'){
						i += 3;
						continue Whitespace;
					}
					i++;
				}
				# unterminated comment: it swallows the rest of the input
				# instead of leaking its last bytes as data
				i = nb;
				continue Whitespace;
			}
			break;
		}
		if(i >= nb)
			break;
		# grow the result array 500 items at a time
		if(ai == len a){
			na := array[len a + 500] of ref Lex;
			if(a != nil)
				na[0:] = a;
			a = na;
		}
		if(int b[i] == '<'){
			lx : ref Lex;
			(lx, i) = gettag(b, i, charset);
			a[ai++] = lx;
		}
		else {
			s: string;
			(s, i) = getdata(b, i, keepwh, charset);
			a[ai++] = ref Lex (Data, s, nil);
		}
	}
	return a[0:ai];
}
334
# getdata collects character data starting at b[i] and stopping just before
# the next '<' (or at the end of b).
#	keepnls	if non-zero, line breaks are kept as '\n'; otherwise they become ' '
#	charset	UTF8 decodes with sys->byte2char; any other value (Latin1
#		included) takes one byte per character
# NUL and 16r1a are dropped, '&' sequences are translated by ampersand(),
# and a carriage return adjacent to a newline is dropped so that the pair
# yields a single line break.
# Returns the text and the index of the first unconsumed byte.
getdata(b: array of byte, i: int, keepnls, charset: int): (string, int)
{
	s:= "";
	j:= 0;
	c: int;
	nb := len b;

loop:
	while(i < nb){
		oldi := i;
		case charset{
		UTF8 =>
			n: int;
			(c, n, nil) = sys->byte2char(b, i);
			# byte2char can report that it consumed nothing (a sequence cut
			# short by the end of b); always advance so the loop terminates
			if(n <= 0)
				n = 1;
			i += n;
		* =>
			# Latin1, and any unrecognised charset value: one byte per character
			c = int b[i++];
		}
		case c {
		0 or 16r1a =>
			continue loop;
		'<' =>
			# leave the '<' for the caller
			i = oldi;
			break loop;
		'&' =>
			(c, i) = ampersand(b, i);
		'\n' =>
			if(!keepnls)
				c = ' ';
		'\r' =>
			# LF-CR or CR-LF: the LF alone supplies the line break
			if(oldi > 0 && int b[oldi-1] == '\n')
				continue loop;
			if(i < nb && int b[i] == '\n')
				continue loop;
			# a lone CR counts as a line break
			if(keepnls)
				c = '\n';
			else
				c = ' ';
		}
		s[j++] = c;
	}
	return (s, i);
}
376
# gettag parses the tag whose '<' is at b[i].
#	charset	UTF8 decodes attribute values with sys->byte2char; any other
#		value (Latin1 included) takes one byte per character
# Returns the Lex item and the index of the first unconsumed byte.
#   - a known tag name sets ans.tag (plus RBRA for a closing tag);
#   - an unknown name leaves the tag as Notfound with the lower-cased name in
#     ans.text;
#   - something that does not start with a letter (after an optional '/')
#     leaves the tag as Notfound with the raw text through '>' in ans.text.
# Attributes are consed onto the front of the list as they are read, so
# ans.attr holds them in reverse of their order in the source.
gettag(b: array of byte, i, charset: int): (ref Lex, int)
{
	rbra := 0;
	nb := len b;
	ans := ref Lex(Notfound, "", nil);
	al: list of Attr;
	if(++i == nb)
		return (ans, i);
	istart := i;
	c := int b[i];
	if(c == '/') {
		rbra = RBRA;
		if(++i == nb)
			return (ans, i);
		c = int b[i];
	}
	if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
		# not the start of a name: swallow everything through the next '>'
		while(i < nb) {
			c = int b[i++];
			if(c == '>')
				break;
		}
		ans.text = string b[istart:i];
		return (ans, i);
	}
	# gather the tag name
	namstart := i;
	while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
		if(++i == nb) {
			ans.text = string b[istart:i];
			return (ans, i);
		}
		c = int b[i];
	}
	name := lowercase(b, namstart, i);
	(fnd, tag) := T->lookup(htmlstringtab, name);
	if(fnd)
		ans.tag = tag+rbra;
	else
		ans.text = name;
attrloop:
	while(i < nb){
		# look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c == '>') {
			i++;
			break;
		}
		if(c == '<')
			break;	# error: unclosed tag
		if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
			# error, not the start of a name
			# skip to end of tag
			while(i < nb) {
				c = int b[i++];
				if(c == '>')
					break;
			}
			break attrloop;
		}
		# gather name
		namstart = i;
		while(c<NCTYPE && int (ctype[c]&(L|U|D|N))) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		name = lowercase(b, namstart, i);
		# skip whitespace
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		if(c != '=') {
			# no value for this attr
			al = (name, "") :: al;
			continue attrloop;
		}
		# skip whitespace
		if(++i == nb)
			break attrloop;
		c = int b[i];
		while(c<NCTYPE && int (ctype[c]&W)) {
			if(++i == nb)
				break attrloop;
			c = int b[i];
		}
		# gather value
		quote := 0;
		if(c == '\'' || c == '"') {
			quote = c;
			i++;
		}
		val := "";
		nv := 0;
	valloop:
		while(i < nb) {
			case charset{
			UTF8 =>
				n: int;
				(c, n, nil) = sys->byte2char(b, i);
				# byte2char can report that it consumed nothing (a sequence
				# cut short by the end of b); always advance so the loop ends
				if(n <= 0)
					n = 1;
				i += n;
			* =>
				# Latin1, and any unrecognised charset value: one byte per character
				c = int b[i++];
			}
			if(c == '>') {
				if(quote) {
					# c might be part of string (though not good style)
					# but if line ends before close quote, assume
					# there was an unmatched quote
					for(k := i; k < nb; k++) {
						c = int b[k];
						if(c == quote) {
							val[nv++] = '>';
							continue valloop;
						}
						if(c == '\n') {
							i--;
							break valloop;
						}
					}
				}
				# back up so the attribute loop sees the '>' and ends the tag
				i--;
				break valloop;
			}
			if(quote) {
				if(c == quote)
					break valloop;
				# newlines inside a quoted value are dropped,
				# tabs and carriage returns become spaces
				if(c == '\n')
					continue valloop;
				if(c == '\t' || c == '\r')
					c = ' ';
			}
			else {
				# an unquoted value ends at the first white space
				if(c<NCTYPE && int (ctype[c]&W))
					break valloop;
			}
			if(c == '&')
				(c, i) = ampersand(b, i);
			val[nv++] = c;
		}
		al = (name, val) :: al;
		if(i < nb)
			c = int b[i];
	}
	ans.attr = al;
	return (ans, i);
}
530
# ampersand translates a character reference; i indexes the byte just after
# the '&'.  Two forms are handled:
#	#ddd	decimal character code, accepted only for 0 < code < 256
#	name	a name from chartab
# Returns (character, index after the reference); one trailing ';' or newline
# is consumed if present.  If the reference is not recognised the result is
# ('&', i as passed in), so the text after the '&' is then read as ordinary
# characters.  If nothing follows the '&' the result is ('?', i).
ampersand(b: array of byte, i: int): (int, int)
{
	starti := i;
	c := 0;
	nb := len b;
	if(i >= nb)
		return ('?', i);
	fnd := 0;
	ans := 0;
	if(int b[i] == '#'){
		i++;
		while(i<nb){
			d := int b[i];
			if(!(int (ctype[d]&D)))
				break;
			# once the value is out of the accepted range, stop accumulating:
			# a long digit string must not overflow and wrap back into range
			if(c < 256)
				c = c*10 + d-'0';
			i++;
		}
		if(0<c && c<256) {
			if(c==160)
				c = ' ';   # non-breaking space
			ans = c;
			fnd = 1;
		}
	}
	else {
		s := "";
		k := 0;
		c = int b[i];
		# entity names start with a letter and continue with name characters
		if(int (ctype[c]&(L|U))) {
			while(i<nb) {
				c = int b[i];
				if(!(int (ctype[c]&(L|U|D|N))))
					break;
				s[k++] = c;
				i++;
			}
		}
		(fnd, ans) = T->lookup(chartab, s);
	}
	if(!fnd)
		return ('&', starti);
	if(i<nb && (int b[i]==';' || int b[i]=='\n'))
		i++;
	return (ans, i);
}
577
# lowercase returns bytes b[istart:iend] as a string, one character per byte,
# with the upper-case ASCII letters folded to lower case.
lowercase(b: array of byte, istart, iend: int): string
{
	out := "";
	for(k := istart; k < iend; k++) {
		ch := int b[k];
		if(ch < NCTYPE && int (ctype[ch]&U))
			ch += 'a' - 'A';
		# assigning at index len out appends one character
		out[len out] = ch;
	}
	return out;
}
592
# uppercase returns a copy of s with the lower-case ASCII letters folded to
# upper case; every other character is copied unchanged.
uppercase(s: string): string
{
	out := "";
	n := len s;
	for(k := 0; k < n; k++) {
		ch := s[k];
		# characters at or beyond NCTYPE have no ctype entry
		if(ch < NCTYPE && int (ctype[ch]&L))
			ch -= 'a' - 'A';
		out[k] = ch;
	}
	return out;
}
606
# attrvalue searches attr for an attribute called name.
# Returns (1, value) for the first match in list order, or (0, "") if none.
attrvalue(attr: list of Attr, name: string): (int, string)
{
	for(rest := attr; rest != nil; rest = tl rest) {
		if((hd rest).name == name)
			return (1, (hd rest).value);
	}
	return (0, "");
}
617
# globalattr finds the first item in html whose tag equals tag and returns
# attrvalue() of attr on that item.  Only that first item is consulted: if it
# lacks the attribute the result is (0, ""), even if a later item has it.
# Returns (0, "") when no item has the tag.
globalattr(html: array of ref Lex, tag: int, attr: string): (int, string)
{
	n := len html;
	for(k := 0; k < n; k++) {
		item := html[k];
		if(item.tag != tag)
			continue;
		return attrvalue(item.attr, attr);
	}
	return (0, "");
}
625
# isbreak scans h from index i onward.  It returns 1 if one of the listed
# opening tags (headings, BR, P, BODY, ADDRESS, BLOCKQUOTE, the list tags,
# PRE, HR, FORM) is reached before any Data item, and 0 if a Data item comes
# first or the array runs out.  All other items are skipped.
isbreak(h: array of ref Lex, i: int): int
{
	n := len h;
	while(i < n){
		case h[i].tag{
		Data =>
			return 0;
		Th1 or Th2 or Th3 or Th4 or Th5 or Th6 or
		Tbr or Tp or Tbody or Taddress or Tblockquote or
		Tul or Tdl or Tdir or Tmenu or Tol or Tpre or Thr or Tform =>
			return 1;
		}
		i++;
	}
	return 0;
}
640
# Debugging aid: renders one Lex item as text.
# A Data item is shown as its text in single quotes.  Anything else is shown
# as <NAME attr='value' ...>, with a '/' after the '<' when the tag value is
# at or above RBRA and the name in upper case; the name is omitted when
# T->revlookup returns nil for the tag.
lex2string(l: ref Lex): string
{
	tag := l.tag;
	if(tag == HTML->Data)
		return "'" + l.text + "'";

	s := "<";
	if(tag >= RBRA) {
		tag -= RBRA;
		s += "/";
	}
	tname := T->revlookup(htmlstringtab, tag);
	if(tname != nil)
		s += uppercase(tname);
	for(al := l.attr; al != nil; al = tl al) {
		a := hd al;
		s += " " + a.name + "='" + a.value + "'";
	}
	return s + ">";
}
665