xref: /inferno-os/appl/lib/xml.b (revision 46439007cf417cbd9ac8049bb4122c890097a0fa)
1implement Xml;
2
3#
4# Portions copyright © 2002 Vita Nuova Holdings Limited
5#
6#
7# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer
8#
9
10# TO DO:
11# - provide a way of getting attributes out of <?...?> (process) requests,
12# so that we can process stylesheet requests given in that way.
13
14include "sys.m";
15	sys: Sys;
16include "bufio.m";
17	bufio: Bufio;
18	Iobuf: import bufio;
19include "string.m";
20	str: String;
21include "hash.m";
22	hash: Hash;
23	HashTable: import hash;
24include "xml.m";
25
26Parcel: adt {
27	pick {
28	Start or
29	Empty =>
30		name: string;
31		attrs: Attributes;
32	End =>
33		name: string;
34	Text =>
35		ch: string;
36		ws1, ws2: int;
37	Process =>
38		target: string;
39		data: string;
40	Error =>
41		loc:	Locator;
42		msg:	string;
43	Doctype =>
44		name:	string;
45		public:	int;
46		params:	list of string;
47	Stylesheet =>
48		attrs: Attributes;
49	EOF =>
50	}
51};
52
53entinit := array[] of {
54	("AElig", "Æ"),
55	("OElig", "Œ"),
56	("aelig", "æ"),
57	("amp", "&"),
58	("apos", "\'"),
59	("copy", "©"),
60	("gt", ">"),
61	("ldquo", "``"),
62	("lt", "<"),
63	("mdash", "-"),		# XXX ??
64	("oelig", "œ"),
65	("quot", "\""),
66	("rdquo", "''"),
67	("rsquo", "'"),
68	("trade", "™"),
69	("nbsp", "\u00a0"),
70};
71entdict: ref HashTable;
72
73init(): string
74{
75	sys = load Sys Sys->PATH;
76	bufio = load Bufio Bufio->PATH;
77	if (bufio == nil)
78		return sys->sprint("cannot load %s: %r", Bufio->PATH);
79	str = load String String->PATH;
80	if (str == nil)
81		return sys->sprint("cannot load %s: %r", String->PATH);
82	hash = load Hash Hash->PATH;
83	if (hash == nil)
84		return sys->sprint("cannot load %s: %r", Hash->PATH);
85	entdict = hash->new(23);
86	for (i := 0; i < len entinit; i += 1) {
87		(key, value) := entinit[i];
88		entdict.insert(key, (0, 0.0, value));
89	}
90	return nil;
91}
92
93blankparser: Parser;
94
95open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
96{
97	x := ref blankparser;
98	x.in = bufio->open(srcfile, Bufio->OREAD);
99	if (x.in == nil)
100		return (nil, sys->sprint("cannot open %s: %r", srcfile));
101	# ignore utf16 intialisation character (yuck)
102	c := x.in.getc();
103	if (c != 16rfffe && c != 16rfeff)
104		x.in.ungetc();
105	x.estack = nil;
106	x.loc = Locator(1, srcfile, "");
107	x.warning = warning;
108	x.preelem = preelem;
109	return (x, "");
110}
111
112Parser.next(x: self ref Parser): ref Item
113{
114	curroffset := x.fileoffset;
115	currloc := x.loc;
116	# read up until end of current item
117	while (x.actdepth > x.readdepth) {
118		pick p := getparcel(x) {
119		Start =>
120			x.actdepth++;
121		End =>
122			x.actdepth--;
123		EOF =>
124			x.actdepth = 0;			# premature EOF closes all tags
125		Error =>
126			return ref Item.Error(curroffset, x.loc, x.errormsg);
127		}
128	}
129	if (x.actdepth < x.readdepth) {
130		x.fileoffset = int x.in.offset();
131		return nil;
132	}
133	gp := getparcel(x);
134	item: ref Item;
135	pick p := gp {
136	Start =>
137		x.actdepth++;
138		item = ref Item.Tag(curroffset, p.name, p.attrs);
139	End =>
140		x.actdepth--;
141		item = nil;
142	EOF =>
143		x.actdepth = 0;
144		item = nil;
145	Error =>
146		x.actdepth = 0;			# XXX is this the right thing to do?
147		item = ref Item.Error(curroffset, currloc, x.errormsg);
148	Text =>
149		item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2);
150	Process =>
151		item = ref Item.Process(curroffset, p.target, p.data);
152	Empty =>
153		item = ref Item.Tag(curroffset, p.name, p.attrs);
154	Doctype =>
155		item = ref Item.Doctype(curroffset, p.name, p.public, p.params);
156	Stylesheet =>
157		item = ref Item.Stylesheet(curroffset, p.attrs);
158	}
159	x.fileoffset = int x.in.offset();
160	return item;
161}
162
163Parser.atmark(x: self ref Parser, m: ref Mark): int
164{
165	return  int x.in.offset() == m.offset;
166}
167
168Parser.down(x: self ref Parser)
169{
170	x.readdepth++;
171}
172
173Parser.up(x: self ref Parser)
174{
175	x.readdepth--;
176}
177
178# mark is only defined after a next(), not after up() or down().
179# this means that we don't have to record lots of state when going up or down levels.
180Parser.mark(x: self ref Parser): ref Mark
181{
182	return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth);
183}
184
185Parser.goto(x: self ref Parser, m: ref Mark)
186{
187	x.in.seek(big m.offset, Sys->SEEKSTART);
188	x.fileoffset = m.offset;
189	x.eof = 0;
190	x.estack = m.estack;
191	x.loc.line = m.line;
192	x.readdepth = m.readdepth;
193	x.actdepth = len x.estack;
194}
195
196Mark.str(m: self ref Mark): string
197{
198	# assume that neither the filename nor any of the tags contain spaces.
199	# format:
200	# offset readdepth linenum [tag...]
201	# XXX would be nice if the produced string did not contain
202	# any spaces so it could be treated as a word in other contexts.
203	s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line);
204	for (t := m.estack; t != nil; t = tl t)
205		s += " " + hd t;
206	return s;
207}
208
209Parser.str2mark(p: self ref Parser, s: string): ref Mark
210{
211	(n, toks) := sys->tokenize(s, " ");
212	if (n < 3)
213		return nil;
214	m := ref Mark(nil, p.loc.line, 0, 0);
215	(m.offset, toks) = (int hd toks, tl toks);
216	(m.readdepth, toks) = (int hd toks, tl toks);
217	(m.line, toks) = (int hd toks, tl toks);
218	m.estack = toks;
219	return m;
220}
221
222getparcel(x: ref Parser): ref Parcel
223{
224	{
225		p: ref Parcel;
226		while (!x.eof && p == nil) {
227			c := getc(x);
228			if (c == '<')
229				p = element(x);
230			else {
231				ungetc(x);
232				p = characters(x);
233			}
234		}
235		if (p == nil)
236			p = ref Parcel.EOF;
237		return p;
238	}
239	exception e{
240		"sax:*" =>
241			return ref Parcel.Error(x.loc, x.errormsg);
242	}
243}
244
245parcelstr(gi: ref Parcel): string
246{
247	if (gi == nil)
248		return "nil";
249	pick i := gi {
250	Start =>
251		return sys->sprint("Start: %s", i.name);
252	Empty =>
253		return sys->sprint("Empty: %s", i.name);
254	End =>
255		return "End";
256	Text =>
257		return "Text";
258	Doctype =>
259		return sys->sprint("Doctype: %s", i.name);
260	Stylesheet =>
261		return "Stylesheet";
262	Error =>
263		return "Error: " + i.msg;
264	EOF =>
265		return "EOF";
266	* =>
267		return "Unknown";
268	}
269}
270
271element(x: ref Parser): ref Parcel
272{
273	# <tag ...>
274	elemname := xmlname(x);
275	c: int;
276	if (elemname != "") {
277		attrs := buildattrs(x);
278		skipwhite(x);
279		c = getc(x);
280		isend := 0;
281		if (c == '/')
282			isend = 1;
283		else
284			ungetc(x);
285		expect(x, '>');
286
287		if (isend)
288			return ref Parcel.Empty(elemname, attrs);
289		else {
290			startelement(x, elemname);
291			return ref Parcel.Start(elemname, attrs);
292		}
293	# </tag>
294	} else if ((c = getc(x)) == '/') {
295		elemname = xmlname(x);
296		if (elemname != "") {
297			expect(x, '>');
298			endelement(x, elemname);
299			return ref Parcel.End(elemname);
300		}
301		else
302			error(x, sys->sprint("illegal beginning of tag: '%c'", c));
303	# <?tag ... ?>
304	} else if (c == '?') {
305		elemname = xmlname(x);
306		if (elemname != "") {
307			# this special case could be generalised if there were many
308			# processing instructions that took attributes like this.
309			if (elemname == "xml-stylesheet") {
310				attrs := buildattrs(x);
311				balancedstring(x, "?>");
312				return ref Parcel.Stylesheet(attrs);
313			} else {
314				data := balancedstring(x, "?>");
315				return ref Parcel.Process(elemname, data);
316			}
317		}
318	} else if (c == '!') {
319		c = getc(x);
320		case c {
321		'-' =>
322			# <!-- comment -->
323			if(getc(x) == '-'){
324				balancedstring(x, "-->");
325				return nil;
326			}
327		'[' =>
328			# <![CDATA[...]]
329			s := xmlname(x);
330			if(s == "CDATA" && getc(x) == '['){
331				data := balancedstring(x, "]]>");
332				return ref Parcel.Text(data, 0, 0);
333			}
334		* =>
335			# <!declaration
336			ungetc(x);
337			s := xmlname(x);
338			case s {
339			"DOCTYPE" =>
340				# <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?>
341				skipwhite(x);
342				name := xmlname(x);
343				if(name == nil)
344					break;
345				id := "";
346				uri := "";
347				public := 0;
348				skipwhite(x);
349				case sort := xmlname(x) {
350				"SYSTEM" =>
351					id = xmlstring(x, 1);
352				"PUBLIC" =>
353					public = 1;
354					id = xmlstring(x, 1);
355					skipwhite(x);
356					c = getc(x);
357					ungetc(x);
358					if(c == '"' || c == '\'')
359						uri = xmlstring(x, 1);
360				* =>
361					error(x, sys->sprint("unknown DOCTYPE: %s", sort));
362					return nil;
363				}
364				skipwhite(x);
365				if(getc(x) == '['){
366					error(x, "cannot handle DOCTYPE with declarations");
367					return nil;
368				}
369				ungetc(x);
370				skipwhite(x);
371				if(getc(x) == '>')
372					return ref Parcel.Doctype(name, public, id :: uri :: nil);
373			"ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" =>
374				# don't interpret internal DTDs
375				# <!ENTITY name ("value" | SYSTEM "filename")>
376				s = gets(x, '>');
377				if(s == nil || s[len s-1] != '>')
378					error(x, "end of file in declaration");
379				return nil;
380			* =>
381				error(x, sys->sprint("unknown declaration: %s", s));
382			}
383		}
384		error(x, "invalid XML declaration");
385	} else
386		error(x, sys->sprint("illegal beginning of tag: %c", c));
387	return nil;
388}
389
390characters(x: ref Parser): ref Parcel
391{
392	p: ref Parcel;
393	content := gets(x, '<');
394	if (len content > 0) {
395		if (content[len content - 1] == '<') {
396			ungetc(x);
397			content = content[0:len content - 1];
398		}
399		ws1, ws2: int;
400		if (x.ispre) {
401			content = substituteentities(x, content);
402			ws1 = ws2 = 0;
403		} else
404			(content, ws1, ws2) = substituteentities_sp(x, content);
405		if (content != nil || ws1)
406			p = ref Parcel.Text(content, ws1, ws2);
407	}
408	return p;
409}
410
411startelement(x: ref Parser, name: string)
412{
413	x.estack = name :: x.estack;
414	if (name == x.preelem)
415		x.ispre++;
416}
417
418endelement(x: ref Parser, name: string)
419{
420	if (x.estack != nil && name == hd x.estack) {
421		x.estack = tl x.estack;
422		if (name == x.preelem)
423			x.ispre--;
424	} else {
425		starttag := "";
426		if (x.estack != nil)
427			starttag = hd x.estack;
428		warning(x, sys->sprint("<%s></%s> mismatch", starttag, name));
429
430		# invalid XML but try to recover anyway to reduce turnaround time on fixing errors.
431		# loop back up through the tag stack to see if there's a matching tag, in which case
432		# jump up in the stack to that, making some rude assumptions about the
433		# way Parcels are handled at the top level.
434		n := 0;
435		for (t := x.estack; t != nil; (t, n) = (tl t, n + 1))
436			if (hd t == name)
437				break;
438		if (t != nil) {
439			x.estack = tl t;
440			x.actdepth -= n;
441		}
442	}
443}
444
445buildattrs(x: ref Parser): Attributes
446{
447	attrs: list of Attribute;
448
449	attr: Attribute;
450	for (;;) {
451		skipwhite(x);
452		attr.name = xmlname(x);
453		if (attr.name == nil)
454			break;
455		skipwhite(x);
456		c := getc(x);
457		if(c != '='){
458			ungetc(x);
459			attr.value = nil;
460		}else
461			attr.value = xmlstring(x, 1);
462		attrs = attr :: attrs;
463	}
464	return Attributes(attrs);
465}
466
467xmlstring(x: ref Parser, dosub: int): string
468{
469	skipwhite(x);
470	s := "";
471	delim := getc(x);
472	if (delim == '\"' || delim == '\'') {
473		s = gets(x, delim);
474		n := len s;
475		if (n == 0 || s[n-1] != delim)
476			error(x, "unclosed string at end of file");
477		s = s[0:n-1];	# TO DO: avoid copy
478		if(dosub)
479			s = substituteentities(x, s);
480	} else
481		error(x, sys->sprint("illegal string delimiter: %c", delim));
482	return s;
483}
484
485xmlname(x: ref Parser): string
486{
487	name := "";
488	ch := getc(x);
489	case ch {
490	'_' or ':' or
491	'a' to 'z' or
492	'A' to 'Z' or
493	16r100 to 16rd7ff or
494	16re000 or 16rfffd =>
495		name[0] = ch;
496loop:
497		for (;;) {
498			case ch = getc(x) {
499			'_' or '-' or ':' or '.' or
500			'a' to 'z' or
501			'0' to '9' or
502			'A' to 'Z' or
503			16r100 to 16rd7ff or
504			16re000 to 16rfffd =>
505				name[len name] = ch;
506			* =>
507				break loop;
508			}
509		}
510	}
511	ungetc(x);
512	return name;
513}
514
515substituteentities(x: ref Parser, buff: string): string
516{
517	i := 0;
518	while (i < len buff) {
519		if (buff[i] == '&') {
520			(t, j) := translateentity(x, buff, i);
521			# XXX could be quicker
522			buff = buff[0:i] + t + buff[j:];
523			i += len t;
524		} else
525			i++;
526	}
527	return buff;
528}
529
530# subsitute entities, squashing whitespace along the way.
531substituteentities_sp(x: ref Parser, buf: string): (string, int, int)
532{
533	firstwhite := 0;
534	# skip initial white space
535	for (i := 0; i < len buf; i++) {
536		c := buf[i];
537		if (c != ' ' && c != '\t' && c != '\n' && c != '\r')
538			break;
539		firstwhite = 1;
540	}
541
542	lastwhite := 0;
543	s := "";
544	for (; i < len buf; i++) {
545		c := buf[i];
546		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
547			lastwhite = 1;
548		else {
549			if (lastwhite) {
550				s[len s] = ' ';
551				lastwhite = 0;
552			}
553			if (c == '&') {
554				# should &x20; count as whitespace?
555				(ent, j) := translateentity(x, buf, i);
556				i = j - 1;
557				s += ent;
558			} else
559				s[len s] = c;
560		}
561	}
562	return (s, firstwhite, lastwhite);
563}
564
565translateentity(x: ref Parser, s: string, i: int): (string, int)
566{
567	i++;
568	for (j := i; j < len s; j++)
569		if (s[j] == ';')
570			break;
571	ent := s[i:j];
572	if (j == len s) {
573		if (len ent > 10)
574			ent = ent[0:11] + "...";
575		warning(x, sys->sprint("missing ; at end of entity (&%s)", ent));
576		return (nil, i);
577	}
578	j++;
579	if (ent == nil) {
580		warning(x, "empty entity");
581		return ("", j);
582	}
583	if (ent[0] == '#') {
584		n: int;
585		rem := ent;
586		if (len ent >= 3 && ent[1] == 'x')
587			(n, rem) = str->toint(ent[2:], 16);
588		else if (len ent >= 2)
589			(n, rem) = str->toint(ent[1:], 10);
590		if (rem != nil) {
591			warning(x, sys->sprint("unrecognized entity (&%s)", ent));
592			return (nil, j);
593		}
594		ch: string = nil;
595		ch[0] = n;
596		return (ch, j);
597	}
598	hv := entdict.find(ent);
599	if (hv == nil) {
600		warning(x, sys->sprint("unrecognized entity (&%s)", ent));
601		return (nil, j);
602	}
603	return (hv.s, j);
604}
605
606balancedstring(x: ref Parser, eos: string): string
607{
608	s := "";
609	instring := 0;
610	quote: int;
611
612	for (i := 0; i < len eos; i++)
613		s[len s] = ' ';
614
615	skipwhite(x);
616	while ((c := getc(x)) != Bufio->EOF) {
617		s[len s] = c;
618		if (instring) {
619			if (c == quote)
620				instring = 0;
621		} else if (c == '\"' || c == '\'') {
622			quote = c;
623			instring = 1;
624		} else if (s[len s - len eos : len s] == eos)
625			return s[len eos : len s - len eos];
626	}
627	error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos));
628	return "";
629}
630
631skipwhite(x: ref Parser)
632{
633	while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r')
634		;
635	ungetc(x);
636}
637
638expectwhite(x: ref Parser)
639{
640	if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r')
641		error(x, "expecting white space");
642	skipwhite(x);
643}
644
645expect(x: ref Parser, ch: int)
646{
647	skipwhite(x);
648	c := getc(x);
649	if (c != ch)
650		error(x, sys->sprint("expecting %c", ch));
651}
652
653getc(x: ref Parser): int
654{
655	if (x.eof)
656		return Bufio->EOF;
657	ch := x.in.getc();
658	if (ch == Bufio->EOF)
659		x.eof = 1;
660	else if (ch == '\n')
661		x.loc.line++;
662	x.lastnl = ch == '\n';
663	return ch;
664}
665
666gets(x: ref Parser, delim: int): string
667{
668	if (x.eof)
669		return "";
670	s := x.in.gets(delim);
671	for (i := 0; i < len s; i++)
672		if (s[i] == '\n')
673			x.loc.line++;
674	if (s == "")
675		x.eof = 1;
676	else
677		x.lastnl = s[len s - 1] == '\n';
678	return s;
679}
680
681ungetc(x: ref Parser)
682{
683	if (x.eof)
684		return;
685	x.in.ungetc();
686	x.loc.line -= x.lastnl;
687}
688
689Attributes.all(al: self Attributes): list of Attribute
690{
691	return al.attrs;
692}
693
694Attributes.get(attrs: self Attributes, name: string): string
695{
696	for (a := attrs.attrs; a != nil; a = tl a)
697		if ((hd a).name == name)
698			return (hd a).value;
699	return nil;
700}
701
702warning(x: ref Parser, msg: string)
703{
704	if (x.warning != nil)
705		x.warning <-= (x.loc, msg);
706}
707
708error(x: ref Parser, msg: string)
709{
710	x.errormsg = msg;
711	raise "sax:error";
712}
713