xref: /inferno-os/appl/lib/xml.b (revision 9dc22068e29604f4b484e746112a9a4efe6fd57f)
1implement Xml;
2
3#
4# Portions copyright © 2002 Vita Nuova Holdings Limited
5#
6#
7# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer
8#
9
10# TO DO:
11# - provide a way of getting attributes out of <?...?> (process) requests,
12# so that we can process stylesheet requests given in that way.
13
14include "sys.m";
15	sys: Sys;
16include "bufio.m";
17	bufio: Bufio;
18	Iobuf: import bufio;
19include "string.m";
20	str: String;
21include "hash.m";
22	hash: Hash;
23	HashTable: import hash;
24include "xml.m";
25
26Parcel: adt {
27	pick {
28	Start or
29	Empty =>
30		name: string;
31		attrs: Attributes;
32	End =>
33		name: string;
34	Text =>
35		ch: string;
36		ws1, ws2: int;
37	Process =>
38		target: string;
39		data: string;
40	Error =>
41		loc:	Locator;
42		msg:	string;
43	Doctype =>
44		name:	string;
45		public:	int;
46		params:	list of string;
47	Stylesheet =>
48		attrs: Attributes;
49	EOF =>
50	}
51};
52
53entinit := array[] of {
54	("AElig", "Æ"),
55	("OElig", "Œ"),
56	("aelig", "æ"),
57	("amp", "&"),
58	("apos", "\'"),
59	("copy", "©"),
60	("gt", ">"),
61	("ldquo", "``"),
62	("lt", "<"),
63	("mdash", "-"),		# XXX ??
64	("oelig", "œ"),
65	("quot", "\""),
66	("rdquo", "''"),
67	("rsquo", "'"),
68	("trade", "™"),
69	("nbsp", "\u00a0"),
70};
71entdict: ref HashTable;
72
73init(): string
74{
75	sys = load Sys Sys->PATH;
76	bufio = load Bufio Bufio->PATH;
77	if (bufio == nil)
78		return sys->sprint("cannot load %s: %r", Bufio->PATH);
79	str = load String String->PATH;
80	if (str == nil)
81		return sys->sprint("cannot load %s: %r", String->PATH);
82	hash = load Hash Hash->PATH;
83	if (hash == nil)
84		return sys->sprint("cannot load %s: %r", Hash->PATH);
85	entdict = hash->new(23);
86	for (i := 0; i < len entinit; i += 1) {
87		(key, value) := entinit[i];
88		entdict.insert(key, (0, 0.0, value));
89	}
90	return nil;
91}
92
93blankparser: Parser;
94
95open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
96{
97	fd := bufio->open(srcfile, Bufio->OREAD);
98	if(fd == nil)
99		return (nil, sys->sprint("cannot open %s: %r", srcfile));
100	return fopen(fd, srcfile, warning, preelem);
101}
102
103fopen(fd: ref Bufio->Iobuf, name: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string)
104{
105	x := ref blankparser;
106	x.in = fd;
107	# ignore utf16 initialisation character (yuck)
108	c := x.in.getc();
109	if (c != 16rfffe && c != 16rfeff)
110		x.in.ungetc();
111	x.estack = nil;
112	x.loc = Locator(1, name, "");
113	x.warning = warning;
114	x.preelem = preelem;
115	return (x, "");
116}
117
118Parser.next(x: self ref Parser): ref Item
119{
120	curroffset := x.fileoffset;
121	currloc := x.loc;
122	# read up until end of current item
123	while (x.actdepth > x.readdepth) {
124		pick p := getparcel(x) {
125		Start =>
126			x.actdepth++;
127		End =>
128			x.actdepth--;
129		EOF =>
130			x.actdepth = 0;			# premature EOF closes all tags
131		Error =>
132			return ref Item.Error(curroffset, x.loc, x.errormsg);
133		}
134	}
135	if (x.actdepth < x.readdepth) {
136		x.fileoffset = int x.in.offset();
137		return nil;
138	}
139	gp := getparcel(x);
140	item: ref Item;
141	pick p := gp {
142	Start =>
143		x.actdepth++;
144		item = ref Item.Tag(curroffset, p.name, p.attrs);
145	End =>
146		x.actdepth--;
147		item = nil;
148	EOF =>
149		x.actdepth = 0;
150		item = nil;
151	Error =>
152		x.actdepth = 0;			# XXX is this the right thing to do?
153		item = ref Item.Error(curroffset, currloc, x.errormsg);
154	Text =>
155		item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2);
156	Process =>
157		item = ref Item.Process(curroffset, p.target, p.data);
158	Empty =>
159		item = ref Item.Tag(curroffset, p.name, p.attrs);
160	Doctype =>
161		item = ref Item.Doctype(curroffset, p.name, p.public, p.params);
162	Stylesheet =>
163		item = ref Item.Stylesheet(curroffset, p.attrs);
164	}
165	x.fileoffset = int x.in.offset();
166	return item;
167}
168
169Parser.atmark(x: self ref Parser, m: ref Mark): int
170{
171	return  int x.in.offset() == m.offset;
172}
173
174Parser.down(x: self ref Parser)
175{
176	x.readdepth++;
177}
178
179Parser.up(x: self ref Parser)
180{
181	x.readdepth--;
182}
183
184# mark is only defined after a next(), not after up() or down().
185# this means that we don't have to record lots of state when going up or down levels.
186Parser.mark(x: self ref Parser): ref Mark
187{
188	return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth);
189}
190
191Parser.goto(x: self ref Parser, m: ref Mark)
192{
193	x.in.seek(big m.offset, Sys->SEEKSTART);
194	x.fileoffset = m.offset;
195	x.eof = 0;
196	x.estack = m.estack;
197	x.loc.line = m.line;
198	x.readdepth = m.readdepth;
199	x.actdepth = len x.estack;
200}
201
202Mark.str(m: self ref Mark): string
203{
204	# assume that neither the filename nor any of the tags contain spaces.
205	# format:
206	# offset readdepth linenum [tag...]
207	# XXX would be nice if the produced string did not contain
208	# any spaces so it could be treated as a word in other contexts.
209	s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line);
210	for (t := m.estack; t != nil; t = tl t)
211		s += " " + hd t;
212	return s;
213}
214
215Parser.str2mark(p: self ref Parser, s: string): ref Mark
216{
217	(n, toks) := sys->tokenize(s, " ");
218	if (n < 3)
219		return nil;
220	m := ref Mark(nil, p.loc.line, 0, 0);
221	(m.offset, toks) = (int hd toks, tl toks);
222	(m.readdepth, toks) = (int hd toks, tl toks);
223	(m.line, toks) = (int hd toks, tl toks);
224	m.estack = toks;
225	return m;
226}
227
228getparcel(x: ref Parser): ref Parcel
229{
230	{
231		p: ref Parcel;
232		while (!x.eof && p == nil) {
233			c := getc(x);
234			if (c == '<')
235				p = element(x);
236			else {
237				ungetc(x);
238				p = characters(x);
239			}
240		}
241		if (p == nil)
242			p = ref Parcel.EOF;
243		return p;
244	}exception e{
245	"sax:*" =>
246			return ref Parcel.Error(x.loc, x.errormsg);
247	}
248}
249
250parcelstr(gi: ref Parcel): string
251{
252	if (gi == nil)
253		return "nil";
254	pick i := gi {
255	Start =>
256		return sys->sprint("Start: %s", i.name);
257	Empty =>
258		return sys->sprint("Empty: %s", i.name);
259	End =>
260		return "End";
261	Text =>
262		return "Text";
263	Doctype =>
264		return sys->sprint("Doctype: %s", i.name);
265	Stylesheet =>
266		return "Stylesheet";
267	Error =>
268		return "Error: " + i.msg;
269	EOF =>
270		return "EOF";
271	* =>
272		return "Unknown";
273	}
274}
275
276element(x: ref Parser): ref Parcel
277{
278	# <tag ...>
279	elemname := xmlname(x);
280	c: int;
281	if (elemname != "") {
282		attrs := buildattrs(x);
283		skipwhite(x);
284		c = getc(x);
285		isend := 0;
286		if (c == '/')
287			isend = 1;
288		else
289			ungetc(x);
290		expect(x, '>');
291
292		if (isend)
293			return ref Parcel.Empty(elemname, attrs);
294		else {
295			startelement(x, elemname);
296			return ref Parcel.Start(elemname, attrs);
297		}
298	# </tag>
299	} else if ((c = getc(x)) == '/') {
300		elemname = xmlname(x);
301		if (elemname != "") {
302			expect(x, '>');
303			endelement(x, elemname);
304			return ref Parcel.End(elemname);
305		}
306		else
307			error(x, sys->sprint("illegal beginning of tag: '%c'", c));
308	# <?tag ... ?>
309	} else if (c == '?') {
310		elemname = xmlname(x);
311		if (elemname != "") {
312			# this special case could be generalised if there were many
313			# processing instructions that took attributes like this.
314			if (elemname == "xml-stylesheet") {
315				attrs := buildattrs(x);
316				balancedstring(x, "?>");
317				return ref Parcel.Stylesheet(attrs);
318			} else {
319				data := balancedstring(x, "?>");
320				return ref Parcel.Process(elemname, data);
321			}
322		}
323	} else if (c == '!') {
324		c = getc(x);
325		case c {
326		'-' =>
327			# <!-- comment -->
328			if(getc(x) == '-'){
329				balancedstring(x, "-->");
330				return nil;
331			}
332		'[' =>
333			# <![CDATA[...]]
334			s := xmlname(x);
335			if(s == "CDATA" && getc(x) == '['){
336				data := balancedstring(x, "]]>");
337				return ref Parcel.Text(data, 0, 0);
338			}
339		* =>
340			# <!declaration
341			ungetc(x);
342			s := xmlname(x);
343			case s {
344			"DOCTYPE" =>
345				# <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?>
346				skipwhite(x);
347				name := xmlname(x);
348				if(name == nil)
349					break;
350				id := "";
351				uri := "";
352				public := 0;
353				skipwhite(x);
354				case sort := xmlname(x) {
355				"SYSTEM" =>
356					id = xmlstring(x, 1);
357				"PUBLIC" =>
358					public = 1;
359					id = xmlstring(x, 1);
360					skipwhite(x);
361					c = getc(x);
362					ungetc(x);
363					if(c == '"' || c == '\'')
364						uri = xmlstring(x, 1);
365				* =>
366					error(x, sys->sprint("unknown DOCTYPE: %s", sort));
367					return nil;
368				}
369				skipwhite(x);
370				if(getc(x) == '['){
371					error(x, "cannot handle DOCTYPE with declarations");
372					return nil;
373				}
374				ungetc(x);
375				skipwhite(x);
376				if(getc(x) == '>')
377					return ref Parcel.Doctype(name, public, id :: uri :: nil);
378			"ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" =>
379				# don't interpret internal DTDs
380				# <!ENTITY name ("value" | SYSTEM "filename")>
381				s = gets(x, '>');
382				if(s == nil || s[len s-1] != '>')
383					error(x, "end of file in declaration");
384				return nil;
385			* =>
386				error(x, sys->sprint("unknown declaration: %s", s));
387			}
388		}
389		error(x, "invalid XML declaration");
390	} else
391		error(x, sys->sprint("illegal beginning of tag: %c", c));
392	return nil;
393}
394
395characters(x: ref Parser): ref Parcel
396{
397	p: ref Parcel;
398	content := gets(x, '<');
399	if (len content > 0) {
400		if (content[len content - 1] == '<') {
401			ungetc(x);
402			content = content[0:len content - 1];
403		}
404		ws1, ws2: int;
405		if (x.ispre) {
406			content = substituteentities(x, content);
407			ws1 = ws2 = 0;
408		} else
409			(content, ws1, ws2) = substituteentities_sp(x, content);
410		if (content != nil || ws1)
411			p = ref Parcel.Text(content, ws1, ws2);
412	}
413	return p;
414}
415
416startelement(x: ref Parser, name: string)
417{
418	x.estack = name :: x.estack;
419	if (name == x.preelem)
420		x.ispre++;
421}
422
423endelement(x: ref Parser, name: string)
424{
425	if (x.estack != nil && name == hd x.estack) {
426		x.estack = tl x.estack;
427		if (name == x.preelem)
428			x.ispre--;
429	} else {
430		starttag := "";
431		if (x.estack != nil)
432			starttag = hd x.estack;
433		warning(x, sys->sprint("<%s></%s> mismatch", starttag, name));
434
435		# invalid XML but try to recover anyway to reduce turnaround time on fixing errors.
436		# loop back up through the tag stack to see if there's a matching tag, in which case
437		# jump up in the stack to that, making some rude assumptions about the
438		# way Parcels are handled at the top level.
439		n := 0;
440		for (t := x.estack; t != nil; (t, n) = (tl t, n + 1))
441			if (hd t == name)
442				break;
443		if (t != nil) {
444			x.estack = tl t;
445			x.actdepth -= n;
446		}
447	}
448}
449
450buildattrs(x: ref Parser): Attributes
451{
452	attrs: list of Attribute;
453
454	attr: Attribute;
455	for (;;) {
456		skipwhite(x);
457		attr.name = xmlname(x);
458		if (attr.name == nil)
459			break;
460		skipwhite(x);
461		c := getc(x);
462		if(c != '='){
463			ungetc(x);
464			attr.value = nil;
465		}else
466			attr.value = xmlstring(x, 1);
467		attrs = attr :: attrs;
468	}
469	return Attributes(attrs);
470}
471
472xmlstring(x: ref Parser, dosub: int): string
473{
474	skipwhite(x);
475	s := "";
476	delim := getc(x);
477	if (delim == '\"' || delim == '\'') {
478		s = gets(x, delim);
479		n := len s;
480		if (n == 0 || s[n-1] != delim)
481			error(x, "unclosed string at end of file");
482		s = s[0:n-1];	# TO DO: avoid copy
483		if(dosub)
484			s = substituteentities(x, s);
485	} else
486		error(x, sys->sprint("illegal string delimiter: %c", delim));
487	return s;
488}
489
490xmlname(x: ref Parser): string
491{
492	name := "";
493	ch := getc(x);
494	case ch {
495	'_' or ':' or
496	'a' to 'z' or
497	'A' to 'Z' or
498	16r100 to 16rd7ff or
499	16re000 or 16rfffd =>
500		name[0] = ch;
501loop:
502		for (;;) {
503			case ch = getc(x) {
504			'_' or '-' or ':' or '.' or
505			'a' to 'z' or
506			'0' to '9' or
507			'A' to 'Z' or
508			16r100 to 16rd7ff or
509			16re000 to 16rfffd =>
510				name[len name] = ch;
511			* =>
512				break loop;
513			}
514		}
515	}
516	ungetc(x);
517	return name;
518}
519
520substituteentities(x: ref Parser, buff: string): string
521{
522	i := 0;
523	while (i < len buff) {
524		if (buff[i] == '&') {
525			(t, j) := translateentity(x, buff, i);
526			# XXX could be quicker
527			buff = buff[0:i] + t + buff[j:];
528			i += len t;
529		} else
530			i++;
531	}
532	return buff;
533}
534
535# subsitute entities, squashing whitespace along the way.
536substituteentities_sp(x: ref Parser, buf: string): (string, int, int)
537{
538	firstwhite := 0;
539	# skip initial white space
540	for (i := 0; i < len buf; i++) {
541		c := buf[i];
542		if (c != ' ' && c != '\t' && c != '\n' && c != '\r')
543			break;
544		firstwhite = 1;
545	}
546
547	lastwhite := 0;
548	s := "";
549	for (; i < len buf; i++) {
550		c := buf[i];
551		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
552			lastwhite = 1;
553		else {
554			if (lastwhite) {
555				s[len s] = ' ';
556				lastwhite = 0;
557			}
558			if (c == '&') {
559				# should &x20; count as whitespace?
560				(ent, j) := translateentity(x, buf, i);
561				i = j - 1;
562				s += ent;
563			} else
564				s[len s] = c;
565		}
566	}
567	return (s, firstwhite, lastwhite);
568}
569
570translateentity(x: ref Parser, s: string, i: int): (string, int)
571{
572	i++;
573	for (j := i; j < len s; j++)
574		if (s[j] == ';')
575			break;
576	ent := s[i:j];
577	if (j == len s) {
578		if (len ent > 10)
579			ent = ent[0:11] + "...";
580		warning(x, sys->sprint("missing ; at end of entity (&%s)", ent));
581		return (nil, i);
582	}
583	j++;
584	if (ent == nil) {
585		warning(x, "empty entity");
586		return ("", j);
587	}
588	if (ent[0] == '#') {
589		n: int;
590		rem := ent;
591		if (len ent >= 3 && ent[1] == 'x')
592			(n, rem) = str->toint(ent[2:], 16);
593		else if (len ent >= 2)
594			(n, rem) = str->toint(ent[1:], 10);
595		if (rem != nil) {
596			warning(x, sys->sprint("unrecognized entity (&%s)", ent));
597			return (nil, j);
598		}
599		ch: string = nil;
600		ch[0] = n;
601		return (ch, j);
602	}
603	hv := entdict.find(ent);
604	if (hv == nil) {
605		warning(x, sys->sprint("unrecognized entity (&%s)", ent));
606		return (nil, j);
607	}
608	return (hv.s, j);
609}
610
611balancedstring(x: ref Parser, eos: string): string
612{
613	s := "";
614	instring := 0;
615	quote: int;
616
617	for (i := 0; i < len eos; i++)
618		s[len s] = ' ';
619
620	skipwhite(x);
621	while ((c := getc(x)) != Bufio->EOF) {
622		s[len s] = c;
623		if (instring) {
624			if (c == quote)
625				instring = 0;
626		} else if (c == '\"' || c == '\'') {
627			quote = c;
628			instring = 1;
629		} else if (s[len s - len eos : len s] == eos)
630			return s[len eos : len s - len eos];
631	}
632	error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos));
633	return "";
634}
635
636skipwhite(x: ref Parser)
637{
638	while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r')
639		;
640	ungetc(x);
641}
642
643expectwhite(x: ref Parser)
644{
645	if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r')
646		error(x, "expecting white space");
647	skipwhite(x);
648}
649
650expect(x: ref Parser, ch: int)
651{
652	skipwhite(x);
653	c := getc(x);
654	if (c != ch)
655		error(x, sys->sprint("expecting %c", ch));
656}
657
658getc(x: ref Parser): int
659{
660	if (x.eof)
661		return Bufio->EOF;
662	ch := x.in.getc();
663	if (ch == Bufio->EOF)
664		x.eof = 1;
665	else if (ch == '\n')
666		x.loc.line++;
667	x.lastnl = ch == '\n';
668	return ch;
669}
670
671gets(x: ref Parser, delim: int): string
672{
673	if (x.eof)
674		return "";
675	s := x.in.gets(delim);
676	for (i := 0; i < len s; i++)
677		if (s[i] == '\n')
678			x.loc.line++;
679	if (s == "")
680		x.eof = 1;
681	else
682		x.lastnl = s[len s - 1] == '\n';
683	return s;
684}
685
686ungetc(x: ref Parser)
687{
688	if (x.eof)
689		return;
690	x.in.ungetc();
691	x.loc.line -= x.lastnl;
692}
693
694Attributes.all(al: self Attributes): list of Attribute
695{
696	return al.attrs;
697}
698
699Attributes.get(attrs: self Attributes, name: string): string
700{
701	for (a := attrs.attrs; a != nil; a = tl a)
702		if ((hd a).name == name)
703			return (hd a).value;
704	return nil;
705}
706
707warning(x: ref Parser, msg: string)
708{
709	if (x.warning != nil)
710		x.warning <-= (x.loc, msg);
711}
712
713error(x: ref Parser, msg: string)
714{
715	x.errormsg = msg;
716	raise "sax:error";
717}
718