1implement Xml; 2 3# 4# Portions copyright © 2002 Vita Nuova Holdings Limited 5# 6# 7# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer 8# 9 10# TO DO: 11# - provide a way of getting attributes out of <?...?> (process) requests, 12# so that we can process stylesheet requests given in that way. 13 14include "sys.m"; 15 sys: Sys; 16include "bufio.m"; 17 bufio: Bufio; 18 Iobuf: import bufio; 19include "string.m"; 20 str: String; 21include "hash.m"; 22 hash: Hash; 23 HashTable: import hash; 24include "xml.m"; 25 26Parcel: adt { 27 pick { 28 Start or 29 Empty => 30 name: string; 31 attrs: Attributes; 32 End => 33 name: string; 34 Text => 35 ch: string; 36 ws1, ws2: int; 37 Process => 38 target: string; 39 data: string; 40 Error => 41 loc: Locator; 42 msg: string; 43 Doctype => 44 name: string; 45 public: int; 46 params: list of string; 47 Stylesheet => 48 attrs: Attributes; 49 EOF => 50 } 51}; 52 53entinit := array[] of { 54 ("AElig", "Æ"), 55 ("OElig", "Œ"), 56 ("aelig", "æ"), 57 ("amp", "&"), 58 ("apos", "\'"), 59 ("copy", "©"), 60 ("gt", ">"), 61 ("ldquo", "``"), 62 ("lt", "<"), 63 ("mdash", "-"), # XXX ?? 64 ("oelig", "œ"), 65 ("quot", "\""), 66 ("rdquo", "''"), 67 ("rsquo", "'"), 68 ("trade", "™"), 69 ("nbsp", "\u00a0"), 70}; 71entdict: ref HashTable; 72 73init(): string 74{ 75 sys = load Sys Sys->PATH; 76 bufio = load Bufio Bufio->PATH; 77 if (bufio == nil) 78 return sys->sprint("cannot load %s: %r", Bufio->PATH); 79 str = load String String->PATH; 80 if (str == nil) 81 return sys->sprint("cannot load %s: %r", String->PATH); 82 hash = load Hash Hash->PATH; 83 if (hash == nil) 84 return sys->sprint("cannot load %s: %r", Hash->PATH); 85 entdict = hash->new(23); 86 for (i := 0; i < len entinit; i += 1) { 87 (key, value) := entinit[i]; 88 entdict.insert(key, (0, 0.0, value)); 89 } 90 return nil; 91} 92 93blankparser: Parser; 94 95open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string) 96{ 97 fd := bufio->open(srcfile, Bufio->OREAD); 98 if(fd == nil) 99 return (nil, sys->sprint("cannot open %s: %r", srcfile)); 100 return fopen(fd, srcfile, warning, preelem); 101} 102 103fopen(fd: ref Bufio->Iobuf, name: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string) 104{ 105 x := ref blankparser; 106 x.in = fd; 107 # ignore utf16 initialisation character (yuck) 108 c := x.in.getc(); 109 if (c != 16rfffe && c != 16rfeff) 110 x.in.ungetc(); 111 x.estack = nil; 112 x.loc = Locator(1, name, ""); 113 x.warning = warning; 114 x.preelem = preelem; 115 return (x, ""); 116} 117 118Parser.next(x: self ref Parser): ref Item 119{ 120 curroffset := x.fileoffset; 121 currloc := x.loc; 122 # read up until end of current item 123 while (x.actdepth > x.readdepth) { 124 pick p := getparcel(x) { 125 Start => 126 x.actdepth++; 127 End => 128 x.actdepth--; 129 EOF => 130 x.actdepth = 0; # premature EOF closes all tags 131 Error => 132 return ref Item.Error(curroffset, x.loc, x.errormsg); 133 } 134 } 135 if (x.actdepth < x.readdepth) { 136 x.fileoffset = int x.in.offset(); 137 return nil; 138 } 139 gp := getparcel(x); 140 item: ref Item; 141 pick p := gp { 142 Start => 143 x.actdepth++; 144 item = ref Item.Tag(curroffset, p.name, p.attrs); 145 End => 146 x.actdepth--; 147 item = nil; 148 EOF => 149 x.actdepth = 0; 150 item = nil; 151 Error => 152 x.actdepth = 0; # XXX is this the right thing to do? 153 item = ref Item.Error(curroffset, currloc, x.errormsg); 154 Text => 155 item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2); 156 Process => 157 item = ref Item.Process(curroffset, p.target, p.data); 158 Empty => 159 item = ref Item.Tag(curroffset, p.name, p.attrs); 160 Doctype => 161 item = ref Item.Doctype(curroffset, p.name, p.public, p.params); 162 Stylesheet => 163 item = ref Item.Stylesheet(curroffset, p.attrs); 164 } 165 x.fileoffset = int x.in.offset(); 166 return item; 167} 168 169Parser.atmark(x: self ref Parser, m: ref Mark): int 170{ 171 return int x.in.offset() == m.offset; 172} 173 174Parser.down(x: self ref Parser) 175{ 176 x.readdepth++; 177} 178 179Parser.up(x: self ref Parser) 180{ 181 x.readdepth--; 182} 183 184# mark is only defined after a next(), not after up() or down(). 185# this means that we don't have to record lots of state when going up or down levels. 186Parser.mark(x: self ref Parser): ref Mark 187{ 188 return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth); 189} 190 191Parser.goto(x: self ref Parser, m: ref Mark) 192{ 193 x.in.seek(big m.offset, Sys->SEEKSTART); 194 x.fileoffset = m.offset; 195 x.eof = 0; 196 x.estack = m.estack; 197 x.loc.line = m.line; 198 x.readdepth = m.readdepth; 199 x.actdepth = len x.estack; 200} 201 202Mark.str(m: self ref Mark): string 203{ 204 # assume that neither the filename nor any of the tags contain spaces. 205 # format: 206 # offset readdepth linenum [tag...] 207 # XXX would be nice if the produced string did not contain 208 # any spaces so it could be treated as a word in other contexts. 209 s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line); 210 for (t := m.estack; t != nil; t = tl t) 211 s += " " + hd t; 212 return s; 213} 214 215Parser.str2mark(p: self ref Parser, s: string): ref Mark 216{ 217 (n, toks) := sys->tokenize(s, " "); 218 if (n < 3) 219 return nil; 220 m := ref Mark(nil, p.loc.line, 0, 0); 221 (m.offset, toks) = (int hd toks, tl toks); 222 (m.readdepth, toks) = (int hd toks, tl toks); 223 (m.line, toks) = (int hd toks, tl toks); 224 m.estack = toks; 225 return m; 226} 227 228getparcel(x: ref Parser): ref Parcel 229{ 230 { 231 p: ref Parcel; 232 while (!x.eof && p == nil) { 233 c := getc(x); 234 if (c == '<') 235 p = element(x); 236 else { 237 ungetc(x); 238 p = characters(x); 239 } 240 } 241 if (p == nil) 242 p = ref Parcel.EOF; 243 return p; 244 }exception e{ 245 "sax:*" => 246 return ref Parcel.Error(x.loc, x.errormsg); 247 } 248} 249 250parcelstr(gi: ref Parcel): string 251{ 252 if (gi == nil) 253 return "nil"; 254 pick i := gi { 255 Start => 256 return sys->sprint("Start: %s", i.name); 257 Empty => 258 return sys->sprint("Empty: %s", i.name); 259 End => 260 return "End"; 261 Text => 262 return "Text"; 263 Doctype => 264 return sys->sprint("Doctype: %s", i.name); 265 Stylesheet => 266 return "Stylesheet"; 267 Error => 268 return "Error: " + i.msg; 269 EOF => 270 return "EOF"; 271 * => 272 return "Unknown"; 273 } 274} 275 276element(x: ref Parser): ref Parcel 277{ 278 # <tag ...> 279 elemname := xmlname(x); 280 c: int; 281 if (elemname != "") { 282 attrs := buildattrs(x); 283 skipwhite(x); 284 c = getc(x); 285 isend := 0; 286 if (c == '/') 287 isend = 1; 288 else 289 ungetc(x); 290 expect(x, '>'); 291 292 if (isend) 293 return ref Parcel.Empty(elemname, attrs); 294 else { 295 startelement(x, elemname); 296 return ref Parcel.Start(elemname, attrs); 297 } 298 # </tag> 299 } else if ((c = getc(x)) == '/') { 300 elemname = xmlname(x); 301 if (elemname != "") { 302 expect(x, '>'); 303 endelement(x, elemname); 304 return ref Parcel.End(elemname); 305 } 306 else 307 error(x, sys->sprint("illegal beginning of tag: '%c'", c)); 308 # <?tag ... ?> 309 } else if (c == '?') { 310 elemname = xmlname(x); 311 if (elemname != "") { 312 # this special case could be generalised if there were many 313 # processing instructions that took attributes like this. 314 if (elemname == "xml-stylesheet") { 315 attrs := buildattrs(x); 316 balancedstring(x, "?>"); 317 return ref Parcel.Stylesheet(attrs); 318 } else { 319 data := balancedstring(x, "?>"); 320 return ref Parcel.Process(elemname, data); 321 } 322 } 323 } else if (c == '!') { 324 c = getc(x); 325 case c { 326 '-' => 327 # <!-- comment --> 328 if(getc(x) == '-'){ 329 balancedstring(x, "-->"); 330 return nil; 331 } 332 '[' => 333 # <![CDATA[...]] 334 s := xmlname(x); 335 if(s == "CDATA" && getc(x) == '['){ 336 data := balancedstring(x, "]]>"); 337 return ref Parcel.Text(data, 0, 0); 338 } 339 * => 340 # <!declaration 341 ungetc(x); 342 s := xmlname(x); 343 case s { 344 "DOCTYPE" => 345 # <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?> 346 skipwhite(x); 347 name := xmlname(x); 348 if(name == nil) 349 break; 350 id := ""; 351 uri := ""; 352 public := 0; 353 skipwhite(x); 354 case sort := xmlname(x) { 355 "SYSTEM" => 356 id = xmlstring(x, 1); 357 "PUBLIC" => 358 public = 1; 359 id = xmlstring(x, 1); 360 skipwhite(x); 361 c = getc(x); 362 ungetc(x); 363 if(c == '"' || c == '\'') 364 uri = xmlstring(x, 1); 365 * => 366 error(x, sys->sprint("unknown DOCTYPE: %s", sort)); 367 return nil; 368 } 369 skipwhite(x); 370 if(getc(x) == '['){ 371 error(x, "cannot handle DOCTYPE with declarations"); 372 return nil; 373 } 374 ungetc(x); 375 skipwhite(x); 376 if(getc(x) == '>') 377 return ref Parcel.Doctype(name, public, id :: uri :: nil); 378 "ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" => 379 # don't interpret internal DTDs 380 # <!ENTITY name ("value" | SYSTEM "filename")> 381 s = gets(x, '>'); 382 if(s == nil || s[len s-1] != '>') 383 error(x, "end of file in declaration"); 384 return nil; 385 * => 386 error(x, sys->sprint("unknown declaration: %s", s)); 387 } 388 } 389 error(x, "invalid XML declaration"); 390 } else 391 error(x, sys->sprint("illegal beginning of tag: %c", c)); 392 return nil; 393} 394 395characters(x: ref Parser): ref Parcel 396{ 397 p: ref Parcel; 398 content := gets(x, '<'); 399 if (len content > 0) { 400 if (content[len content - 1] == '<') { 401 ungetc(x); 402 content = content[0:len content - 1]; 403 } 404 ws1, ws2: int; 405 if (x.ispre) { 406 content = substituteentities(x, content); 407 ws1 = ws2 = 0; 408 } else 409 (content, ws1, ws2) = substituteentities_sp(x, content); 410 if (content != nil || ws1) 411 p = ref Parcel.Text(content, ws1, ws2); 412 } 413 return p; 414} 415 416startelement(x: ref Parser, name: string) 417{ 418 x.estack = name :: x.estack; 419 if (name == x.preelem) 420 x.ispre++; 421} 422 423endelement(x: ref Parser, name: string) 424{ 425 if (x.estack != nil && name == hd x.estack) { 426 x.estack = tl x.estack; 427 if (name == x.preelem) 428 x.ispre--; 429 } else { 430 starttag := ""; 431 if (x.estack != nil) 432 starttag = hd x.estack; 433 warning(x, sys->sprint("<%s></%s> mismatch", starttag, name)); 434 435 # invalid XML but try to recover anyway to reduce turnaround time on fixing errors. 436 # loop back up through the tag stack to see if there's a matching tag, in which case 437 # jump up in the stack to that, making some rude assumptions about the 438 # way Parcels are handled at the top level. 439 n := 0; 440 for (t := x.estack; t != nil; (t, n) = (tl t, n + 1)) 441 if (hd t == name) 442 break; 443 if (t != nil) { 444 x.estack = tl t; 445 x.actdepth -= n; 446 } 447 } 448} 449 450buildattrs(x: ref Parser): Attributes 451{ 452 attrs: list of Attribute; 453 454 attr: Attribute; 455 for (;;) { 456 skipwhite(x); 457 attr.name = xmlname(x); 458 if (attr.name == nil) 459 break; 460 skipwhite(x); 461 c := getc(x); 462 if(c != '='){ 463 ungetc(x); 464 attr.value = nil; 465 }else 466 attr.value = xmlstring(x, 1); 467 attrs = attr :: attrs; 468 } 469 return Attributes(attrs); 470} 471 472xmlstring(x: ref Parser, dosub: int): string 473{ 474 skipwhite(x); 475 s := ""; 476 delim := getc(x); 477 if (delim == '\"' || delim == '\'') { 478 s = gets(x, delim); 479 n := len s; 480 if (n == 0 || s[n-1] != delim) 481 error(x, "unclosed string at end of file"); 482 s = s[0:n-1]; # TO DO: avoid copy 483 if(dosub) 484 s = substituteentities(x, s); 485 } else 486 error(x, sys->sprint("illegal string delimiter: %c", delim)); 487 return s; 488} 489 490xmlname(x: ref Parser): string 491{ 492 name := ""; 493 ch := getc(x); 494 case ch { 495 '_' or ':' or 496 'a' to 'z' or 497 'A' to 'Z' or 498 16r100 to 16rd7ff or 499 16re000 or 16rfffd => 500 name[0] = ch; 501loop: 502 for (;;) { 503 case ch = getc(x) { 504 '_' or '-' or ':' or '.' or 505 'a' to 'z' or 506 '0' to '9' or 507 'A' to 'Z' or 508 16r100 to 16rd7ff or 509 16re000 to 16rfffd => 510 name[len name] = ch; 511 * => 512 break loop; 513 } 514 } 515 } 516 ungetc(x); 517 return name; 518} 519 520substituteentities(x: ref Parser, buff: string): string 521{ 522 i := 0; 523 while (i < len buff) { 524 if (buff[i] == '&') { 525 (t, j) := translateentity(x, buff, i); 526 # XXX could be quicker 527 buff = buff[0:i] + t + buff[j:]; 528 i += len t; 529 } else 530 i++; 531 } 532 return buff; 533} 534 535# subsitute entities, squashing whitespace along the way. 536substituteentities_sp(x: ref Parser, buf: string): (string, int, int) 537{ 538 firstwhite := 0; 539 # skip initial white space 540 for (i := 0; i < len buf; i++) { 541 c := buf[i]; 542 if (c != ' ' && c != '\t' && c != '\n' && c != '\r') 543 break; 544 firstwhite = 1; 545 } 546 547 lastwhite := 0; 548 s := ""; 549 for (; i < len buf; i++) { 550 c := buf[i]; 551 if (c == ' ' || c == '\t' || c == '\n' || c == '\r') 552 lastwhite = 1; 553 else { 554 if (lastwhite) { 555 s[len s] = ' '; 556 lastwhite = 0; 557 } 558 if (c == '&') { 559 # should &x20; count as whitespace? 560 (ent, j) := translateentity(x, buf, i); 561 i = j - 1; 562 s += ent; 563 } else 564 s[len s] = c; 565 } 566 } 567 return (s, firstwhite, lastwhite); 568} 569 570translateentity(x: ref Parser, s: string, i: int): (string, int) 571{ 572 i++; 573 for (j := i; j < len s; j++) 574 if (s[j] == ';') 575 break; 576 ent := s[i:j]; 577 if (j == len s) { 578 if (len ent > 10) 579 ent = ent[0:11] + "..."; 580 warning(x, sys->sprint("missing ; at end of entity (&%s)", ent)); 581 return (nil, i); 582 } 583 j++; 584 if (ent == nil) { 585 warning(x, "empty entity"); 586 return ("", j); 587 } 588 if (ent[0] == '#') { 589 n: int; 590 rem := ent; 591 if (len ent >= 3 && ent[1] == 'x') 592 (n, rem) = str->toint(ent[2:], 16); 593 else if (len ent >= 2) 594 (n, rem) = str->toint(ent[1:], 10); 595 if (rem != nil) { 596 warning(x, sys->sprint("unrecognized entity (&%s)", ent)); 597 return (nil, j); 598 } 599 ch: string = nil; 600 ch[0] = n; 601 return (ch, j); 602 } 603 hv := entdict.find(ent); 604 if (hv == nil) { 605 warning(x, sys->sprint("unrecognized entity (&%s)", ent)); 606 return (nil, j); 607 } 608 return (hv.s, j); 609} 610 611balancedstring(x: ref Parser, eos: string): string 612{ 613 s := ""; 614 instring := 0; 615 quote: int; 616 617 for (i := 0; i < len eos; i++) 618 s[len s] = ' '; 619 620 skipwhite(x); 621 while ((c := getc(x)) != Bufio->EOF) { 622 s[len s] = c; 623 if (instring) { 624 if (c == quote) 625 instring = 0; 626 } else if (c == '\"' || c == '\'') { 627 quote = c; 628 instring = 1; 629 } else if (s[len s - len eos : len s] == eos) 630 return s[len eos : len s - len eos]; 631 } 632 error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos)); 633 return ""; 634} 635 636skipwhite(x: ref Parser) 637{ 638 while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r') 639 ; 640 ungetc(x); 641} 642 643expectwhite(x: ref Parser) 644{ 645 if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r') 646 error(x, "expecting white space"); 647 skipwhite(x); 648} 649 650expect(x: ref Parser, ch: int) 651{ 652 skipwhite(x); 653 c := getc(x); 654 if (c != ch) 655 error(x, sys->sprint("expecting %c", ch)); 656} 657 658getc(x: ref Parser): int 659{ 660 if (x.eof) 661 return Bufio->EOF; 662 ch := x.in.getc(); 663 if (ch == Bufio->EOF) 664 x.eof = 1; 665 else if (ch == '\n') 666 x.loc.line++; 667 x.lastnl = ch == '\n'; 668 return ch; 669} 670 671gets(x: ref Parser, delim: int): string 672{ 673 if (x.eof) 674 return ""; 675 s := x.in.gets(delim); 676 for (i := 0; i < len s; i++) 677 if (s[i] == '\n') 678 x.loc.line++; 679 if (s == "") 680 x.eof = 1; 681 else 682 x.lastnl = s[len s - 1] == '\n'; 683 return s; 684} 685 686ungetc(x: ref Parser) 687{ 688 if (x.eof) 689 return; 690 x.in.ungetc(); 691 x.loc.line -= x.lastnl; 692} 693 694Attributes.all(al: self Attributes): list of Attribute 695{ 696 return al.attrs; 697} 698 699Attributes.get(attrs: self Attributes, name: string): string 700{ 701 for (a := attrs.attrs; a != nil; a = tl a) 702 if ((hd a).name == name) 703 return (hd a).value; 704 return nil; 705} 706 707warning(x: ref Parser, msg: string) 708{ 709 if (x.warning != nil) 710 x.warning <-= (x.loc, msg); 711} 712 713error(x: ref Parser, msg: string) 714{ 715 x.errormsg = msg; 716 raise "sax:error"; 717} 718