1implement Xml; 2 3# 4# Portions copyright © 2002 Vita Nuova Holdings Limited 5# 6# 7# Derived from saxparser.b Copyright © 2001-2002 by John Powers or his employer 8# 9 10# TO DO: 11# - provide a way of getting attributes out of <?...?> (process) requests, 12# so that we can process stylesheet requests given in that way. 13 14include "sys.m"; 15 sys: Sys; 16include "bufio.m"; 17 bufio: Bufio; 18 Iobuf: import bufio; 19include "string.m"; 20 str: String; 21include "hash.m"; 22 hash: Hash; 23 HashTable: import hash; 24include "xml.m"; 25 26Parcel: adt { 27 pick { 28 Start or 29 Empty => 30 name: string; 31 attrs: Attributes; 32 End => 33 name: string; 34 Text => 35 ch: string; 36 ws1, ws2: int; 37 Process => 38 target: string; 39 data: string; 40 Error => 41 loc: Locator; 42 msg: string; 43 Doctype => 44 name: string; 45 public: int; 46 params: list of string; 47 Stylesheet => 48 attrs: Attributes; 49 EOF => 50 } 51}; 52 53entinit := array[] of { 54 ("AElig", "Æ"), 55 ("OElig", "Œ"), 56 ("aelig", "æ"), 57 ("amp", "&"), 58 ("apos", "\'"), 59 ("copy", "©"), 60 ("gt", ">"), 61 ("ldquo", "``"), 62 ("lt", "<"), 63 ("mdash", "-"), # XXX ?? 64 ("oelig", "œ"), 65 ("quot", "\""), 66 ("rdquo", "''"), 67 ("rsquo", "'"), 68 ("trade", "™"), 69 ("nbsp", "\u00a0"), 70}; 71entdict: ref HashTable; 72 73init(): string 74{ 75 sys = load Sys Sys->PATH; 76 bufio = load Bufio Bufio->PATH; 77 if (bufio == nil) 78 return sys->sprint("cannot load %s: %r", Bufio->PATH); 79 str = load String String->PATH; 80 if (str == nil) 81 return sys->sprint("cannot load %s: %r", String->PATH); 82 hash = load Hash Hash->PATH; 83 if (hash == nil) 84 return sys->sprint("cannot load %s: %r", Hash->PATH); 85 entdict = hash->new(23); 86 for (i := 0; i < len entinit; i += 1) { 87 (key, value) := entinit[i]; 88 entdict.insert(key, (0, 0.0, value)); 89 } 90 return nil; 91} 92 93blankparser: Parser; 94 95open(srcfile: string, warning: chan of (Locator, string), preelem: string): (ref Parser, string) 96{ 97 x := ref blankparser; 98 x.in = bufio->open(srcfile, Bufio->OREAD); 99 if (x.in == nil) 100 return (nil, sys->sprint("cannot open %s: %r", srcfile)); 101 # ignore utf16 intialisation character (yuck) 102 c := x.in.getc(); 103 if (c != 16rfffe && c != 16rfeff) 104 x.in.ungetc(); 105 x.estack = nil; 106 x.loc = Locator(1, srcfile, ""); 107 x.warning = warning; 108 x.preelem = preelem; 109 return (x, ""); 110} 111 112Parser.next(x: self ref Parser): ref Item 113{ 114 curroffset := x.fileoffset; 115 currloc := x.loc; 116 # read up until end of current item 117 while (x.actdepth > x.readdepth) { 118 pick p := getparcel(x) { 119 Start => 120 x.actdepth++; 121 End => 122 x.actdepth--; 123 EOF => 124 x.actdepth = 0; # premature EOF closes all tags 125 Error => 126 return ref Item.Error(curroffset, x.loc, x.errormsg); 127 } 128 } 129 if (x.actdepth < x.readdepth) { 130 x.fileoffset = int x.in.offset(); 131 return nil; 132 } 133 gp := getparcel(x); 134 item: ref Item; 135 pick p := gp { 136 Start => 137 x.actdepth++; 138 item = ref Item.Tag(curroffset, p.name, p.attrs); 139 End => 140 x.actdepth--; 141 item = nil; 142 EOF => 143 x.actdepth = 0; 144 item = nil; 145 Error => 146 x.actdepth = 0; # XXX is this the right thing to do? 147 item = ref Item.Error(curroffset, currloc, x.errormsg); 148 Text => 149 item = ref Item.Text(curroffset, p.ch, p.ws1, p.ws2); 150 Process => 151 item = ref Item.Process(curroffset, p.target, p.data); 152 Empty => 153 item = ref Item.Tag(curroffset, p.name, p.attrs); 154 Doctype => 155 item = ref Item.Doctype(curroffset, p.name, p.public, p.params); 156 Stylesheet => 157 item = ref Item.Stylesheet(curroffset, p.attrs); 158 } 159 x.fileoffset = int x.in.offset(); 160 return item; 161} 162 163Parser.atmark(x: self ref Parser, m: ref Mark): int 164{ 165 return int x.in.offset() == m.offset; 166} 167 168Parser.down(x: self ref Parser) 169{ 170 x.readdepth++; 171} 172 173Parser.up(x: self ref Parser) 174{ 175 x.readdepth--; 176} 177 178# mark is only defined after a next(), not after up() or down(). 179# this means that we don't have to record lots of state when going up or down levels. 180Parser.mark(x: self ref Parser): ref Mark 181{ 182 return ref Mark(x.estack, x.loc.line, int x.in.offset(), x.readdepth); 183} 184 185Parser.goto(x: self ref Parser, m: ref Mark) 186{ 187 x.in.seek(big m.offset, Sys->SEEKSTART); 188 x.fileoffset = m.offset; 189 x.eof = 0; 190 x.estack = m.estack; 191 x.loc.line = m.line; 192 x.readdepth = m.readdepth; 193 x.actdepth = len x.estack; 194} 195 196Mark.str(m: self ref Mark): string 197{ 198 # assume that neither the filename nor any of the tags contain spaces. 199 # format: 200 # offset readdepth linenum [tag...] 201 # XXX would be nice if the produced string did not contain 202 # any spaces so it could be treated as a word in other contexts. 203 s := sys->sprint("%d %d %d", m.offset, m.readdepth, m.line); 204 for (t := m.estack; t != nil; t = tl t) 205 s += " " + hd t; 206 return s; 207} 208 209Parser.str2mark(p: self ref Parser, s: string): ref Mark 210{ 211 (n, toks) := sys->tokenize(s, " "); 212 if (n < 3) 213 return nil; 214 m := ref Mark(nil, p.loc.line, 0, 0); 215 (m.offset, toks) = (int hd toks, tl toks); 216 (m.readdepth, toks) = (int hd toks, tl toks); 217 (m.line, toks) = (int hd toks, tl toks); 218 m.estack = toks; 219 return m; 220} 221 222getparcel(x: ref Parser): ref Parcel 223{ 224 { 225 p: ref Parcel; 226 while (!x.eof && p == nil) { 227 c := getc(x); 228 if (c == '<') 229 p = element(x); 230 else { 231 ungetc(x); 232 p = characters(x); 233 } 234 } 235 if (p == nil) 236 p = ref Parcel.EOF; 237 return p; 238 } 239 exception e{ 240 "sax:*" => 241 return ref Parcel.Error(x.loc, x.errormsg); 242 } 243} 244 245parcelstr(gi: ref Parcel): string 246{ 247 if (gi == nil) 248 return "nil"; 249 pick i := gi { 250 Start => 251 return sys->sprint("Start: %s", i.name); 252 Empty => 253 return sys->sprint("Empty: %s", i.name); 254 End => 255 return "End"; 256 Text => 257 return "Text"; 258 Doctype => 259 return sys->sprint("Doctype: %s", i.name); 260 Stylesheet => 261 return "Stylesheet"; 262 Error => 263 return "Error: " + i.msg; 264 EOF => 265 return "EOF"; 266 * => 267 return "Unknown"; 268 } 269} 270 271element(x: ref Parser): ref Parcel 272{ 273 # <tag ...> 274 elemname := xmlname(x); 275 c: int; 276 if (elemname != "") { 277 attrs := buildattrs(x); 278 skipwhite(x); 279 c = getc(x); 280 isend := 0; 281 if (c == '/') 282 isend = 1; 283 else 284 ungetc(x); 285 expect(x, '>'); 286 287 if (isend) 288 return ref Parcel.Empty(elemname, attrs); 289 else { 290 startelement(x, elemname); 291 return ref Parcel.Start(elemname, attrs); 292 } 293 # </tag> 294 } else if ((c = getc(x)) == '/') { 295 elemname = xmlname(x); 296 if (elemname != "") { 297 expect(x, '>'); 298 endelement(x, elemname); 299 return ref Parcel.End(elemname); 300 } 301 else 302 error(x, sys->sprint("illegal beginning of tag: '%c'", c)); 303 # <?tag ... ?> 304 } else if (c == '?') { 305 elemname = xmlname(x); 306 if (elemname != "") { 307 # this special case could be generalised if there were many 308 # processing instructions that took attributes like this. 309 if (elemname == "xml-stylesheet") { 310 attrs := buildattrs(x); 311 balancedstring(x, "?>"); 312 return ref Parcel.Stylesheet(attrs); 313 } else { 314 data := balancedstring(x, "?>"); 315 return ref Parcel.Process(elemname, data); 316 } 317 } 318 } else if (c == '!') { 319 c = getc(x); 320 case c { 321 '-' => 322 # <!-- comment --> 323 if(getc(x) == '-'){ 324 balancedstring(x, "-->"); 325 return nil; 326 } 327 '[' => 328 # <![CDATA[...]] 329 s := xmlname(x); 330 if(s == "CDATA" && getc(x) == '['){ 331 data := balancedstring(x, "]]>"); 332 return ref Parcel.Text(data, 0, 0); 333 } 334 * => 335 # <!declaration 336 ungetc(x); 337 s := xmlname(x); 338 case s { 339 "DOCTYPE" => 340 # <!DOCTYPE name (SYSTEM "filename" | PUBLIC "pubid" "uri"?)? ("[" decls "]")?> 341 skipwhite(x); 342 name := xmlname(x); 343 if(name == nil) 344 break; 345 id := ""; 346 uri := ""; 347 public := 0; 348 skipwhite(x); 349 case sort := xmlname(x) { 350 "SYSTEM" => 351 id = xmlstring(x, 1); 352 "PUBLIC" => 353 public = 1; 354 id = xmlstring(x, 1); 355 skipwhite(x); 356 c = getc(x); 357 ungetc(x); 358 if(c == '"' || c == '\'') 359 uri = xmlstring(x, 1); 360 * => 361 error(x, sys->sprint("unknown DOCTYPE: %s", sort)); 362 return nil; 363 } 364 skipwhite(x); 365 if(getc(x) == '['){ 366 error(x, "cannot handle DOCTYPE with declarations"); 367 return nil; 368 } 369 ungetc(x); 370 skipwhite(x); 371 if(getc(x) == '>') 372 return ref Parcel.Doctype(name, public, id :: uri :: nil); 373 "ELEMENT" or "ATTRLIST" or "NOTATION" or "ENTITY" => 374 # don't interpret internal DTDs 375 # <!ENTITY name ("value" | SYSTEM "filename")> 376 s = gets(x, '>'); 377 if(s == nil || s[len s-1] != '>') 378 error(x, "end of file in declaration"); 379 return nil; 380 * => 381 error(x, sys->sprint("unknown declaration: %s", s)); 382 } 383 } 384 error(x, "invalid XML declaration"); 385 } else 386 error(x, sys->sprint("illegal beginning of tag: %c", c)); 387 return nil; 388} 389 390characters(x: ref Parser): ref Parcel 391{ 392 p: ref Parcel; 393 content := gets(x, '<'); 394 if (len content > 0) { 395 if (content[len content - 1] == '<') { 396 ungetc(x); 397 content = content[0:len content - 1]; 398 } 399 ws1, ws2: int; 400 if (x.ispre) { 401 content = substituteentities(x, content); 402 ws1 = ws2 = 0; 403 } else 404 (content, ws1, ws2) = substituteentities_sp(x, content); 405 if (content != nil || ws1) 406 p = ref Parcel.Text(content, ws1, ws2); 407 } 408 return p; 409} 410 411startelement(x: ref Parser, name: string) 412{ 413 x.estack = name :: x.estack; 414 if (name == x.preelem) 415 x.ispre++; 416} 417 418endelement(x: ref Parser, name: string) 419{ 420 if (x.estack != nil && name == hd x.estack) { 421 x.estack = tl x.estack; 422 if (name == x.preelem) 423 x.ispre--; 424 } else { 425 starttag := ""; 426 if (x.estack != nil) 427 starttag = hd x.estack; 428 warning(x, sys->sprint("<%s></%s> mismatch", starttag, name)); 429 430 # invalid XML but try to recover anyway to reduce turnaround time on fixing errors. 431 # loop back up through the tag stack to see if there's a matching tag, in which case 432 # jump up in the stack to that, making some rude assumptions about the 433 # way Parcels are handled at the top level. 434 n := 0; 435 for (t := x.estack; t != nil; (t, n) = (tl t, n + 1)) 436 if (hd t == name) 437 break; 438 if (t != nil) { 439 x.estack = tl t; 440 x.actdepth -= n; 441 } 442 } 443} 444 445buildattrs(x: ref Parser): Attributes 446{ 447 attrs: list of Attribute; 448 449 attr: Attribute; 450 for (;;) { 451 skipwhite(x); 452 attr.name = xmlname(x); 453 if (attr.name == nil) 454 break; 455 skipwhite(x); 456 c := getc(x); 457 if(c != '='){ 458 ungetc(x); 459 attr.value = nil; 460 }else 461 attr.value = xmlstring(x, 1); 462 attrs = attr :: attrs; 463 } 464 return Attributes(attrs); 465} 466 467xmlstring(x: ref Parser, dosub: int): string 468{ 469 skipwhite(x); 470 s := ""; 471 delim := getc(x); 472 if (delim == '\"' || delim == '\'') { 473 s = gets(x, delim); 474 n := len s; 475 if (n == 0 || s[n-1] != delim) 476 error(x, "unclosed string at end of file"); 477 s = s[0:n-1]; # TO DO: avoid copy 478 if(dosub) 479 s = substituteentities(x, s); 480 } else 481 error(x, sys->sprint("illegal string delimiter: %c", delim)); 482 return s; 483} 484 485xmlname(x: ref Parser): string 486{ 487 name := ""; 488 ch := getc(x); 489 case ch { 490 '_' or ':' or 491 'a' to 'z' or 492 'A' to 'Z' or 493 16r100 to 16rd7ff or 494 16re000 or 16rfffd => 495 name[0] = ch; 496loop: 497 for (;;) { 498 case ch = getc(x) { 499 '_' or '-' or ':' or '.' or 500 'a' to 'z' or 501 '0' to '9' or 502 'A' to 'Z' or 503 16r100 to 16rd7ff or 504 16re000 to 16rfffd => 505 name[len name] = ch; 506 * => 507 break loop; 508 } 509 } 510 } 511 ungetc(x); 512 return name; 513} 514 515substituteentities(x: ref Parser, buff: string): string 516{ 517 i := 0; 518 while (i < len buff) { 519 if (buff[i] == '&') { 520 (t, j) := translateentity(x, buff, i); 521 # XXX could be quicker 522 buff = buff[0:i] + t + buff[j:]; 523 i += len t; 524 } else 525 i++; 526 } 527 return buff; 528} 529 530# subsitute entities, squashing whitespace along the way. 531substituteentities_sp(x: ref Parser, buf: string): (string, int, int) 532{ 533 firstwhite := 0; 534 # skip initial white space 535 for (i := 0; i < len buf; i++) { 536 c := buf[i]; 537 if (c != ' ' && c != '\t' && c != '\n' && c != '\r') 538 break; 539 firstwhite = 1; 540 } 541 542 lastwhite := 0; 543 s := ""; 544 for (; i < len buf; i++) { 545 c := buf[i]; 546 if (c == ' ' || c == '\t' || c == '\n' || c == '\r') 547 lastwhite = 1; 548 else { 549 if (lastwhite) { 550 s[len s] = ' '; 551 lastwhite = 0; 552 } 553 if (c == '&') { 554 # should &x20; count as whitespace? 555 (ent, j) := translateentity(x, buf, i); 556 i = j - 1; 557 s += ent; 558 } else 559 s[len s] = c; 560 } 561 } 562 return (s, firstwhite, lastwhite); 563} 564 565translateentity(x: ref Parser, s: string, i: int): (string, int) 566{ 567 i++; 568 for (j := i; j < len s; j++) 569 if (s[j] == ';') 570 break; 571 ent := s[i:j]; 572 if (j == len s) { 573 if (len ent > 10) 574 ent = ent[0:11] + "..."; 575 warning(x, sys->sprint("missing ; at end of entity (&%s)", ent)); 576 return (nil, i); 577 } 578 j++; 579 if (ent == nil) { 580 warning(x, "empty entity"); 581 return ("", j); 582 } 583 if (ent[0] == '#') { 584 n: int; 585 rem := ent; 586 if (len ent >= 3 && ent[1] == 'x') 587 (n, rem) = str->toint(ent[2:], 16); 588 else if (len ent >= 2) 589 (n, rem) = str->toint(ent[1:], 10); 590 if (rem != nil) { 591 warning(x, sys->sprint("unrecognized entity (&%s)", ent)); 592 return (nil, j); 593 } 594 ch: string = nil; 595 ch[0] = n; 596 return (ch, j); 597 } 598 hv := entdict.find(ent); 599 if (hv == nil) { 600 warning(x, sys->sprint("unrecognized entity (&%s)", ent)); 601 return (nil, j); 602 } 603 return (hv.s, j); 604} 605 606balancedstring(x: ref Parser, eos: string): string 607{ 608 s := ""; 609 instring := 0; 610 quote: int; 611 612 for (i := 0; i < len eos; i++) 613 s[len s] = ' '; 614 615 skipwhite(x); 616 while ((c := getc(x)) != Bufio->EOF) { 617 s[len s] = c; 618 if (instring) { 619 if (c == quote) 620 instring = 0; 621 } else if (c == '\"' || c == '\'') { 622 quote = c; 623 instring = 1; 624 } else if (s[len s - len eos : len s] == eos) 625 return s[len eos : len s - len eos]; 626 } 627 error(x, sys->sprint("unexpected end of file while looking for \"%s\"", eos)); 628 return ""; 629} 630 631skipwhite(x: ref Parser) 632{ 633 while ((c := getc(x)) == ' ' || c == '\t' || c == '\n' || c == '\r') 634 ; 635 ungetc(x); 636} 637 638expectwhite(x: ref Parser) 639{ 640 if ((c := getc(x)) != ' ' && c != '\t' && c != '\n' && c != '\r') 641 error(x, "expecting white space"); 642 skipwhite(x); 643} 644 645expect(x: ref Parser, ch: int) 646{ 647 skipwhite(x); 648 c := getc(x); 649 if (c != ch) 650 error(x, sys->sprint("expecting %c", ch)); 651} 652 653getc(x: ref Parser): int 654{ 655 if (x.eof) 656 return Bufio->EOF; 657 ch := x.in.getc(); 658 if (ch == Bufio->EOF) 659 x.eof = 1; 660 else if (ch == '\n') 661 x.loc.line++; 662 x.lastnl = ch == '\n'; 663 return ch; 664} 665 666gets(x: ref Parser, delim: int): string 667{ 668 if (x.eof) 669 return ""; 670 s := x.in.gets(delim); 671 for (i := 0; i < len s; i++) 672 if (s[i] == '\n') 673 x.loc.line++; 674 if (s == "") 675 x.eof = 1; 676 else 677 x.lastnl = s[len s - 1] == '\n'; 678 return s; 679} 680 681ungetc(x: ref Parser) 682{ 683 if (x.eof) 684 return; 685 x.in.ungetc(); 686 x.loc.line -= x.lastnl; 687} 688 689Attributes.all(al: self Attributes): list of Attribute 690{ 691 return al.attrs; 692} 693 694Attributes.get(attrs: self Attributes, name: string): string 695{ 696 for (a := attrs.attrs; a != nil; a = tl a) 697 if ((hd a).name == name) 698 return (hd a).value; 699 return nil; 700} 701 702warning(x: ref Parser, msg: string) 703{ 704 if (x.warning != nil) 705 x.warning <-= (x.loc, msg); 706} 707 708error(x: ref Parser, msg: string) 709{ 710 x.errormsg = msg; 711 raise "sax:error"; 712} 713