1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2004-2009 Karl Waclawek <karl@waclawek.net>
13 Copyright (c) 2005-2007 Steven Solie <steven@solie.ca>
14 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
15 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
16 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
17 Copyright (c) 2020 Joe Orton <jorton@redhat.com>
18 Copyright (c) 2020 Kleber Tarcísio <klebertarcisio@yahoo.com.br>
19 Copyright (c) 2021 Tim Bray <tbray@textuality.com>
20 Licensed under the MIT license:
21
22 Permission is hereby granted, free of charge, to any person obtaining
23 a copy of this software and associated documentation files (the
24 "Software"), to deal in the Software without restriction, including
25 without limitation the rights to use, copy, modify, merge, publish,
26 distribute, sublicense, and/or sell copies of the Software, and to permit
27 persons to whom the Software is furnished to do so, subject to the
28 following conditions:
29
30 The above copyright notice and this permission notice shall be included
31 in all copies or substantial portions of the Software.
32
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
38 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39 USE OR OTHER DEALINGS IN THE SOFTWARE.
40 */
41
42 #include <expat_config.h>
43
44 #include <assert.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <stddef.h>
48 #include <string.h>
49 #include <math.h> /* for isnan */
50 #include <errno.h>
51
52 #include "expat.h"
53 #include "codepage.h"
54 #include "internal.h" /* for UNUSED_P only */
55 #include "xmlfile.h"
56 #include "xmltchar.h"
57
58 #ifdef _MSC_VER
59 # include <crtdbg.h>
60 #endif
61
62 #ifdef XML_UNICODE
63 # include <wchar.h>
64 #endif
65
/* Process exit statuses of xmlwf; the meanings match the "exit status"
   section of the help text printed by usage(). */
enum ExitCode {
  /* Input was well-formed and any requested output was written. */
  XMLWF_EXIT_SUCCESS = 0,
  /* Data structures could not be allocated. */
  XMLWF_EXIT_INTERNAL_ERROR = 1,
  /* One or more input files were not well-formed. */
  XMLWF_EXIT_NOT_WELLFORMED = 2,
  /* An output file could not be created. */
  XMLWF_EXIT_OUTPUT_ERROR = 3,
  /* The command line was invalid. */
  XMLWF_EXIT_USAGE_ERROR = 4
};
73
74 /* Structures for handler user data */
75 typedef struct NotationList {
76 struct NotationList *next;
77 const XML_Char *notationName;
78 const XML_Char *systemId;
79 const XML_Char *publicId;
80 } NotationList;
81
82 typedef struct xmlwfUserData {
83 FILE *fp;
84 NotationList *notationListHead;
85 const XML_Char *currentDoctypeName;
86 } XmlwfUserData;
87
/* Character separating the namespace URI from the local name in expanded
   names (the handlers below split names on it). Using the lowest non-NUL
   character value ensures proper sorting. */

#define NSSEP T('\001')
91
92 static void XMLCALL
characterData(void * userData,const XML_Char * s,int len)93 characterData(void *userData, const XML_Char *s, int len) {
94 FILE *fp = ((XmlwfUserData *)userData)->fp;
95 for (; len > 0; --len, ++s) {
96 switch (*s) {
97 case T('&'):
98 fputts(T("&"), fp);
99 break;
100 case T('<'):
101 fputts(T("<"), fp);
102 break;
103 case T('>'):
104 fputts(T(">"), fp);
105 break;
106 #ifdef W3C14N
107 case 13:
108 fputts(T("
"), fp);
109 break;
110 #else
111 case T('"'):
112 fputts(T("""), fp);
113 break;
114 case 9:
115 case 10:
116 case 13:
117 ftprintf(fp, T("&#%d;"), *s);
118 break;
119 #endif
120 default:
121 puttc(*s, fp);
122 break;
123 }
124 }
125 }
126
127 static void
attributeValue(FILE * fp,const XML_Char * s)128 attributeValue(FILE *fp, const XML_Char *s) {
129 puttc(T('='), fp);
130 puttc(T('"'), fp);
131 assert(s);
132 for (;;) {
133 switch (*s) {
134 case 0:
135 case NSSEP:
136 puttc(T('"'), fp);
137 return;
138 case T('&'):
139 fputts(T("&"), fp);
140 break;
141 case T('<'):
142 fputts(T("<"), fp);
143 break;
144 case T('"'):
145 fputts(T("""), fp);
146 break;
147 #ifdef W3C14N
148 case 9:
149 fputts(T("	"), fp);
150 break;
151 case 10:
152 fputts(T("
"), fp);
153 break;
154 case 13:
155 fputts(T("
"), fp);
156 break;
157 #else
158 case T('>'):
159 fputts(T(">"), fp);
160 break;
161 case 9:
162 case 10:
163 case 13:
164 ftprintf(fp, T("&#%d;"), *s);
165 break;
166 #endif
167 default:
168 puttc(*s, fp);
169 break;
170 }
171 s++;
172 }
173 }
174
175 /* Lexicographically comparing UTF-8 encoded attribute values,
176 is equivalent to lexicographically comparing based on the character number. */
177
178 static int
attcmp(const void * att1,const void * att2)179 attcmp(const void *att1, const void *att2) {
180 return tcscmp(*(const XML_Char **)att1, *(const XML_Char **)att2);
181 }
182
183 static void XMLCALL
startElement(void * userData,const XML_Char * name,const XML_Char ** atts)184 startElement(void *userData, const XML_Char *name, const XML_Char **atts) {
185 int nAtts;
186 const XML_Char **p;
187 FILE *fp = ((XmlwfUserData *)userData)->fp;
188 puttc(T('<'), fp);
189 fputts(name, fp);
190
191 p = atts;
192 while (*p)
193 ++p;
194 nAtts = (int)((p - atts) >> 1);
195 if (nAtts > 1)
196 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, attcmp);
197 while (*atts) {
198 puttc(T(' '), fp);
199 fputts(*atts++, fp);
200 attributeValue(fp, *atts);
201 atts++;
202 }
203 puttc(T('>'), fp);
204 }
205
206 static void XMLCALL
endElement(void * userData,const XML_Char * name)207 endElement(void *userData, const XML_Char *name) {
208 FILE *fp = ((XmlwfUserData *)userData)->fp;
209 puttc(T('<'), fp);
210 puttc(T('/'), fp);
211 fputts(name, fp);
212 puttc(T('>'), fp);
213 }
214
215 static int
nsattcmp(const void * p1,const void * p2)216 nsattcmp(const void *p1, const void *p2) {
217 const XML_Char *att1 = *(const XML_Char **)p1;
218 const XML_Char *att2 = *(const XML_Char **)p2;
219 int sep1 = (tcsrchr(att1, NSSEP) != 0);
220 int sep2 = (tcsrchr(att1, NSSEP) != 0);
221 if (sep1 != sep2)
222 return sep1 - sep2;
223 return tcscmp(att1, att2);
224 }
225
226 static void XMLCALL
startElementNS(void * userData,const XML_Char * name,const XML_Char ** atts)227 startElementNS(void *userData, const XML_Char *name, const XML_Char **atts) {
228 int nAtts;
229 int nsi;
230 const XML_Char **p;
231 FILE *fp = ((XmlwfUserData *)userData)->fp;
232 const XML_Char *sep;
233 puttc(T('<'), fp);
234
235 sep = tcsrchr(name, NSSEP);
236 if (sep) {
237 fputts(T("n1:"), fp);
238 fputts(sep + 1, fp);
239 fputts(T(" xmlns:n1"), fp);
240 attributeValue(fp, name);
241 nsi = 2;
242 } else {
243 fputts(name, fp);
244 nsi = 1;
245 }
246
247 p = atts;
248 while (*p)
249 ++p;
250 nAtts = (int)((p - atts) >> 1);
251 if (nAtts > 1)
252 qsort((void *)atts, nAtts, sizeof(XML_Char *) * 2, nsattcmp);
253 while (*atts) {
254 name = *atts++;
255 sep = tcsrchr(name, NSSEP);
256 puttc(T(' '), fp);
257 if (sep) {
258 ftprintf(fp, T("n%d:"), nsi);
259 fputts(sep + 1, fp);
260 } else
261 fputts(name, fp);
262 attributeValue(fp, *atts);
263 if (sep) {
264 ftprintf(fp, T(" xmlns:n%d"), nsi++);
265 attributeValue(fp, name);
266 }
267 atts++;
268 }
269 puttc(T('>'), fp);
270 }
271
272 static void XMLCALL
endElementNS(void * userData,const XML_Char * name)273 endElementNS(void *userData, const XML_Char *name) {
274 FILE *fp = ((XmlwfUserData *)userData)->fp;
275 const XML_Char *sep;
276 puttc(T('<'), fp);
277 puttc(T('/'), fp);
278 sep = tcsrchr(name, NSSEP);
279 if (sep) {
280 fputts(T("n1:"), fp);
281 fputts(sep + 1, fp);
282 } else
283 fputts(name, fp);
284 puttc(T('>'), fp);
285 }
286
287 #ifndef W3C14N
288
289 static void XMLCALL
processingInstruction(void * userData,const XML_Char * target,const XML_Char * data)290 processingInstruction(void *userData, const XML_Char *target,
291 const XML_Char *data) {
292 FILE *fp = ((XmlwfUserData *)userData)->fp;
293 puttc(T('<'), fp);
294 puttc(T('?'), fp);
295 fputts(target, fp);
296 puttc(T(' '), fp);
297 fputts(data, fp);
298 puttc(T('?'), fp);
299 puttc(T('>'), fp);
300 }
301
302 static XML_Char *
xcsdup(const XML_Char * s)303 xcsdup(const XML_Char *s) {
304 XML_Char *result;
305 int count = 0;
306 int numBytes;
307
308 /* Get the length of the string, including terminator */
309 while (s[count++] != 0) {
310 /* Do nothing */
311 }
312 numBytes = count * sizeof(XML_Char);
313 result = malloc(numBytes);
314 if (result == NULL)
315 return NULL;
316 memcpy(result, s, numBytes);
317 return result;
318 }
319
320 static void XMLCALL
startDoctypeDecl(void * userData,const XML_Char * doctypeName,const XML_Char * sysid,const XML_Char * publid,int has_internal_subset)321 startDoctypeDecl(void *userData, const XML_Char *doctypeName,
322 const XML_Char *sysid, const XML_Char *publid,
323 int has_internal_subset) {
324 XmlwfUserData *data = (XmlwfUserData *)userData;
325 UNUSED_P(sysid);
326 UNUSED_P(publid);
327 UNUSED_P(has_internal_subset);
328 data->currentDoctypeName = xcsdup(doctypeName);
329 }
330
331 static void
freeNotations(XmlwfUserData * data)332 freeNotations(XmlwfUserData *data) {
333 NotationList *notationListHead = data->notationListHead;
334
335 while (notationListHead != NULL) {
336 NotationList *next = notationListHead->next;
337 free((void *)notationListHead->notationName);
338 free((void *)notationListHead->systemId);
339 free((void *)notationListHead->publicId);
340 free(notationListHead);
341 notationListHead = next;
342 }
343 data->notationListHead = NULL;
344 }
345
346 static void
cleanupUserData(XmlwfUserData * userData)347 cleanupUserData(XmlwfUserData *userData) {
348 free((void *)userData->currentDoctypeName);
349 userData->currentDoctypeName = NULL;
350 freeNotations(userData);
351 }
352
353 static int
xcscmp(const XML_Char * xs,const XML_Char * xt)354 xcscmp(const XML_Char *xs, const XML_Char *xt) {
355 while (*xs != 0 && *xt != 0) {
356 if (*xs < *xt)
357 return -1;
358 if (*xs > *xt)
359 return 1;
360 xs++;
361 xt++;
362 }
363 if (*xs < *xt)
364 return -1;
365 if (*xs > *xt)
366 return 1;
367 return 0;
368 }
369
370 static int
notationCmp(const void * a,const void * b)371 notationCmp(const void *a, const void *b) {
372 const NotationList *const n1 = *(NotationList **)a;
373 const NotationList *const n2 = *(NotationList **)b;
374
375 return xcscmp(n1->notationName, n2->notationName);
376 }
377
378 static void XMLCALL
endDoctypeDecl(void * userData)379 endDoctypeDecl(void *userData) {
380 XmlwfUserData *data = (XmlwfUserData *)userData;
381 NotationList **notations;
382 int notationCount = 0;
383 NotationList *p;
384 int i;
385
386 /* How many notations do we have? */
387 for (p = data->notationListHead; p != NULL; p = p->next)
388 notationCount++;
389 if (notationCount == 0) {
390 /* Nothing to report */
391 free((void *)data->currentDoctypeName);
392 data->currentDoctypeName = NULL;
393 return;
394 }
395
396 notations = malloc(notationCount * sizeof(NotationList *));
397 if (notations == NULL) {
398 fprintf(stderr, "Unable to sort notations");
399 freeNotations(data);
400 return;
401 }
402
403 for (p = data->notationListHead, i = 0; i < notationCount; p = p->next, i++) {
404 notations[i] = p;
405 }
406 qsort(notations, notationCount, sizeof(NotationList *), notationCmp);
407
408 /* Output the DOCTYPE header */
409 fputts(T("<!DOCTYPE "), data->fp);
410 fputts(data->currentDoctypeName, data->fp);
411 fputts(T(" [\n"), data->fp);
412
413 /* Now the NOTATIONs */
414 for (i = 0; i < notationCount; i++) {
415 fputts(T("<!NOTATION "), data->fp);
416 fputts(notations[i]->notationName, data->fp);
417 if (notations[i]->publicId != NULL) {
418 fputts(T(" PUBLIC '"), data->fp);
419 fputts(notations[i]->publicId, data->fp);
420 puttc(T('\''), data->fp);
421 if (notations[i]->systemId != NULL) {
422 puttc(T(' '), data->fp);
423 puttc(T('\''), data->fp);
424 fputts(notations[i]->systemId, data->fp);
425 puttc(T('\''), data->fp);
426 }
427 } else if (notations[i]->systemId != NULL) {
428 fputts(T(" SYSTEM '"), data->fp);
429 fputts(notations[i]->systemId, data->fp);
430 puttc(T('\''), data->fp);
431 }
432 puttc(T('>'), data->fp);
433 puttc(T('\n'), data->fp);
434 }
435
436 /* Finally end the DOCTYPE */
437 fputts(T("]>\n"), data->fp);
438
439 free(notations);
440 freeNotations(data);
441 free((void *)data->currentDoctypeName);
442 data->currentDoctypeName = NULL;
443 }
444
445 static void XMLCALL
notationDecl(void * userData,const XML_Char * notationName,const XML_Char * base,const XML_Char * systemId,const XML_Char * publicId)446 notationDecl(void *userData, const XML_Char *notationName, const XML_Char *base,
447 const XML_Char *systemId, const XML_Char *publicId) {
448 XmlwfUserData *data = (XmlwfUserData *)userData;
449 NotationList *entry = malloc(sizeof(NotationList));
450 const char *errorMessage = "Unable to store NOTATION for output\n";
451
452 UNUSED_P(base);
453 if (entry == NULL) {
454 fputs(errorMessage, stderr);
455 return; /* Nothing we can really do about this */
456 }
457 entry->notationName = xcsdup(notationName);
458 if (entry->notationName == NULL) {
459 fputs(errorMessage, stderr);
460 free(entry);
461 return;
462 }
463 if (systemId != NULL) {
464 entry->systemId = xcsdup(systemId);
465 if (entry->systemId == NULL) {
466 fputs(errorMessage, stderr);
467 free((void *)entry->notationName);
468 free(entry);
469 return;
470 }
471 } else {
472 entry->systemId = NULL;
473 }
474 if (publicId != NULL) {
475 entry->publicId = xcsdup(publicId);
476 if (entry->publicId == NULL) {
477 fputs(errorMessage, stderr);
478 free((void *)entry->systemId); /* Safe if it's NULL */
479 free((void *)entry->notationName);
480 free(entry);
481 return;
482 }
483 } else {
484 entry->publicId = NULL;
485 }
486
487 entry->next = data->notationListHead;
488 data->notationListHead = entry;
489 }
490
491 #endif /* not W3C14N */
492
493 static void XMLCALL
defaultCharacterData(void * userData,const XML_Char * s,int len)494 defaultCharacterData(void *userData, const XML_Char *s, int len) {
495 UNUSED_P(s);
496 UNUSED_P(len);
497 XML_DefaultCurrent((XML_Parser)userData);
498 }
499
500 static void XMLCALL
defaultStartElement(void * userData,const XML_Char * name,const XML_Char ** atts)501 defaultStartElement(void *userData, const XML_Char *name,
502 const XML_Char **atts) {
503 UNUSED_P(name);
504 UNUSED_P(atts);
505 XML_DefaultCurrent((XML_Parser)userData);
506 }
507
508 static void XMLCALL
defaultEndElement(void * userData,const XML_Char * name)509 defaultEndElement(void *userData, const XML_Char *name) {
510 UNUSED_P(name);
511 XML_DefaultCurrent((XML_Parser)userData);
512 }
513
514 static void XMLCALL
defaultProcessingInstruction(void * userData,const XML_Char * target,const XML_Char * data)515 defaultProcessingInstruction(void *userData, const XML_Char *target,
516 const XML_Char *data) {
517 UNUSED_P(target);
518 UNUSED_P(data);
519 XML_DefaultCurrent((XML_Parser)userData);
520 }
521
522 static void XMLCALL
nopCharacterData(void * userData,const XML_Char * s,int len)523 nopCharacterData(void *userData, const XML_Char *s, int len) {
524 UNUSED_P(userData);
525 UNUSED_P(s);
526 UNUSED_P(len);
527 }
528
529 static void XMLCALL
nopStartElement(void * userData,const XML_Char * name,const XML_Char ** atts)530 nopStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
531 UNUSED_P(userData);
532 UNUSED_P(name);
533 UNUSED_P(atts);
534 }
535
536 static void XMLCALL
nopEndElement(void * userData,const XML_Char * name)537 nopEndElement(void *userData, const XML_Char *name) {
538 UNUSED_P(userData);
539 UNUSED_P(name);
540 }
541
542 static void XMLCALL
nopProcessingInstruction(void * userData,const XML_Char * target,const XML_Char * data)543 nopProcessingInstruction(void *userData, const XML_Char *target,
544 const XML_Char *data) {
545 UNUSED_P(userData);
546 UNUSED_P(target);
547 UNUSED_P(data);
548 }
549
550 static void XMLCALL
markup(void * userData,const XML_Char * s,int len)551 markup(void *userData, const XML_Char *s, int len) {
552 FILE *fp = ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp;
553 for (; len > 0; --len, ++s)
554 puttc(*s, fp);
555 }
556
557 static void
metaLocation(XML_Parser parser)558 metaLocation(XML_Parser parser) {
559 const XML_Char *uri = XML_GetBase(parser);
560 FILE *fp = ((XmlwfUserData *)XML_GetUserData(parser))->fp;
561 if (uri)
562 ftprintf(fp, T(" uri=\"%s\""), uri);
563 ftprintf(fp,
564 T(" byte=\"%") T(XML_FMT_INT_MOD) T("d\"") T(" nbytes=\"%d\"")
565 T(" line=\"%") T(XML_FMT_INT_MOD) T("u\"") T(" col=\"%")
566 T(XML_FMT_INT_MOD) T("u\""),
567 XML_GetCurrentByteIndex(parser), XML_GetCurrentByteCount(parser),
568 XML_GetCurrentLineNumber(parser),
569 XML_GetCurrentColumnNumber(parser));
570 }
571
572 static void
metaStartDocument(void * userData)573 metaStartDocument(void *userData) {
574 fputts(T("<document>\n"),
575 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
576 }
577
578 static void
metaEndDocument(void * userData)579 metaEndDocument(void *userData) {
580 fputts(T("</document>\n"),
581 ((XmlwfUserData *)XML_GetUserData((XML_Parser)userData))->fp);
582 }
583
584 static void XMLCALL
metaStartElement(void * userData,const XML_Char * name,const XML_Char ** atts)585 metaStartElement(void *userData, const XML_Char *name, const XML_Char **atts) {
586 XML_Parser parser = (XML_Parser)userData;
587 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
588 FILE *fp = data->fp;
589 const XML_Char **specifiedAttsEnd
590 = atts + XML_GetSpecifiedAttributeCount(parser);
591 const XML_Char **idAttPtr;
592 int idAttIndex = XML_GetIdAttributeIndex(parser);
593 if (idAttIndex < 0)
594 idAttPtr = 0;
595 else
596 idAttPtr = atts + idAttIndex;
597
598 ftprintf(fp, T("<starttag name=\"%s\""), name);
599 metaLocation(parser);
600 if (*atts) {
601 fputts(T(">\n"), fp);
602 do {
603 ftprintf(fp, T("<attribute name=\"%s\" value=\""), atts[0]);
604 characterData(data, atts[1], (int)tcslen(atts[1]));
605 if (atts >= specifiedAttsEnd)
606 fputts(T("\" defaulted=\"yes\"/>\n"), fp);
607 else if (atts == idAttPtr)
608 fputts(T("\" id=\"yes\"/>\n"), fp);
609 else
610 fputts(T("\"/>\n"), fp);
611 } while (*(atts += 2));
612 fputts(T("</starttag>\n"), fp);
613 } else
614 fputts(T("/>\n"), fp);
615 }
616
617 static void XMLCALL
metaEndElement(void * userData,const XML_Char * name)618 metaEndElement(void *userData, const XML_Char *name) {
619 XML_Parser parser = (XML_Parser)userData;
620 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
621 FILE *fp = data->fp;
622 ftprintf(fp, T("<endtag name=\"%s\""), name);
623 metaLocation(parser);
624 fputts(T("/>\n"), fp);
625 }
626
627 static void XMLCALL
metaProcessingInstruction(void * userData,const XML_Char * target,const XML_Char * data)628 metaProcessingInstruction(void *userData, const XML_Char *target,
629 const XML_Char *data) {
630 XML_Parser parser = (XML_Parser)userData;
631 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
632 FILE *fp = usrData->fp;
633 ftprintf(fp, T("<pi target=\"%s\" data=\""), target);
634 characterData(usrData, data, (int)tcslen(data));
635 puttc(T('"'), fp);
636 metaLocation(parser);
637 fputts(T("/>\n"), fp);
638 }
639
640 static void XMLCALL
metaComment(void * userData,const XML_Char * data)641 metaComment(void *userData, const XML_Char *data) {
642 XML_Parser parser = (XML_Parser)userData;
643 XmlwfUserData *usrData = (XmlwfUserData *)XML_GetUserData(parser);
644 FILE *fp = usrData->fp;
645 fputts(T("<comment data=\""), fp);
646 characterData(usrData, data, (int)tcslen(data));
647 puttc(T('"'), fp);
648 metaLocation(parser);
649 fputts(T("/>\n"), fp);
650 }
651
652 static void XMLCALL
metaStartCdataSection(void * userData)653 metaStartCdataSection(void *userData) {
654 XML_Parser parser = (XML_Parser)userData;
655 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
656 FILE *fp = data->fp;
657 fputts(T("<startcdata"), fp);
658 metaLocation(parser);
659 fputts(T("/>\n"), fp);
660 }
661
662 static void XMLCALL
metaEndCdataSection(void * userData)663 metaEndCdataSection(void *userData) {
664 XML_Parser parser = (XML_Parser)userData;
665 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
666 FILE *fp = data->fp;
667 fputts(T("<endcdata"), fp);
668 metaLocation(parser);
669 fputts(T("/>\n"), fp);
670 }
671
672 static void XMLCALL
metaCharacterData(void * userData,const XML_Char * s,int len)673 metaCharacterData(void *userData, const XML_Char *s, int len) {
674 XML_Parser parser = (XML_Parser)userData;
675 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
676 FILE *fp = data->fp;
677 fputts(T("<chars str=\""), fp);
678 characterData(data, s, len);
679 puttc(T('"'), fp);
680 metaLocation(parser);
681 fputts(T("/>\n"), fp);
682 }
683
684 static void XMLCALL
metaStartDoctypeDecl(void * userData,const XML_Char * doctypeName,const XML_Char * sysid,const XML_Char * pubid,int has_internal_subset)685 metaStartDoctypeDecl(void *userData, const XML_Char *doctypeName,
686 const XML_Char *sysid, const XML_Char *pubid,
687 int has_internal_subset) {
688 XML_Parser parser = (XML_Parser)userData;
689 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
690 FILE *fp = data->fp;
691 UNUSED_P(sysid);
692 UNUSED_P(pubid);
693 UNUSED_P(has_internal_subset);
694 ftprintf(fp, T("<startdoctype name=\"%s\""), doctypeName);
695 metaLocation(parser);
696 fputts(T("/>\n"), fp);
697 }
698
699 static void XMLCALL
metaEndDoctypeDecl(void * userData)700 metaEndDoctypeDecl(void *userData) {
701 XML_Parser parser = (XML_Parser)userData;
702 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
703 FILE *fp = data->fp;
704 fputts(T("<enddoctype"), fp);
705 metaLocation(parser);
706 fputts(T("/>\n"), fp);
707 }
708
709 static void XMLCALL
metaNotationDecl(void * userData,const XML_Char * notationName,const XML_Char * base,const XML_Char * systemId,const XML_Char * publicId)710 metaNotationDecl(void *userData, const XML_Char *notationName,
711 const XML_Char *base, const XML_Char *systemId,
712 const XML_Char *publicId) {
713 XML_Parser parser = (XML_Parser)userData;
714 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
715 FILE *fp = data->fp;
716 UNUSED_P(base);
717 ftprintf(fp, T("<notation name=\"%s\""), notationName);
718 if (publicId)
719 ftprintf(fp, T(" public=\"%s\""), publicId);
720 if (systemId) {
721 fputts(T(" system=\""), fp);
722 characterData(data, systemId, (int)tcslen(systemId));
723 puttc(T('"'), fp);
724 }
725 metaLocation(parser);
726 fputts(T("/>\n"), fp);
727 }
728
729 static void XMLCALL
metaEntityDecl(void * userData,const XML_Char * entityName,int is_param,const XML_Char * value,int value_length,const XML_Char * base,const XML_Char * systemId,const XML_Char * publicId,const XML_Char * notationName)730 metaEntityDecl(void *userData, const XML_Char *entityName, int is_param,
731 const XML_Char *value, int value_length, const XML_Char *base,
732 const XML_Char *systemId, const XML_Char *publicId,
733 const XML_Char *notationName) {
734 XML_Parser parser = (XML_Parser)userData;
735 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
736 FILE *fp = data->fp;
737
738 UNUSED_P(is_param);
739 UNUSED_P(base);
740 if (value) {
741 ftprintf(fp, T("<entity name=\"%s\""), entityName);
742 metaLocation(parser);
743 puttc(T('>'), fp);
744 characterData(data, value, value_length);
745 fputts(T("</entity/>\n"), fp);
746 } else if (notationName) {
747 ftprintf(fp, T("<entity name=\"%s\""), entityName);
748 if (publicId)
749 ftprintf(fp, T(" public=\"%s\""), publicId);
750 fputts(T(" system=\""), fp);
751 characterData(data, systemId, (int)tcslen(systemId));
752 puttc(T('"'), fp);
753 ftprintf(fp, T(" notation=\"%s\""), notationName);
754 metaLocation(parser);
755 fputts(T("/>\n"), fp);
756 } else {
757 ftprintf(fp, T("<entity name=\"%s\""), entityName);
758 if (publicId)
759 ftprintf(fp, T(" public=\"%s\""), publicId);
760 fputts(T(" system=\""), fp);
761 characterData(data, systemId, (int)tcslen(systemId));
762 puttc(T('"'), fp);
763 metaLocation(parser);
764 fputts(T("/>\n"), fp);
765 }
766 }
767
768 static void XMLCALL
metaStartNamespaceDecl(void * userData,const XML_Char * prefix,const XML_Char * uri)769 metaStartNamespaceDecl(void *userData, const XML_Char *prefix,
770 const XML_Char *uri) {
771 XML_Parser parser = (XML_Parser)userData;
772 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
773 FILE *fp = data->fp;
774 fputts(T("<startns"), fp);
775 if (prefix)
776 ftprintf(fp, T(" prefix=\"%s\""), prefix);
777 if (uri) {
778 fputts(T(" ns=\""), fp);
779 characterData(data, uri, (int)tcslen(uri));
780 fputts(T("\"/>\n"), fp);
781 } else
782 fputts(T("/>\n"), fp);
783 }
784
785 static void XMLCALL
metaEndNamespaceDecl(void * userData,const XML_Char * prefix)786 metaEndNamespaceDecl(void *userData, const XML_Char *prefix) {
787 XML_Parser parser = (XML_Parser)userData;
788 XmlwfUserData *data = (XmlwfUserData *)XML_GetUserData(parser);
789 FILE *fp = data->fp;
790 if (! prefix)
791 fputts(T("<endns/>\n"), fp);
792 else
793 ftprintf(fp, T("<endns prefix=\"%s\"/>\n"), prefix);
794 }
795
796 static int XMLCALL
unknownEncodingConvert(void * data,const char * p)797 unknownEncodingConvert(void *data, const char *p) {
798 return codepageConvert(*(int *)data, p);
799 }
800
801 static int XMLCALL
unknownEncoding(void * userData,const XML_Char * name,XML_Encoding * info)802 unknownEncoding(void *userData, const XML_Char *name, XML_Encoding *info) {
803 int cp;
804 static const XML_Char prefixL[] = T("windows-");
805 static const XML_Char prefixU[] = T("WINDOWS-");
806 int i;
807
808 UNUSED_P(userData);
809 for (i = 0; prefixU[i]; i++)
810 if (name[i] != prefixU[i] && name[i] != prefixL[i])
811 return 0;
812
813 cp = 0;
814 for (; name[i]; i++) {
815 static const XML_Char digits[] = T("0123456789");
816 const XML_Char *s = tcschr(digits, name[i]);
817 if (! s)
818 return 0;
819 cp *= 10;
820 cp += (int)(s - digits);
821 if (cp >= 0x10000)
822 return 0;
823 }
824 if (! codepageMap(cp, info->map))
825 return 0;
826 info->convert = unknownEncodingConvert;
827 /* We could just cast the code page integer to a void *,
828 and avoid the use of release. */
829 info->release = free;
830 info->data = malloc(sizeof(int));
831 if (! info->data)
832 return 0;
833 *(int *)info->data = cp;
834 return 1;
835 }
836
837 static int XMLCALL
notStandalone(void * userData)838 notStandalone(void *userData) {
839 UNUSED_P(userData);
840 return 0;
841 }
842
843 static void
showVersion(XML_Char * prog)844 showVersion(XML_Char *prog) {
845 XML_Char *s = prog;
846 XML_Char ch;
847 const XML_Feature *features = XML_GetFeatureList();
848 while ((ch = *s) != 0) {
849 if (ch == '/'
850 #if defined(_WIN32)
851 || ch == '\\'
852 #endif
853 )
854 prog = s + 1;
855 ++s;
856 }
857 ftprintf(stdout, T("%s using %s\n"), prog, XML_ExpatVersion());
858 if (features != NULL && features[0].feature != XML_FEATURE_END) {
859 int i = 1;
860 ftprintf(stdout, T("%s"), features[0].name);
861 if (features[0].value)
862 ftprintf(stdout, T("=%ld"), features[0].value);
863 while (features[i].feature != XML_FEATURE_END) {
864 ftprintf(stdout, T(", %s"), features[i].name);
865 if (features[i].value)
866 ftprintf(stdout, T("=%ld"), features[i].value);
867 ++i;
868 }
869 ftprintf(stdout, T("\n"));
870 }
871 }
872
873 static void
usage(const XML_Char * prog,int rc)874 usage(const XML_Char *prog, int rc) {
875 ftprintf(
876 stderr,
877 /* Generated with:
878 * $ xmlwf/xmlwf_helpgen.sh
879 * To update, change xmlwf/xmlwf_helpgen.py, then paste the output of
880 * xmlwf/xmlwf_helpgen.sh in here.
881 */
882 /* clang-format off */
883 T("usage:\n")
884 T(" %s [OPTIONS] [FILE ...]\n")
885 T(" %s -h\n")
886 T(" %s -v\n")
887 T("\n")
888 T("xmlwf - Determines if an XML document is well-formed\n")
889 T("\n")
890 T("positional arguments:\n")
891 T(" FILE file to process (default: STDIN)\n")
892 T("\n")
893 T("input control arguments:\n")
894 T(" -s print an error if the document is not [s]tandalone\n")
895 T(" -n enable [n]amespace processing\n")
896 T(" -p enable processing external DTDs and [p]arameter entities\n")
897 T(" -x enable processing of e[x]ternal entities\n")
898 T(" -e ENCODING override any in-document [e]ncoding declaration\n")
899 T(" -w enable support for [W]indows code pages\n")
900 T(" -r disable memory-mapping and use normal file [r]ead IO calls instead\n")
901 T(" -k when processing multiple files, [k]eep processing after first file with error\n")
902 T("\n")
903 T("output control arguments:\n")
904 T(" -d DIRECTORY output [d]estination directory\n")
905 T(" -c write a [c]opy of input XML, not canonical XML\n")
906 T(" -m write [m]eta XML, not canonical XML\n")
907 T(" -t write no XML output for [t]iming of plain parsing\n")
908 T(" -N enable adding doctype and [n]otation declarations\n")
909 T("\n")
910 T("billion laughs attack protection:\n")
911 T(" NOTE: If you ever need to increase these values for non-attack payload, please file a bug report.\n")
912 T("\n")
913 T(" -a FACTOR set maximum tolerated [a]mplification factor (default: 100.0)\n")
914 T(" -b BYTES set number of output [b]ytes needed to activate (default: 8 MiB)\n")
915 T("\n")
916 T("info arguments:\n")
917 T(" -h show this [h]elp message and exit\n")
918 T(" -v show program's [v]ersion number and exit\n")
919 T("\n")
920 T("exit status:\n")
921 T(" 0 the input files are well-formed and the output (if requested) was written successfully\n")
922 T(" 1 could not allocate data structures, signals a serious problem with execution environment\n")
923 T(" 2 one or more input files were not well-formed\n")
924 T(" 3 could not create an output file\n")
925 T(" 4 command-line argument error\n")
926 T("\n")
927 T("xmlwf of libexpat is software libre, licensed under the MIT license.\n")
928 T("Please report bugs at https://github.com/libexpat/libexpat/issues. Thank you!\n")
929 , /* clang-format on */
930 prog, prog, prog);
931 exit(rc);
932 }
933
934 #if defined(__MINGW32__) && defined(XML_UNICODE)
935 /* Silence warning about missing prototype */
936 int wmain(int argc, XML_Char **argv);
937 #endif
938
/* Fetch the value of the option character at argv[i][j] and store a
   pointer to it in constCharStarTarget.

   If text follows the option character in the same argument ("-dDIR"), the
   value is that text; otherwise it is the next argument ("-d DIR"), and
   usage() is called with XMLWF_EXIT_USAGE_ERROR when there is no next
   argument. Afterwards `i` refers to the argument following the value and
   `j` is reset to 0.

   `constCharStarTarget`, `i` and `j` must be modifiable lvalues. Wrapped
   in do { } while (0) so the macro behaves as a single statement; invoke
   it with a trailing semicolon. */
#define XMLWF_SHIFT_ARG_INTO(constCharStarTarget, argc, argv, i, j)            \
  do {                                                                         \
    if ((argv)[(i)][(j) + 1] == T('\0')) {                                     \
      if (++(i) == (argc))                                                     \
        usage((argv)[0], XMLWF_EXIT_USAGE_ERROR);                              \
      (constCharStarTarget) = (argv)[(i)];                                     \
    } else {                                                                   \
      (constCharStarTarget) = (argv)[(i)] + (j) + 1;                           \
    }                                                                          \
    (i)++;                                                                     \
    (j) = 0;                                                                   \
  } while (0)
951
952 int
tmain(int argc,XML_Char ** argv)953 tmain(int argc, XML_Char **argv) {
954 int i, j;
955 const XML_Char *outputDir = NULL;
956 const XML_Char *encoding = NULL;
957 unsigned processFlags = XML_MAP_FILE;
958 int windowsCodePages = 0;
959 int outputType = 0;
960 int useNamespaces = 0;
961 int requireStandalone = 0;
962 int requiresNotations = 0;
963 int continueOnError = 0;
964
965 float attackMaximumAmplification = -1.0f; /* signaling "not set" */
966 unsigned long long attackThresholdBytes;
967 XML_Bool attackThresholdGiven = XML_FALSE;
968
969 int exitCode = XMLWF_EXIT_SUCCESS;
970 enum XML_ParamEntityParsing paramEntityParsing
971 = XML_PARAM_ENTITY_PARSING_NEVER;
972 int useStdin = 0;
973 XmlwfUserData userData = {NULL, NULL, NULL};
974
975 #ifdef _MSC_VER
976 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
977 #endif
978
979 i = 1;
980 j = 0;
981 while (i < argc) {
982 if (j == 0) {
983 if (argv[i][0] != T('-'))
984 break;
985 if (argv[i][1] == T('-') && argv[i][2] == T('\0')) {
986 i++;
987 break;
988 }
989 j++;
990 }
991 switch (argv[i][j]) {
992 case T('r'):
993 processFlags &= ~XML_MAP_FILE;
994 j++;
995 break;
996 case T('s'):
997 requireStandalone = 1;
998 j++;
999 break;
1000 case T('n'):
1001 useNamespaces = 1;
1002 j++;
1003 break;
1004 case T('p'):
1005 paramEntityParsing = XML_PARAM_ENTITY_PARSING_ALWAYS;
1006 /* fall through */
1007 case T('x'):
1008 processFlags |= XML_EXTERNAL_ENTITIES;
1009 j++;
1010 break;
1011 case T('w'):
1012 windowsCodePages = 1;
1013 j++;
1014 break;
1015 case T('m'):
1016 outputType = 'm';
1017 j++;
1018 break;
1019 case T('c'):
1020 outputType = 'c';
1021 useNamespaces = 0;
1022 j++;
1023 break;
1024 case T('t'):
1025 outputType = 't';
1026 j++;
1027 break;
1028 case T('N'):
1029 requiresNotations = 1;
1030 j++;
1031 break;
1032 case T('d'):
1033 XMLWF_SHIFT_ARG_INTO(outputDir, argc, argv, i, j);
1034 break;
1035 case T('e'):
1036 XMLWF_SHIFT_ARG_INTO(encoding, argc, argv, i, j);
1037 break;
1038 case T('h'):
1039 usage(argv[0], XMLWF_EXIT_SUCCESS);
1040 return 0;
1041 case T('v'):
1042 showVersion(argv[0]);
1043 return 0;
1044 case T('k'):
1045 continueOnError = 1;
1046 j++;
1047 break;
1048 case T('a'): {
1049 const XML_Char *valueText = NULL;
1050 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1051
1052 errno = 0;
1053 XML_Char *afterValueText = (XML_Char *)valueText;
1054 attackMaximumAmplification = tcstof(valueText, &afterValueText);
1055 if ((errno != 0) || (afterValueText[0] != T('\0'))
1056 || isnan(attackMaximumAmplification)
1057 || (attackMaximumAmplification < 1.0f)) {
1058 // This prevents tperror(..) from reporting misleading "[..]: Success"
1059 errno = ERANGE;
1060 tperror(T("invalid amplification limit") T(
1061 " (needs a floating point number greater or equal than 1.0)"));
1062 exit(XMLWF_EXIT_USAGE_ERROR);
1063 }
1064 #ifndef XML_DTD
1065 ftprintf(stderr, T("Warning: Given amplification limit ignored") T(
1066 ", xmlwf has been compiled without DTD support.\n"));
1067 #endif
1068 break;
1069 }
1070 case T('b'): {
1071 const XML_Char *valueText = NULL;
1072 XMLWF_SHIFT_ARG_INTO(valueText, argc, argv, i, j);
1073
1074 errno = 0;
1075 XML_Char *afterValueText = (XML_Char *)valueText;
1076 attackThresholdBytes = tcstoull(valueText, &afterValueText, 10);
1077 if ((errno != 0) || (afterValueText[0] != T('\0'))) {
1078 // This prevents tperror(..) from reporting misleading "[..]: Success"
1079 errno = ERANGE;
1080 tperror(T("invalid ignore threshold")
1081 T(" (needs an integer from 0 to 2^64-1)"));
1082 exit(XMLWF_EXIT_USAGE_ERROR);
1083 }
1084 attackThresholdGiven = XML_TRUE;
1085 #ifndef XML_DTD
1086 ftprintf(stderr, T("Warning: Given attack threshold ignored") T(
1087 ", xmlwf has been compiled without DTD support.\n"));
1088 #endif
1089 break;
1090 }
1091 case T('\0'):
1092 if (j > 1) {
1093 i++;
1094 j = 0;
1095 break;
1096 }
1097 /* fall through */
1098 default:
1099 usage(argv[0], XMLWF_EXIT_USAGE_ERROR);
1100 }
1101 }
1102 if (i == argc) {
1103 useStdin = 1;
1104 processFlags &= ~XML_MAP_FILE;
1105 i--;
1106 }
1107 for (; i < argc; i++) {
1108 XML_Char *outName = 0;
1109 int result;
1110 XML_Parser parser;
1111 if (useNamespaces)
1112 parser = XML_ParserCreateNS(encoding, NSSEP);
1113 else
1114 parser = XML_ParserCreate(encoding);
1115
1116 if (! parser) {
1117 tperror(T("Could not instantiate parser"));
1118 exit(XMLWF_EXIT_INTERNAL_ERROR);
1119 }
1120
1121 if (attackMaximumAmplification != -1.0f) {
1122 #ifdef XML_DTD
1123 XML_SetBillionLaughsAttackProtectionMaximumAmplification(
1124 parser, attackMaximumAmplification);
1125 #endif
1126 }
1127 if (attackThresholdGiven) {
1128 #ifdef XML_DTD
1129 XML_SetBillionLaughsAttackProtectionActivationThreshold(
1130 parser, attackThresholdBytes);
1131 #else
1132 (void)attackThresholdBytes; // silence -Wunused-but-set-variable
1133 #endif
1134 }
1135
1136 if (requireStandalone)
1137 XML_SetNotStandaloneHandler(parser, notStandalone);
1138 XML_SetParamEntityParsing(parser, paramEntityParsing);
1139 if (outputType == 't') {
1140 /* This is for doing timings; this gives a more realistic estimate of
1141 the parsing time. */
1142 outputDir = 0;
1143 XML_SetElementHandler(parser, nopStartElement, nopEndElement);
1144 XML_SetCharacterDataHandler(parser, nopCharacterData);
1145 XML_SetProcessingInstructionHandler(parser, nopProcessingInstruction);
1146 } else if (outputDir) {
1147 const XML_Char *delim = T("/");
1148 const XML_Char *file = useStdin ? T("STDIN") : argv[i];
1149 if (! useStdin) {
1150 /* Jump after last (back)slash */
1151 const XML_Char *lastDelim = tcsrchr(file, delim[0]);
1152 if (lastDelim)
1153 file = lastDelim + 1;
1154 #if defined(_WIN32)
1155 else {
1156 const XML_Char *winDelim = T("\\");
1157 lastDelim = tcsrchr(file, winDelim[0]);
1158 if (lastDelim) {
1159 file = lastDelim + 1;
1160 delim = winDelim;
1161 }
1162 }
1163 #endif
1164 }
1165 outName = (XML_Char *)malloc((tcslen(outputDir) + tcslen(file) + 2)
1166 * sizeof(XML_Char));
1167 if (! outName) {
1168 tperror(T("Could not allocate memory"));
1169 exit(XMLWF_EXIT_INTERNAL_ERROR);
1170 }
1171 tcscpy(outName, outputDir);
1172 tcscat(outName, delim);
1173 tcscat(outName, file);
1174 userData.fp = tfopen(outName, T("wb"));
1175 if (! userData.fp) {
1176 tperror(outName);
1177 exitCode = XMLWF_EXIT_OUTPUT_ERROR;
1178 free(outName);
1179 XML_ParserFree(parser);
1180 if (continueOnError) {
1181 continue;
1182 } else {
1183 break;
1184 }
1185 }
1186 setvbuf(userData.fp, NULL, _IOFBF, 16384);
1187 #ifdef XML_UNICODE
1188 puttc(0xFEFF, userData.fp);
1189 #endif
1190 XML_SetUserData(parser, &userData);
1191 switch (outputType) {
1192 case 'm':
1193 XML_UseParserAsHandlerArg(parser);
1194 XML_SetElementHandler(parser, metaStartElement, metaEndElement);
1195 XML_SetProcessingInstructionHandler(parser, metaProcessingInstruction);
1196 XML_SetCommentHandler(parser, metaComment);
1197 XML_SetCdataSectionHandler(parser, metaStartCdataSection,
1198 metaEndCdataSection);
1199 XML_SetCharacterDataHandler(parser, metaCharacterData);
1200 XML_SetDoctypeDeclHandler(parser, metaStartDoctypeDecl,
1201 metaEndDoctypeDecl);
1202 XML_SetEntityDeclHandler(parser, metaEntityDecl);
1203 XML_SetNotationDeclHandler(parser, metaNotationDecl);
1204 XML_SetNamespaceDeclHandler(parser, metaStartNamespaceDecl,
1205 metaEndNamespaceDecl);
1206 metaStartDocument(parser);
1207 break;
1208 case 'c':
1209 XML_UseParserAsHandlerArg(parser);
1210 XML_SetDefaultHandler(parser, markup);
1211 XML_SetElementHandler(parser, defaultStartElement, defaultEndElement);
1212 XML_SetCharacterDataHandler(parser, defaultCharacterData);
1213 XML_SetProcessingInstructionHandler(parser,
1214 defaultProcessingInstruction);
1215 break;
1216 default:
1217 if (useNamespaces)
1218 XML_SetElementHandler(parser, startElementNS, endElementNS);
1219 else
1220 XML_SetElementHandler(parser, startElement, endElement);
1221 XML_SetCharacterDataHandler(parser, characterData);
1222 #ifndef W3C14N
1223 XML_SetProcessingInstructionHandler(parser, processingInstruction);
1224 if (requiresNotations) {
1225 XML_SetDoctypeDeclHandler(parser, startDoctypeDecl, endDoctypeDecl);
1226 XML_SetNotationDeclHandler(parser, notationDecl);
1227 }
1228 #endif /* not W3C14N */
1229 break;
1230 }
1231 }
1232 if (windowsCodePages)
1233 XML_SetUnknownEncodingHandler(parser, unknownEncoding, 0);
1234 result = XML_ProcessFile(parser, useStdin ? NULL : argv[i], processFlags);
1235 if (outputDir) {
1236 if (outputType == 'm')
1237 metaEndDocument(parser);
1238 fclose(userData.fp);
1239 if (! result) {
1240 tremove(outName);
1241 }
1242 free(outName);
1243 }
1244 XML_ParserFree(parser);
1245 if (! result) {
1246 exitCode = XMLWF_EXIT_NOT_WELLFORMED;
1247 cleanupUserData(&userData);
1248 if (! continueOnError) {
1249 break;
1250 }
1251 }
1252 }
1253 return exitCode;
1254 }
1255