12e724bc9Sbluhm /*
22e724bc9Sbluhm __ __ _
32e724bc9Sbluhm ___\ \/ /_ __ __ _| |_
42e724bc9Sbluhm / _ \\ /| '_ \ / _` | __|
52e724bc9Sbluhm | __// \| |_) | (_| | |_
62e724bc9Sbluhm \___/_/\_\ .__/ \__,_|\__|
72e724bc9Sbluhm |_| XML parser
82e724bc9Sbluhm
92e724bc9Sbluhm Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
1008819b41Sbluhm Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
1108819b41Sbluhm Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
1208819b41Sbluhm Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
1308819b41Sbluhm Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14253fd6bfSbluhm Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15*bd8f1dc3Sbluhm Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
1608819b41Sbluhm Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
1708819b41Sbluhm Copyright (c) 2016 Don Lewis <truckman@apache.org>
1808819b41Sbluhm Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
1908819b41Sbluhm Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
2008819b41Sbluhm Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
2108819b41Sbluhm Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
2208819b41Sbluhm Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23*bd8f1dc3Sbluhm Copyright (c) 2021 Donghee Na <donghee.na@python.org>
249029d806Sbluhm Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
25*bd8f1dc3Sbluhm Copyright (c) 2022 Sean McBride <sean@rogue-research.com>
26*bd8f1dc3Sbluhm Copyright (c) 2023 Hanno Böck <hanno@gentoo.org>
272e724bc9Sbluhm Licensed under the MIT license:
282e724bc9Sbluhm
292e724bc9Sbluhm Permission is hereby granted, free of charge, to any person obtaining
302e724bc9Sbluhm a copy of this software and associated documentation files (the
312e724bc9Sbluhm "Software"), to deal in the Software without restriction, including
322e724bc9Sbluhm without limitation the rights to use, copy, modify, merge, publish,
332e724bc9Sbluhm distribute, sublicense, and/or sell copies of the Software, and to permit
342e724bc9Sbluhm persons to whom the Software is furnished to do so, subject to the
352e724bc9Sbluhm following conditions:
362e724bc9Sbluhm
372e724bc9Sbluhm The above copyright notice and this permission notice shall be included
382e724bc9Sbluhm in all copies or substantial portions of the Software.
392e724bc9Sbluhm
402e724bc9Sbluhm THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
412e724bc9Sbluhm EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
422e724bc9Sbluhm MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
432e724bc9Sbluhm NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
442e724bc9Sbluhm DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
452e724bc9Sbluhm OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
462e724bc9Sbluhm USE OR OTHER DEALINGS IN THE SOFTWARE.
471a3ddf8cSespie */
481a3ddf8cSespie
49*bd8f1dc3Sbluhm #include "expat_config.h"
502c19dcf8Sbluhm
517d36914fSalek #include <stddef.h>
529b8e2351Sbluhm #include <string.h> /* memcpy */
532e724bc9Sbluhm #include <stdbool.h>
549b8e2351Sbluhm
552feb5d2aSbluhm #ifdef _WIN32
561a3ddf8cSespie # include "winconfig.h"
577d36914fSalek #endif
5808819b41Sbluhm
597d36914fSalek #include "expat_external.h"
601a3ddf8cSespie #include "internal.h"
611a3ddf8cSespie #include "xmltok.h"
621a3ddf8cSespie #include "nametab.h"
631a3ddf8cSespie
/* Optional extra scanner entry: PREFIX(ignoreSectionTok) is appended to the
   scanner list only when the library is built with XML_DTD. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Leading part of an ENCODING initializer, assembled from the
   PREFIX()-named functions.  The two converters that follow it are supplied
   by whoever expands VTABLE1 (see VTABLE below and the encoding tables). */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named toUtf8/toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
791a3ddf8cSespie
/* Tests the naming bit for the 16-bit character whose high byte is hi and
   low byte is lo.  pages[hi] selects a group of eight bitmap words, the top
   3 bits of lo pick the word and the bottom 5 bits pick the bit.
   All arguments are parenthesized so that expressions such as `tab + 1`
   can be passed safely. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
1031a3ddf8cSespie
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro expects p to point at unsigned char data holding at least
   as many bytes as the sequence length in the macro name, and yields
   nonzero when the sequence is invalid.  The argument is fully
   parenthesized, so pointer expressions (e.g. `buf + 1`) are safe.
*/

/* Lead byte below 0xC2 (overlong) or second byte not in 0x80..0xBF. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* Third byte must be 0x80..0xBF (0x80..0xBD after EF,BF); second byte
   must be 0xA0..0xBF after E0, 0x80..0x9F after ED, else 0x80..0xBF. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* Third and fourth bytes must be 0x80..0xBF; second byte must be
   0x90..0xBF after F0, 0x80..0x8F after F4, else 0x80..0xBF. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
1331a3ddf8cSespie
1341a3ddf8cSespie static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)13528ce3119Sbluhm isNever(const ENCODING *enc, const char *p) {
13628ce3119Sbluhm UNUSED_P(enc);
13728ce3119Sbluhm UNUSED_P(p);
1381a3ddf8cSespie return 0;
1391a3ddf8cSespie }
1401a3ddf8cSespie
1411a3ddf8cSespie static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)14228ce3119Sbluhm utf8_isName2(const ENCODING *enc, const char *p) {
14328ce3119Sbluhm UNUSED_P(enc);
1441a3ddf8cSespie return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
1451a3ddf8cSespie }
1461a3ddf8cSespie
1471a3ddf8cSespie static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)14828ce3119Sbluhm utf8_isName3(const ENCODING *enc, const char *p) {
14928ce3119Sbluhm UNUSED_P(enc);
1501a3ddf8cSespie return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
1511a3ddf8cSespie }
1521a3ddf8cSespie
1531a3ddf8cSespie #define utf8_isName4 isNever
1541a3ddf8cSespie
1551a3ddf8cSespie static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)15628ce3119Sbluhm utf8_isNmstrt2(const ENCODING *enc, const char *p) {
15728ce3119Sbluhm UNUSED_P(enc);
1581a3ddf8cSespie return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
1591a3ddf8cSespie }
1601a3ddf8cSespie
1611a3ddf8cSespie static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)16228ce3119Sbluhm utf8_isNmstrt3(const ENCODING *enc, const char *p) {
16328ce3119Sbluhm UNUSED_P(enc);
1641a3ddf8cSespie return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
1651a3ddf8cSespie }
1661a3ddf8cSespie
1671a3ddf8cSespie #define utf8_isNmstrt4 isNever
1681a3ddf8cSespie
1691a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)17028ce3119Sbluhm utf8_isInvalid2(const ENCODING *enc, const char *p) {
17128ce3119Sbluhm UNUSED_P(enc);
1721a3ddf8cSespie return UTF8_INVALID2((const unsigned char *)p);
1731a3ddf8cSespie }
1741a3ddf8cSespie
1751a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)17628ce3119Sbluhm utf8_isInvalid3(const ENCODING *enc, const char *p) {
17728ce3119Sbluhm UNUSED_P(enc);
1781a3ddf8cSespie return UTF8_INVALID3((const unsigned char *)p);
1791a3ddf8cSespie }
1801a3ddf8cSespie
1811a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)18228ce3119Sbluhm utf8_isInvalid4(const ENCODING *enc, const char *p) {
18328ce3119Sbluhm UNUSED_P(enc);
1841a3ddf8cSespie return UTF8_INVALID4((const unsigned char *)p);
1851a3ddf8cSespie }
1861a3ddf8cSespie
1871a3ddf8cSespie struct normal_encoding {
1881a3ddf8cSespie ENCODING enc;
1891a3ddf8cSespie unsigned char type[256];
1901a3ddf8cSespie #ifdef XML_MIN_SIZE
1911a3ddf8cSespie int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
1921a3ddf8cSespie int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
1931a3ddf8cSespie int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
1941a3ddf8cSespie int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
1951a3ddf8cSespie int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
1961a3ddf8cSespie #endif /* XML_MIN_SIZE */
1971a3ddf8cSespie int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
1981a3ddf8cSespie int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
1991a3ddf8cSespie int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
2001a3ddf8cSespie int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
2011a3ddf8cSespie int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
2021a3ddf8cSespie int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
2031a3ddf8cSespie int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
2041a3ddf8cSespie int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
2051a3ddf8cSespie int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
2061a3ddf8cSespie };
2071a3ddf8cSespie
2081a3ddf8cSespie #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
2091a3ddf8cSespie
2101a3ddf8cSespie #ifdef XML_MIN_SIZE
2111a3ddf8cSespie
2121a3ddf8cSespie # define STANDARD_VTABLE(E) \
21328ce3119Sbluhm E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
2141a3ddf8cSespie
2151a3ddf8cSespie #else
2161a3ddf8cSespie
2171a3ddf8cSespie # define STANDARD_VTABLE(E) /* as nothing */
2181a3ddf8cSespie
2191a3ddf8cSespie #endif
2201a3ddf8cSespie
2211a3ddf8cSespie #define NORMAL_VTABLE(E) \
22228ce3119Sbluhm E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
22328ce3119Sbluhm E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
2241a3ddf8cSespie
2252feb5d2aSbluhm #define NULL_VTABLE \
22628ce3119Sbluhm /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
22728ce3119Sbluhm /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
22828ce3119Sbluhm /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
2292feb5d2aSbluhm
230*bd8f1dc3Sbluhm static int FASTCALL checkCharRefNumber(int result);
2311a3ddf8cSespie
2321a3ddf8cSespie #include "xmltok_impl.h"
2331a3ddf8cSespie #include "ascii.h"
2341a3ddf8cSespie
/* In XML_MIN_SIZE builds the single-byte (sb_) minimum-size name checks
   always answer 0. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, looked up in the type[] table of the
   struct normal_encoding that enc points to. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
2491a3ddf8cSespie
/* BYTE_TYPE(enc, p): byte type of the byte at p.  XML_MIN_SIZE builds
   dispatch through the encoding's byteType pointer (sb_byteType for the
   single-byte tables); regular builds read the type[] table directly. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
2591a3ddf8cSespie
/* BYTE_TO_ASCII(enc, p): the byte at p taken as a character value.
   XML_MIN_SIZE builds dispatch through the encoding's byteToAscii pointer
   (sb_byteToAscii simply returns *p); regular builds read *p directly. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
2701a3ddf8cSespie
/* Multi-byte character checks; n (2, 3 or 4) is pasted onto the member
   name, so the call goes through isName<n> / isNmstrt<n> / isInvalid<n>
   of the struct normal_encoding behind enc. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
#ifdef XML_MIN_SIZE
/* The pointer is tested before the call here, so a NULL isInvalid<n>
   member yields 0 in XML_MIN_SIZE builds. */
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Checks for characters of the minimum size: dispatched through the
   encoding in XML_MIN_SIZE builds, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
2911a3ddf8cSespie
/* CHAR_MATCHES(enc, p, c): nonzero when the character at p equals c.
   XML_MIN_SIZE builds dispatch through the encoding's charMatches pointer
   (sb_charMatches compares *p with c); regular builds compare inline. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
3041a3ddf8cSespie
3051a3ddf8cSespie #define PREFIX(ident) normal_##ident
306b26ab0f8Smatthieu #define XML_TOK_IMPL_C
3071a3ddf8cSespie #include "xmltok_impl.c"
308b26ab0f8Smatthieu #undef XML_TOK_IMPL_C
3091a3ddf8cSespie
3101a3ddf8cSespie #undef MINBPC
3111a3ddf8cSespie #undef BYTE_TYPE
3121a3ddf8cSespie #undef BYTE_TO_ASCII
3131a3ddf8cSespie #undef CHAR_MATCHES
3141a3ddf8cSespie #undef IS_NAME_CHAR
3151a3ddf8cSespie #undef IS_NAME_CHAR_MINBPC
3161a3ddf8cSespie #undef IS_NMSTRT_CHAR
3171a3ddf8cSespie #undef IS_NMSTRT_CHAR_MINBPC
3181a3ddf8cSespie #undef IS_INVALID_CHAR
3191a3ddf8cSespie
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
3261a3ddf8cSespie
/* Moves *fromLimRef backwards so that the range [from, *fromLimRef) does
   not end in the middle of a multi-byte UTF-8 character.

   The range is scanned from its end towards `from`.  `walked` counts the
   bytes already stepped over since the last reset, so `walked + 1` is the
   number of bytes available starting at the lead byte under inspection:
     - a lead byte with enough bytes after it: the limit is restored to
       just past that character and the scan stops;
     - a lead byte with too few bytes after it: the partial character is
       dropped (the limit ends up in front of it) and the scan goes on;
     - an ASCII byte (0b0xxxxxxx): the scan stops with the limit just past it;
     - any other byte is stepped over.
   The limit is never moved forward beyond its initial value, and never
   before `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u)
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u)
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u)
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u)
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
  }
  *fromLimRef = fromLim;
}
3652feb5d2aSbluhm
366525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)36728ce3119Sbluhm utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
36828ce3119Sbluhm char **toP, const char *toLim) {
3692e724bc9Sbluhm bool input_incomplete = false;
3702e724bc9Sbluhm bool output_exhausted = false;
3712feb5d2aSbluhm
3722e724bc9Sbluhm /* Avoid copying partial characters (due to limited space). */
3732e724bc9Sbluhm const ptrdiff_t bytesAvailable = fromLim - *fromP;
3742e724bc9Sbluhm const ptrdiff_t bytesStorable = toLim - *toP;
37528ce3119Sbluhm UNUSED_P(enc);
3762e724bc9Sbluhm if (bytesAvailable > bytesStorable) {
3772e724bc9Sbluhm fromLim = *fromP + bytesStorable;
3782e724bc9Sbluhm output_exhausted = true;
3792e724bc9Sbluhm }
3802e724bc9Sbluhm
3812e724bc9Sbluhm /* Avoid copying partial characters (from incomplete input). */
3829b8e2351Sbluhm {
3832e724bc9Sbluhm const char *const fromLimBefore = fromLim;
3849b8e2351Sbluhm _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
3852e724bc9Sbluhm if (fromLim < fromLimBefore) {
3862e724bc9Sbluhm input_incomplete = true;
3872e724bc9Sbluhm }
3889b8e2351Sbluhm }
3892feb5d2aSbluhm
3909b8e2351Sbluhm {
3912e724bc9Sbluhm const ptrdiff_t bytesToCopy = fromLim - *fromP;
3929b8e2351Sbluhm memcpy(*toP, *fromP, bytesToCopy);
3932e724bc9Sbluhm *fromP += bytesToCopy;
3942e724bc9Sbluhm *toP += bytesToCopy;
3959b8e2351Sbluhm }
396525cdfc7Srpointel
3979b8e2351Sbluhm if (output_exhausted) /* needs to go first */
398525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
3992e724bc9Sbluhm else if (input_incomplete)
4002e724bc9Sbluhm return XML_CONVERT_INPUT_INCOMPLETE;
401525cdfc7Srpointel else
4022feb5d2aSbluhm return XML_CONVERT_COMPLETED;
4031a3ddf8cSespie }
4041a3ddf8cSespie
405525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)40628ce3119Sbluhm utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
40728ce3119Sbluhm unsigned short **toP, const unsigned short *toLim) {
408525cdfc7Srpointel enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
4091a3ddf8cSespie unsigned short *to = *toP;
4101a3ddf8cSespie const char *from = *fromP;
411525cdfc7Srpointel while (from < fromLim && to < toLim) {
412*bd8f1dc3Sbluhm switch (SB_BYTE_TYPE(enc, from)) {
4131a3ddf8cSespie case BT_LEAD2:
414525cdfc7Srpointel if (fromLim - from < 2) {
415525cdfc7Srpointel res = XML_CONVERT_INPUT_INCOMPLETE;
4162feb5d2aSbluhm goto after;
417525cdfc7Srpointel }
4181a3ddf8cSespie *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
4191a3ddf8cSespie from += 2;
4201a3ddf8cSespie break;
4211a3ddf8cSespie case BT_LEAD3:
422525cdfc7Srpointel if (fromLim - from < 3) {
423525cdfc7Srpointel res = XML_CONVERT_INPUT_INCOMPLETE;
4242feb5d2aSbluhm goto after;
425525cdfc7Srpointel }
42628ce3119Sbluhm *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
42728ce3119Sbluhm | (from[2] & 0x3f));
4281a3ddf8cSespie from += 3;
4291a3ddf8cSespie break;
43028ce3119Sbluhm case BT_LEAD4: {
4311a3ddf8cSespie unsigned long n;
432525cdfc7Srpointel if (toLim - to < 2) {
433525cdfc7Srpointel res = XML_CONVERT_OUTPUT_EXHAUSTED;
4341a3ddf8cSespie goto after;
435525cdfc7Srpointel }
436525cdfc7Srpointel if (fromLim - from < 4) {
437525cdfc7Srpointel res = XML_CONVERT_INPUT_INCOMPLETE;
438525cdfc7Srpointel goto after;
439525cdfc7Srpointel }
4401a3ddf8cSespie n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
4411a3ddf8cSespie | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
4421a3ddf8cSespie n -= 0x10000;
4431a3ddf8cSespie to[0] = (unsigned short)((n >> 10) | 0xD800);
4441a3ddf8cSespie to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
4451a3ddf8cSespie to += 2;
4461a3ddf8cSespie from += 4;
44728ce3119Sbluhm } break;
4481a3ddf8cSespie default:
4491a3ddf8cSespie *to++ = *from++;
4501a3ddf8cSespie break;
4511a3ddf8cSespie }
4521a3ddf8cSespie }
4532feb5d2aSbluhm if (from < fromLim)
4542feb5d2aSbluhm res = XML_CONVERT_OUTPUT_EXHAUSTED;
4551a3ddf8cSespie after:
4561a3ddf8cSespie *fromP = from;
4571a3ddf8cSespie *toP = to;
458525cdfc7Srpointel return res;
4591a3ddf8cSespie }
4601a3ddf8cSespie
4611a3ddf8cSespie #ifdef XML_NS
46228ce3119Sbluhm static const struct normal_encoding utf8_encoding_ns
46328ce3119Sbluhm = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4641a3ddf8cSespie {
4651a3ddf8cSespie # include "asciitab.h"
4661a3ddf8cSespie # include "utf8tab.h"
4671a3ddf8cSespie },
46828ce3119Sbluhm STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4691a3ddf8cSespie #endif
4701a3ddf8cSespie
47128ce3119Sbluhm static const struct normal_encoding utf8_encoding
47228ce3119Sbluhm = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4731a3ddf8cSespie {
4741a3ddf8cSespie #define BT_COLON BT_NMSTRT
4751a3ddf8cSespie #include "asciitab.h"
4761a3ddf8cSespie #undef BT_COLON
4771a3ddf8cSespie #include "utf8tab.h"
4781a3ddf8cSespie },
47928ce3119Sbluhm STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4801a3ddf8cSespie
4811a3ddf8cSespie #ifdef XML_NS
4821a3ddf8cSespie
48328ce3119Sbluhm static const struct normal_encoding internal_utf8_encoding_ns
48428ce3119Sbluhm = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4851a3ddf8cSespie {
4861a3ddf8cSespie # include "iasciitab.h"
4871a3ddf8cSespie # include "utf8tab.h"
4881a3ddf8cSespie },
48928ce3119Sbluhm STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4901a3ddf8cSespie
4911a3ddf8cSespie #endif
4921a3ddf8cSespie
49328ce3119Sbluhm static const struct normal_encoding internal_utf8_encoding
49428ce3119Sbluhm = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4951a3ddf8cSespie {
4961a3ddf8cSespie #define BT_COLON BT_NMSTRT
4971a3ddf8cSespie #include "iasciitab.h"
4981a3ddf8cSespie #undef BT_COLON
4991a3ddf8cSespie #include "utf8tab.h"
5001a3ddf8cSespie },
50128ce3119Sbluhm STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
5021a3ddf8cSespie
503525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)50428ce3119Sbluhm latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
50528ce3119Sbluhm char **toP, const char *toLim) {
50628ce3119Sbluhm UNUSED_P(enc);
5071a3ddf8cSespie for (;;) {
5081a3ddf8cSespie unsigned char c;
5091a3ddf8cSespie if (*fromP == fromLim)
510525cdfc7Srpointel return XML_CONVERT_COMPLETED;
5111a3ddf8cSespie c = (unsigned char)**fromP;
5121a3ddf8cSespie if (c & 0x80) {
5131a3ddf8cSespie if (toLim - *toP < 2)
514525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
5151a3ddf8cSespie *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
5161a3ddf8cSespie *(*toP)++ = (char)((c & 0x3f) | 0x80);
5171a3ddf8cSespie (*fromP)++;
51828ce3119Sbluhm } else {
5191a3ddf8cSespie if (*toP == toLim)
520525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
5211a3ddf8cSespie *(*toP)++ = *(*fromP)++;
5221a3ddf8cSespie }
5231a3ddf8cSespie }
5241a3ddf8cSespie }
5251a3ddf8cSespie
526525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)52728ce3119Sbluhm latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
52828ce3119Sbluhm unsigned short **toP, const unsigned short *toLim) {
52928ce3119Sbluhm UNUSED_P(enc);
530525cdfc7Srpointel while (*fromP < fromLim && *toP < toLim)
5311a3ddf8cSespie *(*toP)++ = (unsigned char)*(*fromP)++;
532525cdfc7Srpointel
533525cdfc7Srpointel if ((*toP == toLim) && (*fromP < fromLim))
534525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
535525cdfc7Srpointel else
536525cdfc7Srpointel return XML_CONVERT_COMPLETED;
5371a3ddf8cSespie }
5381a3ddf8cSespie
5391a3ddf8cSespie #ifdef XML_NS
5401a3ddf8cSespie
54128ce3119Sbluhm static const struct normal_encoding latin1_encoding_ns
54228ce3119Sbluhm = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
5431a3ddf8cSespie {
5441a3ddf8cSespie # include "asciitab.h"
5451a3ddf8cSespie # include "latin1tab.h"
5461a3ddf8cSespie },
54728ce3119Sbluhm STANDARD_VTABLE(sb_) NULL_VTABLE};
5481a3ddf8cSespie
5491a3ddf8cSespie #endif
5501a3ddf8cSespie
55128ce3119Sbluhm static const struct normal_encoding latin1_encoding
55228ce3119Sbluhm = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
5531a3ddf8cSespie {
5541a3ddf8cSespie #define BT_COLON BT_NMSTRT
5551a3ddf8cSespie #include "asciitab.h"
5561a3ddf8cSespie #undef BT_COLON
5571a3ddf8cSespie #include "latin1tab.h"
5581a3ddf8cSespie },
55928ce3119Sbluhm STANDARD_VTABLE(sb_) NULL_VTABLE};
5601a3ddf8cSespie
561525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)56228ce3119Sbluhm ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
56328ce3119Sbluhm char **toP, const char *toLim) {
56428ce3119Sbluhm UNUSED_P(enc);
565525cdfc7Srpointel while (*fromP < fromLim && *toP < toLim)
5661a3ddf8cSespie *(*toP)++ = *(*fromP)++;
567525cdfc7Srpointel
568525cdfc7Srpointel if ((*toP == toLim) && (*fromP < fromLim))
569525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
570525cdfc7Srpointel else
571525cdfc7Srpointel return XML_CONVERT_COMPLETED;
5721a3ddf8cSespie }
5731a3ddf8cSespie
5741a3ddf8cSespie #ifdef XML_NS
5751a3ddf8cSespie
57628ce3119Sbluhm static const struct normal_encoding ascii_encoding_ns
57728ce3119Sbluhm = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
5781a3ddf8cSespie {
5791a3ddf8cSespie # include "asciitab.h"
5801a3ddf8cSespie /* BT_NONXML == 0 */
5811a3ddf8cSespie },
58228ce3119Sbluhm STANDARD_VTABLE(sb_) NULL_VTABLE};
5831a3ddf8cSespie
5841a3ddf8cSespie #endif
5851a3ddf8cSespie
58628ce3119Sbluhm static const struct normal_encoding ascii_encoding
58728ce3119Sbluhm = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
5881a3ddf8cSespie {
5891a3ddf8cSespie #define BT_COLON BT_NMSTRT
5901a3ddf8cSespie #include "asciitab.h"
5911a3ddf8cSespie #undef BT_COLON
5921a3ddf8cSespie /* BT_NONXML == 0 */
5931a3ddf8cSespie },
59428ce3119Sbluhm STANDARD_VTABLE(sb_) NULL_VTABLE};
5951a3ddf8cSespie
5961a3ddf8cSespie static int PTRFASTCALL
unicode_byte_type(char hi,char lo)59728ce3119Sbluhm unicode_byte_type(char hi, char lo) {
5981a3ddf8cSespie switch ((unsigned char)hi) {
5992a4a206eSbluhm /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
60028ce3119Sbluhm case 0xD8:
60128ce3119Sbluhm case 0xD9:
60228ce3119Sbluhm case 0xDA:
60328ce3119Sbluhm case 0xDB:
6041a3ddf8cSespie return BT_LEAD4;
6052a4a206eSbluhm /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
60628ce3119Sbluhm case 0xDC:
60728ce3119Sbluhm case 0xDD:
60828ce3119Sbluhm case 0xDE:
60928ce3119Sbluhm case 0xDF:
6101a3ddf8cSespie return BT_TRAIL;
6111a3ddf8cSespie case 0xFF:
6121a3ddf8cSespie switch ((unsigned char)lo) {
61328ce3119Sbluhm case 0xFF: /* noncharacter-FFFF */
61428ce3119Sbluhm case 0xFE: /* noncharacter-FFFE */
6151a3ddf8cSespie return BT_NONXML;
6161a3ddf8cSespie }
6171a3ddf8cSespie break;
6181a3ddf8cSespie }
6191a3ddf8cSespie return BT_NONASCII;
6201a3ddf8cSespie }
6211a3ddf8cSespie
/* Defines E##toUtf8, a UTF-16 to UTF-8 converter.  GET_LO and GET_HI must
   be defined at the point of expansion; they select the low and high byte
   of the 2-byte unit at a pointer.

   The generated function first rounds the input length down to an even
   number of bytes, then converts unit by unit:
     - units below 0x80 become 1 byte,
     - units below 0x800 become 2 bytes,
     - high surrogates (high byte 0xD8..0xDB) are combined with the
       following unit into 4 bytes,
     - all remaining units become 3 bytes.
   It returns XML_CONVERT_OUTPUT_EXHAUSTED when the next character does not
   fit before toLim and XML_CONVERT_INPUT_INCOMPLETE when a high surrogate
   is not followed by another unit; in both cases *fromP is left at the
   unconverted unit.  Otherwise *fromP is set to the position reached. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        /* output space is checked before input availability */               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2; /* step onto the second unit of the pair */                 \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
6981a3ddf8cSespie
/* Generate the function E##toUtf16, which copies UTF-16 code units from a
   byte buffer into an array of unsigned short.  Each unit is assembled as
   (GET_HI << 8) | GET_LO, so GET_HI and GET_LO must be defined at the point
   where this macro is expanded.

   The generated function advances *fromP and *toP past what it converted
   and returns:
     XML_CONVERT_OUTPUT_EXHAUSTED  - the output filled up with input left;
     XML_CONVERT_INPUT_INCOMPLETE  - the last unit that would fit is a lead
                                     surrogate (0xD8-0xDB high byte) and was
                                     held back so a pair is never split;
     XML_CONVERT_COMPLETED         - otherwise.
   A trailing odd byte in the input is ignored (the limit is rounded down to
   an even number of bytes). */
#define DEFINE_UTF16_TO_UTF16(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf16( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      unsigned short **toP, const unsigned short *toLim) { \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
    UNUSED_P(enc); \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
      fromLim -= 2; \
      res = XML_CONVERT_INPUT_INCOMPLETE; \
    } \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    if ((*toP == toLim) && (*fromP < fromLim)) \
      return XML_CONVERT_OUTPUT_EXHAUSTED; \
    else \
      return res; \
  }

7201a3ddf8cSespie #define GET_LO(ptr) ((unsigned char)(ptr)[0])
7211a3ddf8cSespie #define GET_HI(ptr) ((unsigned char)(ptr)[1])
7221a3ddf8cSespie
7231a3ddf8cSespie DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)7241a3ddf8cSespie DEFINE_UTF16_TO_UTF16(little2_)
7251a3ddf8cSespie
7261a3ddf8cSespie #undef GET_LO
7271a3ddf8cSespie #undef GET_HI
7281a3ddf8cSespie
7291a3ddf8cSespie #define GET_LO(ptr) ((unsigned char)(ptr)[1])
7301a3ddf8cSespie #define GET_HI(ptr) ((unsigned char)(ptr)[0])
7311a3ddf8cSespie
7321a3ddf8cSespie DEFINE_UTF16_TO_UTF8(big2_)
7331a3ddf8cSespie DEFINE_UTF16_TO_UTF16(big2_)
7341a3ddf8cSespie
7351a3ddf8cSespie #undef GET_LO
7361a3ddf8cSespie #undef GET_HI
7371a3ddf8cSespie
/* Character classification helpers for little-endian two-byte input, where
   p[0] is the low byte and p[1] the high byte of the code unit at p.

   LITTLE2_BYTE_TYPE     - byte type: single-byte table lookup when the high
                           byte is 0, otherwise unicode_byte_type(hi, lo).
   LITTLE2_BYTE_TO_ASCII - the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES  - true when the unit equals the character c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                         - naming-table lookup with (hi, lo).
   Macro arguments are parenthesized so that any pointer expression may be
   passed for p. */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])

7471a3ddf8cSespie #ifdef XML_MIN_SIZE
7481a3ddf8cSespie
7491a3ddf8cSespie static int PTRFASTCALL
75028ce3119Sbluhm little2_byteType(const ENCODING *enc, const char *p) {
7511a3ddf8cSespie return LITTLE2_BYTE_TYPE(enc, p);
7521a3ddf8cSespie }
7531a3ddf8cSespie
7541a3ddf8cSespie static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)75528ce3119Sbluhm little2_byteToAscii(const ENCODING *enc, const char *p) {
75628ce3119Sbluhm UNUSED_P(enc);
75728ce3119Sbluhm return LITTLE2_BYTE_TO_ASCII(p);
7581a3ddf8cSespie }
7591a3ddf8cSespie
7601a3ddf8cSespie static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)76128ce3119Sbluhm little2_charMatches(const ENCODING *enc, const char *p, int c) {
76228ce3119Sbluhm UNUSED_P(enc);
76328ce3119Sbluhm return LITTLE2_CHAR_MATCHES(p, c);
7641a3ddf8cSespie }
7651a3ddf8cSespie
7661a3ddf8cSespie static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)76728ce3119Sbluhm little2_isNameMin(const ENCODING *enc, const char *p) {
76828ce3119Sbluhm UNUSED_P(enc);
76928ce3119Sbluhm return LITTLE2_IS_NAME_CHAR_MINBPC(p);
7701a3ddf8cSespie }
7711a3ddf8cSespie
7721a3ddf8cSespie static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)77328ce3119Sbluhm little2_isNmstrtMin(const ENCODING *enc, const char *p) {
77428ce3119Sbluhm UNUSED_P(enc);
77528ce3119Sbluhm return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
7761a3ddf8cSespie }
7771a3ddf8cSespie
7781a3ddf8cSespie # undef VTABLE
7791a3ddf8cSespie # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
7801a3ddf8cSespie
7811a3ddf8cSespie #else /* not XML_MIN_SIZE */
7821a3ddf8cSespie
7831a3ddf8cSespie # undef PREFIX
7841a3ddf8cSespie # define PREFIX(ident) little2_##ident
7851a3ddf8cSespie # define MINBPC(enc) 2
7861a3ddf8cSespie /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
7871a3ddf8cSespie # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
78828ce3119Sbluhm # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
78928ce3119Sbluhm # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
7901a3ddf8cSespie # define IS_NAME_CHAR(enc, p, n) 0
79128ce3119Sbluhm # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
7921a3ddf8cSespie # define IS_NMSTRT_CHAR(enc, p, n) (0)
79328ce3119Sbluhm # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
7941a3ddf8cSespie
795b26ab0f8Smatthieu # define XML_TOK_IMPL_C
7961a3ddf8cSespie # include "xmltok_impl.c"
797b26ab0f8Smatthieu # undef XML_TOK_IMPL_C
7981a3ddf8cSespie
7991a3ddf8cSespie # undef MINBPC
8001a3ddf8cSespie # undef BYTE_TYPE
8011a3ddf8cSespie # undef BYTE_TO_ASCII
8021a3ddf8cSespie # undef CHAR_MATCHES
8031a3ddf8cSespie # undef IS_NAME_CHAR
8041a3ddf8cSespie # undef IS_NAME_CHAR_MINBPC
8051a3ddf8cSespie # undef IS_NMSTRT_CHAR
8061a3ddf8cSespie # undef IS_NMSTRT_CHAR_MINBPC
8071a3ddf8cSespie # undef IS_INVALID_CHAR
8081a3ddf8cSespie
8091a3ddf8cSespie #endif /* not XML_MIN_SIZE */
8101a3ddf8cSespie
8111a3ddf8cSespie #ifdef XML_NS
8121a3ddf8cSespie
81328ce3119Sbluhm static const struct normal_encoding little2_encoding_ns
81428ce3119Sbluhm = {{VTABLE, 2, 0,
8151a3ddf8cSespie # if BYTEORDER == 1234
8161a3ddf8cSespie 1
8171a3ddf8cSespie # else
8181a3ddf8cSespie 0
8191a3ddf8cSespie # endif
8201a3ddf8cSespie },
8211a3ddf8cSespie {
8221a3ddf8cSespie # include "asciitab.h"
8231a3ddf8cSespie # include "latin1tab.h"
8241a3ddf8cSespie },
82528ce3119Sbluhm STANDARD_VTABLE(little2_) NULL_VTABLE};
8261a3ddf8cSespie
8271a3ddf8cSespie #endif
8281a3ddf8cSespie
82928ce3119Sbluhm static const struct normal_encoding little2_encoding
83028ce3119Sbluhm = {{VTABLE, 2, 0,
8311a3ddf8cSespie #if BYTEORDER == 1234
8321a3ddf8cSespie 1
8331a3ddf8cSespie #else
8341a3ddf8cSespie 0
8351a3ddf8cSespie #endif
8361a3ddf8cSespie },
8371a3ddf8cSespie {
8381a3ddf8cSespie #define BT_COLON BT_NMSTRT
8391a3ddf8cSespie #include "asciitab.h"
8401a3ddf8cSespie #undef BT_COLON
8411a3ddf8cSespie #include "latin1tab.h"
8421a3ddf8cSespie },
84328ce3119Sbluhm STANDARD_VTABLE(little2_) NULL_VTABLE};
8441a3ddf8cSespie
8451a3ddf8cSespie #if BYTEORDER != 4321
8461a3ddf8cSespie
8471a3ddf8cSespie # ifdef XML_NS
8481a3ddf8cSespie
84928ce3119Sbluhm static const struct normal_encoding internal_little2_encoding_ns
85028ce3119Sbluhm = {{VTABLE, 2, 0, 1},
8511a3ddf8cSespie {
8521a3ddf8cSespie # include "iasciitab.h"
8531a3ddf8cSespie # include "latin1tab.h"
8541a3ddf8cSespie },
85528ce3119Sbluhm STANDARD_VTABLE(little2_) NULL_VTABLE};
8561a3ddf8cSespie
8571a3ddf8cSespie # endif
8581a3ddf8cSespie
85928ce3119Sbluhm static const struct normal_encoding internal_little2_encoding
86028ce3119Sbluhm = {{VTABLE, 2, 0, 1},
8611a3ddf8cSespie {
8621a3ddf8cSespie # define BT_COLON BT_NMSTRT
8631a3ddf8cSespie # include "iasciitab.h"
8641a3ddf8cSespie # undef BT_COLON
8651a3ddf8cSespie # include "latin1tab.h"
8661a3ddf8cSespie },
86728ce3119Sbluhm STANDARD_VTABLE(little2_) NULL_VTABLE};
8681a3ddf8cSespie
8691a3ddf8cSespie #endif
8701a3ddf8cSespie
/* Character classification helpers for big-endian two-byte input, where
   p[0] is the high byte and p[1] the low byte of the code unit at p.

   BIG2_BYTE_TYPE     - byte type: single-byte table lookup on the low byte
                        when the high byte is 0, otherwise
                        unicode_byte_type(hi, lo).
   BIG2_BYTE_TO_ASCII - the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES  - true when the unit equals the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                      - naming-table lookup with (hi, lo).
   Macro arguments are parenthesized so that any pointer expression may be
   passed for p. */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1) \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])

8801a3ddf8cSespie #ifdef XML_MIN_SIZE
8811a3ddf8cSespie
8821a3ddf8cSespie static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)88328ce3119Sbluhm big2_byteType(const ENCODING *enc, const char *p) {
8841a3ddf8cSespie return BIG2_BYTE_TYPE(enc, p);
8851a3ddf8cSespie }
8861a3ddf8cSespie
8871a3ddf8cSespie static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)88828ce3119Sbluhm big2_byteToAscii(const ENCODING *enc, const char *p) {
88928ce3119Sbluhm UNUSED_P(enc);
89028ce3119Sbluhm return BIG2_BYTE_TO_ASCII(p);
8911a3ddf8cSespie }
8921a3ddf8cSespie
8931a3ddf8cSespie static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)89428ce3119Sbluhm big2_charMatches(const ENCODING *enc, const char *p, int c) {
89528ce3119Sbluhm UNUSED_P(enc);
89628ce3119Sbluhm return BIG2_CHAR_MATCHES(p, c);
8971a3ddf8cSespie }
8981a3ddf8cSespie
8991a3ddf8cSespie static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)90028ce3119Sbluhm big2_isNameMin(const ENCODING *enc, const char *p) {
90128ce3119Sbluhm UNUSED_P(enc);
90228ce3119Sbluhm return BIG2_IS_NAME_CHAR_MINBPC(p);
9031a3ddf8cSespie }
9041a3ddf8cSespie
9051a3ddf8cSespie static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)90628ce3119Sbluhm big2_isNmstrtMin(const ENCODING *enc, const char *p) {
90728ce3119Sbluhm UNUSED_P(enc);
90828ce3119Sbluhm return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
9091a3ddf8cSespie }
9101a3ddf8cSespie
9111a3ddf8cSespie # undef VTABLE
9121a3ddf8cSespie # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
9131a3ddf8cSespie
9141a3ddf8cSespie #else /* not XML_MIN_SIZE */
9151a3ddf8cSespie
9161a3ddf8cSespie # undef PREFIX
9171a3ddf8cSespie # define PREFIX(ident) big2_##ident
9181a3ddf8cSespie # define MINBPC(enc) 2
9191a3ddf8cSespie /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
9201a3ddf8cSespie # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
92128ce3119Sbluhm # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
92228ce3119Sbluhm # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
9231a3ddf8cSespie # define IS_NAME_CHAR(enc, p, n) 0
92428ce3119Sbluhm # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
9251a3ddf8cSespie # define IS_NMSTRT_CHAR(enc, p, n) (0)
92628ce3119Sbluhm # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
9271a3ddf8cSespie
928b26ab0f8Smatthieu # define XML_TOK_IMPL_C
9291a3ddf8cSespie # include "xmltok_impl.c"
930b26ab0f8Smatthieu # undef XML_TOK_IMPL_C
9311a3ddf8cSespie
9321a3ddf8cSespie # undef MINBPC
9331a3ddf8cSespie # undef BYTE_TYPE
9341a3ddf8cSespie # undef BYTE_TO_ASCII
9351a3ddf8cSespie # undef CHAR_MATCHES
9361a3ddf8cSespie # undef IS_NAME_CHAR
9371a3ddf8cSespie # undef IS_NAME_CHAR_MINBPC
9381a3ddf8cSespie # undef IS_NMSTRT_CHAR
9391a3ddf8cSespie # undef IS_NMSTRT_CHAR_MINBPC
9401a3ddf8cSespie # undef IS_INVALID_CHAR
9411a3ddf8cSespie
9421a3ddf8cSespie #endif /* not XML_MIN_SIZE */
9431a3ddf8cSespie
9441a3ddf8cSespie #ifdef XML_NS
9451a3ddf8cSespie
94628ce3119Sbluhm static const struct normal_encoding big2_encoding_ns
94728ce3119Sbluhm = {{VTABLE, 2, 0,
9481a3ddf8cSespie # if BYTEORDER == 4321
9491a3ddf8cSespie 1
9501a3ddf8cSespie # else
9511a3ddf8cSespie 0
9521a3ddf8cSespie # endif
9531a3ddf8cSespie },
9541a3ddf8cSespie {
9551a3ddf8cSespie # include "asciitab.h"
9561a3ddf8cSespie # include "latin1tab.h"
9571a3ddf8cSespie },
95828ce3119Sbluhm STANDARD_VTABLE(big2_) NULL_VTABLE};
9591a3ddf8cSespie
9601a3ddf8cSespie #endif
9611a3ddf8cSespie
96228ce3119Sbluhm static const struct normal_encoding big2_encoding
96328ce3119Sbluhm = {{VTABLE, 2, 0,
9641a3ddf8cSespie #if BYTEORDER == 4321
9651a3ddf8cSespie 1
9661a3ddf8cSespie #else
9671a3ddf8cSespie 0
9681a3ddf8cSespie #endif
9691a3ddf8cSespie },
9701a3ddf8cSespie {
9711a3ddf8cSespie #define BT_COLON BT_NMSTRT
9721a3ddf8cSespie #include "asciitab.h"
9731a3ddf8cSespie #undef BT_COLON
9741a3ddf8cSespie #include "latin1tab.h"
9751a3ddf8cSespie },
97628ce3119Sbluhm STANDARD_VTABLE(big2_) NULL_VTABLE};
9771a3ddf8cSespie
9781a3ddf8cSespie #if BYTEORDER != 1234
9791a3ddf8cSespie
9801a3ddf8cSespie # ifdef XML_NS
9811a3ddf8cSespie
98228ce3119Sbluhm static const struct normal_encoding internal_big2_encoding_ns
98328ce3119Sbluhm = {{VTABLE, 2, 0, 1},
9841a3ddf8cSespie {
9851a3ddf8cSespie # include "iasciitab.h"
9861a3ddf8cSespie # include "latin1tab.h"
9871a3ddf8cSespie },
98828ce3119Sbluhm STANDARD_VTABLE(big2_) NULL_VTABLE};
9891a3ddf8cSespie
9901a3ddf8cSespie # endif
9911a3ddf8cSespie
99228ce3119Sbluhm static const struct normal_encoding internal_big2_encoding
99328ce3119Sbluhm = {{VTABLE, 2, 0, 1},
9941a3ddf8cSespie {
9951a3ddf8cSespie # define BT_COLON BT_NMSTRT
9961a3ddf8cSespie # include "iasciitab.h"
9971a3ddf8cSespie # undef BT_COLON
9981a3ddf8cSespie # include "latin1tab.h"
9991a3ddf8cSespie },
100028ce3119Sbluhm STANDARD_VTABLE(big2_) NULL_VTABLE};
10011a3ddf8cSespie
10021a3ddf8cSespie #endif
10031a3ddf8cSespie
10041a3ddf8cSespie #undef PREFIX
10051a3ddf8cSespie
10061a3ddf8cSespie static int FASTCALL
streqci(const char * s1,const char * s2)100728ce3119Sbluhm streqci(const char *s1, const char *s2) {
10081a3ddf8cSespie for (;;) {
10091a3ddf8cSespie char c1 = *s1++;
10101a3ddf8cSespie char c2 = *s2++;
10111a3ddf8cSespie if (ASCII_a <= c1 && c1 <= ASCII_z)
10121a3ddf8cSespie c1 += ASCII_A - ASCII_a;
10131a3ddf8cSespie if (ASCII_a <= c2 && c2 <= ASCII_z)
10145837d4fcSbluhm /* The following line will never get executed. streqci() is
10155837d4fcSbluhm * only called from two places, both of which guarantee to put
10165837d4fcSbluhm * upper-case strings into s2.
10175837d4fcSbluhm */
10185837d4fcSbluhm c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
10191a3ddf8cSespie if (c1 != c2)
10201a3ddf8cSespie return 0;
10211a3ddf8cSespie if (! c1)
10221a3ddf8cSespie break;
10231a3ddf8cSespie }
10241a3ddf8cSespie return 1;
10251a3ddf8cSespie }
10261a3ddf8cSespie
10271a3ddf8cSespie static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)102828ce3119Sbluhm initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
102928ce3119Sbluhm POSITION *pos) {
103028ce3119Sbluhm UNUSED_P(enc);
10311a3ddf8cSespie normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
10321a3ddf8cSespie }
10331a3ddf8cSespie
10341a3ddf8cSespie static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)103528ce3119Sbluhm toAscii(const ENCODING *enc, const char *ptr, const char *end) {
10361a3ddf8cSespie char buf[1];
10371a3ddf8cSespie char *p = buf;
10381a3ddf8cSespie XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
10391a3ddf8cSespie if (p == buf)
10401a3ddf8cSespie return -1;
10411a3ddf8cSespie else
10421a3ddf8cSespie return buf[0];
10431a3ddf8cSespie }
10441a3ddf8cSespie
/* Return 1 when c is one of space (0x20), carriage return (0xD),
   line feed (0xA) or tab (0x9); return 0 for any other value. */
static int FASTCALL
isSpace(int c) {
  switch (c) {
  case 0x20:
  case 0xD:
  case 0xA:
  case 0x9:
    return 1;
  }
  return 0;
}

10571a3ddf8cSespie /* Return 1 if there's just optional white space or there's an S
10581a3ddf8cSespie followed by name=val.
10591a3ddf8cSespie */
10601a3ddf8cSespie static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)106128ce3119Sbluhm parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
106228ce3119Sbluhm const char **namePtr, const char **nameEndPtr,
106328ce3119Sbluhm const char **valPtr, const char **nextTokPtr) {
10641a3ddf8cSespie int c;
10651a3ddf8cSespie char open;
10661a3ddf8cSespie if (ptr == end) {
10671a3ddf8cSespie *namePtr = NULL;
10681a3ddf8cSespie return 1;
10691a3ddf8cSespie }
10701a3ddf8cSespie if (! isSpace(toAscii(enc, ptr, end))) {
10711a3ddf8cSespie *nextTokPtr = ptr;
10721a3ddf8cSespie return 0;
10731a3ddf8cSespie }
10741a3ddf8cSespie do {
10751a3ddf8cSespie ptr += enc->minBytesPerChar;
10761a3ddf8cSespie } while (isSpace(toAscii(enc, ptr, end)));
10771a3ddf8cSespie if (ptr == end) {
10781a3ddf8cSespie *namePtr = NULL;
10791a3ddf8cSespie return 1;
10801a3ddf8cSespie }
10811a3ddf8cSespie *namePtr = ptr;
10821a3ddf8cSespie for (;;) {
10831a3ddf8cSespie c = toAscii(enc, ptr, end);
10841a3ddf8cSespie if (c == -1) {
10851a3ddf8cSespie *nextTokPtr = ptr;
10861a3ddf8cSespie return 0;
10871a3ddf8cSespie }
10881a3ddf8cSespie if (c == ASCII_EQUALS) {
10891a3ddf8cSespie *nameEndPtr = ptr;
10901a3ddf8cSespie break;
10911a3ddf8cSespie }
10921a3ddf8cSespie if (isSpace(c)) {
10931a3ddf8cSespie *nameEndPtr = ptr;
10941a3ddf8cSespie do {
10951a3ddf8cSespie ptr += enc->minBytesPerChar;
10961a3ddf8cSespie } while (isSpace(c = toAscii(enc, ptr, end)));
10971a3ddf8cSespie if (c != ASCII_EQUALS) {
10981a3ddf8cSespie *nextTokPtr = ptr;
10991a3ddf8cSespie return 0;
11001a3ddf8cSespie }
11011a3ddf8cSespie break;
11021a3ddf8cSespie }
11031a3ddf8cSespie ptr += enc->minBytesPerChar;
11041a3ddf8cSespie }
11051a3ddf8cSespie if (ptr == *namePtr) {
11061a3ddf8cSespie *nextTokPtr = ptr;
11071a3ddf8cSespie return 0;
11081a3ddf8cSespie }
11091a3ddf8cSespie ptr += enc->minBytesPerChar;
11101a3ddf8cSespie c = toAscii(enc, ptr, end);
11111a3ddf8cSespie while (isSpace(c)) {
11121a3ddf8cSespie ptr += enc->minBytesPerChar;
11131a3ddf8cSespie c = toAscii(enc, ptr, end);
11141a3ddf8cSespie }
11151a3ddf8cSespie if (c != ASCII_QUOT && c != ASCII_APOS) {
11161a3ddf8cSespie *nextTokPtr = ptr;
11171a3ddf8cSespie return 0;
11181a3ddf8cSespie }
11191a3ddf8cSespie open = (char)c;
11201a3ddf8cSespie ptr += enc->minBytesPerChar;
11211a3ddf8cSespie *valPtr = ptr;
11221a3ddf8cSespie for (;; ptr += enc->minBytesPerChar) {
11231a3ddf8cSespie c = toAscii(enc, ptr, end);
11241a3ddf8cSespie if (c == open)
11251a3ddf8cSespie break;
112628ce3119Sbluhm if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
112728ce3119Sbluhm && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
112828ce3119Sbluhm && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
11291a3ddf8cSespie *nextTokPtr = ptr;
11301a3ddf8cSespie return 0;
11311a3ddf8cSespie }
11321a3ddf8cSespie }
11331a3ddf8cSespie *nextTokPtr = ptr + enc->minBytesPerChar;
11341a3ddf8cSespie return 1;
11351a3ddf8cSespie }
11361a3ddf8cSespie
113728ce3119Sbluhm static const char KW_version[]
113828ce3119Sbluhm = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
11391a3ddf8cSespie
114028ce3119Sbluhm static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
114128ce3119Sbluhm ASCII_i, ASCII_n, ASCII_g, '\0'};
11421a3ddf8cSespie
114328ce3119Sbluhm static const char KW_standalone[]
114428ce3119Sbluhm = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
114528ce3119Sbluhm ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
11461a3ddf8cSespie
114728ce3119Sbluhm static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
11481a3ddf8cSespie
114928ce3119Sbluhm static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
11501a3ddf8cSespie
11511a3ddf8cSespie static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)115228ce3119Sbluhm doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
11531a3ddf8cSespie const char *),
115428ce3119Sbluhm int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
115528ce3119Sbluhm const char *end, const char **badPtr, const char **versionPtr,
115628ce3119Sbluhm const char **versionEndPtr, const char **encodingName,
115728ce3119Sbluhm const ENCODING **encoding, int *standalone) {
11581a3ddf8cSespie const char *val = NULL;
11591a3ddf8cSespie const char *name = NULL;
11601a3ddf8cSespie const char *nameEnd = NULL;
11611a3ddf8cSespie ptr += 5 * enc->minBytesPerChar;
11621a3ddf8cSespie end -= 2 * enc->minBytesPerChar;
11631a3ddf8cSespie if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
11641a3ddf8cSespie || ! name) {
11651a3ddf8cSespie *badPtr = ptr;
11661a3ddf8cSespie return 0;
11671a3ddf8cSespie }
11681a3ddf8cSespie if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
11691a3ddf8cSespie if (! isGeneralTextEntity) {
11701a3ddf8cSespie *badPtr = name;
11711a3ddf8cSespie return 0;
11721a3ddf8cSespie }
117328ce3119Sbluhm } else {
11741a3ddf8cSespie if (versionPtr)
11751a3ddf8cSespie *versionPtr = val;
11761a3ddf8cSespie if (versionEndPtr)
11771a3ddf8cSespie *versionEndPtr = ptr;
11781a3ddf8cSespie if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
11791a3ddf8cSespie *badPtr = ptr;
11801a3ddf8cSespie return 0;
11811a3ddf8cSespie }
11821a3ddf8cSespie if (! name) {
11831a3ddf8cSespie if (isGeneralTextEntity) {
11841a3ddf8cSespie /* a TextDecl must have an EncodingDecl */
11851a3ddf8cSespie *badPtr = ptr;
11861a3ddf8cSespie return 0;
11871a3ddf8cSespie }
11881a3ddf8cSespie return 1;
11891a3ddf8cSespie }
11901a3ddf8cSespie }
11911a3ddf8cSespie if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
11921a3ddf8cSespie int c = toAscii(enc, val, end);
11931a3ddf8cSespie if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
11941a3ddf8cSespie *badPtr = val;
11951a3ddf8cSespie return 0;
11961a3ddf8cSespie }
11971a3ddf8cSespie if (encodingName)
11981a3ddf8cSespie *encodingName = val;
11991a3ddf8cSespie if (encoding)
12001a3ddf8cSespie *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
12011a3ddf8cSespie if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
12021a3ddf8cSespie *badPtr = ptr;
12031a3ddf8cSespie return 0;
12041a3ddf8cSespie }
12051a3ddf8cSespie if (! name)
12061a3ddf8cSespie return 1;
12071a3ddf8cSespie }
12081a3ddf8cSespie if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
12091a3ddf8cSespie || isGeneralTextEntity) {
12101a3ddf8cSespie *badPtr = name;
12111a3ddf8cSespie return 0;
12121a3ddf8cSespie }
12131a3ddf8cSespie if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
12141a3ddf8cSespie if (standalone)
12151a3ddf8cSespie *standalone = 1;
121628ce3119Sbluhm } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
12171a3ddf8cSespie if (standalone)
12181a3ddf8cSespie *standalone = 0;
121928ce3119Sbluhm } else {
12201a3ddf8cSespie *badPtr = val;
12211a3ddf8cSespie return 0;
12221a3ddf8cSespie }
12231a3ddf8cSespie while (isSpace(toAscii(enc, ptr, end)))
12241a3ddf8cSespie ptr += enc->minBytesPerChar;
12251a3ddf8cSespie if (ptr != end) {
12261a3ddf8cSespie *badPtr = ptr;
12271a3ddf8cSespie return 0;
12281a3ddf8cSespie }
12291a3ddf8cSespie return 1;
12301a3ddf8cSespie }
12311a3ddf8cSespie
12321a3ddf8cSespie static int FASTCALL
checkCharRefNumber(int result)123328ce3119Sbluhm checkCharRefNumber(int result) {
12341a3ddf8cSespie switch (result >> 8) {
123528ce3119Sbluhm case 0xD8:
123628ce3119Sbluhm case 0xD9:
123728ce3119Sbluhm case 0xDA:
123828ce3119Sbluhm case 0xDB:
123928ce3119Sbluhm case 0xDC:
124028ce3119Sbluhm case 0xDD:
124128ce3119Sbluhm case 0xDE:
124228ce3119Sbluhm case 0xDF:
12431a3ddf8cSespie return -1;
12441a3ddf8cSespie case 0:
12451a3ddf8cSespie if (latin1_encoding.type[result] == BT_NONXML)
12461a3ddf8cSespie return -1;
12471a3ddf8cSespie break;
12481a3ddf8cSespie case 0xFF:
12491a3ddf8cSespie if (result == 0xFFFE || result == 0xFFFF)
12501a3ddf8cSespie return -1;
12511a3ddf8cSespie break;
12521a3ddf8cSespie }
12531a3ddf8cSespie return result;
12541a3ddf8cSespie }
12551a3ddf8cSespie
12561a3ddf8cSespie int FASTCALL
XmlUtf8Encode(int c,char * buf)125728ce3119Sbluhm XmlUtf8Encode(int c, char *buf) {
12581a3ddf8cSespie enum {
12591a3ddf8cSespie /* minN is minimum legal resulting value for N byte sequence */
12601a3ddf8cSespie min2 = 0x80,
12611a3ddf8cSespie min3 = 0x800,
12621a3ddf8cSespie min4 = 0x10000
12631a3ddf8cSespie };
12641a3ddf8cSespie
12651a3ddf8cSespie if (c < 0)
12665837d4fcSbluhm return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
12671a3ddf8cSespie if (c < min2) {
12681a3ddf8cSespie buf[0] = (char)(c | UTF8_cval1);
12691a3ddf8cSespie return 1;
12701a3ddf8cSespie }
12711a3ddf8cSespie if (c < min3) {
12721a3ddf8cSespie buf[0] = (char)((c >> 6) | UTF8_cval2);
12731a3ddf8cSespie buf[1] = (char)((c & 0x3f) | 0x80);
12741a3ddf8cSespie return 2;
12751a3ddf8cSespie }
12761a3ddf8cSespie if (c < min4) {
12771a3ddf8cSespie buf[0] = (char)((c >> 12) | UTF8_cval3);
12781a3ddf8cSespie buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
12791a3ddf8cSespie buf[2] = (char)((c & 0x3f) | 0x80);
12801a3ddf8cSespie return 3;
12811a3ddf8cSespie }
12821a3ddf8cSespie if (c < 0x110000) {
12831a3ddf8cSespie buf[0] = (char)((c >> 18) | UTF8_cval4);
12841a3ddf8cSespie buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
12851a3ddf8cSespie buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
12861a3ddf8cSespie buf[3] = (char)((c & 0x3f) | 0x80);
12871a3ddf8cSespie return 4;
12881a3ddf8cSespie }
12895837d4fcSbluhm return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
12901a3ddf8cSespie }
12911a3ddf8cSespie
/* Encode the character number charNum as UTF-16 into buf.

   Values below 0x10000 are stored as a single unit; values from 0x10000 up
   to 0x10FFFF are stored as a surrogate pair (lead 0xD800-based, trail
   0xDC00-based).  Returns the number of units written (1 or 2), or 0 when
   charNum is negative or not below 0x110000, in which case buf is left
   untouched.  buf must have room for two units. */
int FASTCALL
XmlUtf16Encode(int charNum, unsigned short *buf) {
  if (charNum < 0)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  if (charNum < 0x110000) {
    charNum -= 0x10000;
    buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
    buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
    return 2;
  }
  return 0;
}

13091a3ddf8cSespie struct unknown_encoding {
13101a3ddf8cSespie struct normal_encoding normal;
13117d36914fSalek CONVERTER convert;
13121a3ddf8cSespie void *userData;
13131a3ddf8cSespie unsigned short utf16[256];
13141a3ddf8cSespie char utf8[256][4];
13151a3ddf8cSespie };
13161a3ddf8cSespie
13171a3ddf8cSespie #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
13181a3ddf8cSespie
13191a3ddf8cSespie int
XmlSizeOfUnknownEncoding(void)132028ce3119Sbluhm XmlSizeOfUnknownEncoding(void) {
13211a3ddf8cSespie return sizeof(struct unknown_encoding);
13221a3ddf8cSespie }
13231a3ddf8cSespie
13241a3ddf8cSespie static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)132528ce3119Sbluhm unknown_isName(const ENCODING *enc, const char *p) {
13261a3ddf8cSespie const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13271a3ddf8cSespie int c = uenc->convert(uenc->userData, p);
13281a3ddf8cSespie if (c & ~0xFFFF)
13291a3ddf8cSespie return 0;
13301a3ddf8cSespie return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
13311a3ddf8cSespie }
13321a3ddf8cSespie
13331a3ddf8cSespie static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)133428ce3119Sbluhm unknown_isNmstrt(const ENCODING *enc, const char *p) {
13351a3ddf8cSespie const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13361a3ddf8cSespie int c = uenc->convert(uenc->userData, p);
13371a3ddf8cSespie if (c & ~0xFFFF)
13381a3ddf8cSespie return 0;
13391a3ddf8cSespie return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
13401a3ddf8cSespie }
13411a3ddf8cSespie
13421a3ddf8cSespie static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)134328ce3119Sbluhm unknown_isInvalid(const ENCODING *enc, const char *p) {
13441a3ddf8cSespie const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13451a3ddf8cSespie int c = uenc->convert(uenc->userData, p);
13461a3ddf8cSespie return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
13471a3ddf8cSespie }
13481a3ddf8cSespie
1349525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)135028ce3119Sbluhm unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
135128ce3119Sbluhm char **toP, const char *toLim) {
13521a3ddf8cSespie const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13531a3ddf8cSespie char buf[XML_UTF8_ENCODE_MAX];
13541a3ddf8cSespie for (;;) {
13551a3ddf8cSespie const char *utf8;
13561a3ddf8cSespie int n;
13571a3ddf8cSespie if (*fromP == fromLim)
1358525cdfc7Srpointel return XML_CONVERT_COMPLETED;
13591a3ddf8cSespie utf8 = uenc->utf8[(unsigned char)**fromP];
13601a3ddf8cSespie n = *utf8++;
13611a3ddf8cSespie if (n == 0) {
13621a3ddf8cSespie int c = uenc->convert(uenc->userData, *fromP);
13631a3ddf8cSespie n = XmlUtf8Encode(c, buf);
13641a3ddf8cSespie if (n > toLim - *toP)
1365525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
13661a3ddf8cSespie utf8 = buf;
13671a3ddf8cSespie *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
13681a3ddf8cSespie - (BT_LEAD2 - 2));
136928ce3119Sbluhm } else {
13701a3ddf8cSespie if (n > toLim - *toP)
1371525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
13721a3ddf8cSespie (*fromP)++;
13731a3ddf8cSespie }
13749b8e2351Sbluhm memcpy(*toP, utf8, n);
13759b8e2351Sbluhm *toP += n;
13761a3ddf8cSespie }
13771a3ddf8cSespie }
13781a3ddf8cSespie
1379525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)138028ce3119Sbluhm unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
138128ce3119Sbluhm unsigned short **toP, const unsigned short *toLim) {
13821a3ddf8cSespie const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383525cdfc7Srpointel while (*fromP < fromLim && *toP < toLim) {
13841a3ddf8cSespie unsigned short c = uenc->utf16[(unsigned char)**fromP];
13851a3ddf8cSespie if (c == 0) {
138628ce3119Sbluhm c = (unsigned short)uenc->convert(uenc->userData, *fromP);
13871a3ddf8cSespie *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
13881a3ddf8cSespie - (BT_LEAD2 - 2));
138928ce3119Sbluhm } else
13901a3ddf8cSespie (*fromP)++;
13911a3ddf8cSespie *(*toP)++ = c;
13921a3ddf8cSespie }
1393525cdfc7Srpointel
1394525cdfc7Srpointel if ((*toP == toLim) && (*fromP < fromLim))
1395525cdfc7Srpointel return XML_CONVERT_OUTPUT_EXHAUSTED;
1396525cdfc7Srpointel else
1397525cdfc7Srpointel return XML_CONVERT_COMPLETED;
13981a3ddf8cSespie }
13991a3ddf8cSespie
14001a3ddf8cSespie ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)140128ce3119Sbluhm XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
140228ce3119Sbluhm void *userData) {
14031a3ddf8cSespie int i;
14041a3ddf8cSespie struct unknown_encoding *e = (struct unknown_encoding *)mem;
140528ce3119Sbluhm memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
14061a3ddf8cSespie for (i = 0; i < 128; i++)
14071a3ddf8cSespie if (latin1_encoding.type[i] != BT_OTHER
140828ce3119Sbluhm && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
14091a3ddf8cSespie return 0;
14101a3ddf8cSespie for (i = 0; i < 256; i++) {
14111a3ddf8cSespie int c = table[i];
14121a3ddf8cSespie if (c == -1) {
14131a3ddf8cSespie e->normal.type[i] = BT_MALFORM;
14141a3ddf8cSespie /* This shouldn't really get used. */
14151a3ddf8cSespie e->utf16[i] = 0xFFFF;
14161a3ddf8cSespie e->utf8[i][0] = 1;
14171a3ddf8cSespie e->utf8[i][1] = 0;
141828ce3119Sbluhm } else if (c < 0) {
14191a3ddf8cSespie if (c < -4)
14201a3ddf8cSespie return 0;
14215837d4fcSbluhm /* Multi-byte sequences need a converter function */
14225837d4fcSbluhm if (! convert)
14235837d4fcSbluhm return 0;
14241a3ddf8cSespie e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
14251a3ddf8cSespie e->utf8[i][0] = 0;
14261a3ddf8cSespie e->utf16[i] = 0;
142728ce3119Sbluhm } else if (c < 0x80) {
14281a3ddf8cSespie if (latin1_encoding.type[c] != BT_OTHER
142928ce3119Sbluhm && latin1_encoding.type[c] != BT_NONXML && c != i)
14301a3ddf8cSespie return 0;
14311a3ddf8cSespie e->normal.type[i] = latin1_encoding.type[c];
14321a3ddf8cSespie e->utf8[i][0] = 1;
14331a3ddf8cSespie e->utf8[i][1] = (char)c;
14341a3ddf8cSespie e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
143528ce3119Sbluhm } else if (checkCharRefNumber(c) < 0) {
14361a3ddf8cSespie e->normal.type[i] = BT_NONXML;
14371a3ddf8cSespie /* This shouldn't really get used. */
14381a3ddf8cSespie e->utf16[i] = 0xFFFF;
14391a3ddf8cSespie e->utf8[i][0] = 1;
14401a3ddf8cSespie e->utf8[i][1] = 0;
144128ce3119Sbluhm } else {
14421a3ddf8cSespie if (c > 0xFFFF)
14431a3ddf8cSespie return 0;
14441a3ddf8cSespie if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
14451a3ddf8cSespie e->normal.type[i] = BT_NMSTRT;
14461a3ddf8cSespie else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
14471a3ddf8cSespie e->normal.type[i] = BT_NAME;
14481a3ddf8cSespie else
14491a3ddf8cSespie e->normal.type[i] = BT_OTHER;
14501a3ddf8cSespie e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
14511a3ddf8cSespie e->utf16[i] = (unsigned short)c;
14521a3ddf8cSespie }
14531a3ddf8cSespie }
14541a3ddf8cSespie e->userData = userData;
14551a3ddf8cSespie e->convert = convert;
14561a3ddf8cSespie if (convert) {
14571a3ddf8cSespie e->normal.isName2 = unknown_isName;
14581a3ddf8cSespie e->normal.isName3 = unknown_isName;
14591a3ddf8cSespie e->normal.isName4 = unknown_isName;
14601a3ddf8cSespie e->normal.isNmstrt2 = unknown_isNmstrt;
14611a3ddf8cSespie e->normal.isNmstrt3 = unknown_isNmstrt;
14621a3ddf8cSespie e->normal.isNmstrt4 = unknown_isNmstrt;
14631a3ddf8cSespie e->normal.isInvalid2 = unknown_isInvalid;
14641a3ddf8cSespie e->normal.isInvalid3 = unknown_isInvalid;
14651a3ddf8cSespie e->normal.isInvalid4 = unknown_isInvalid;
14661a3ddf8cSespie }
14671a3ddf8cSespie e->normal.enc.utf8Convert = unknown_toUtf8;
14681a3ddf8cSespie e->normal.enc.utf16Convert = unknown_toUtf16;
14691a3ddf8cSespie return &(e->normal.enc);
14701a3ddf8cSespie }
14711a3ddf8cSespie
/* Indices of the built-in encodings.  Keep this enumeration in step with
   getEncodingIndex and encodings: changing one requires changing the
   others. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* everything above must match the order of encodingNames */
  NO_ENC = 6
};
14851a3ddf8cSespie
148628ce3119Sbluhm static const char KW_ISO_8859_1[]
148728ce3119Sbluhm = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
148828ce3119Sbluhm ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
148928ce3119Sbluhm static const char KW_US_ASCII[]
149028ce3119Sbluhm = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
149128ce3119Sbluhm ASCII_C, ASCII_I, ASCII_I, '\0'};
149228ce3119Sbluhm static const char KW_UTF_8[]
149328ce3119Sbluhm = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
149428ce3119Sbluhm static const char KW_UTF_16[]
149528ce3119Sbluhm = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
149628ce3119Sbluhm static const char KW_UTF_16BE[]
149728ce3119Sbluhm = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
149828ce3119Sbluhm ASCII_6, ASCII_B, ASCII_E, '\0'};
149928ce3119Sbluhm static const char KW_UTF_16LE[]
150028ce3119Sbluhm = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
150128ce3119Sbluhm ASCII_6, ASCII_L, ASCII_E, '\0'};
15021a3ddf8cSespie
15031a3ddf8cSespie static int FASTCALL
getEncodingIndex(const char * name)150428ce3119Sbluhm getEncodingIndex(const char *name) {
15057d36914fSalek static const char *const encodingNames[] = {
150628ce3119Sbluhm KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
15071a3ddf8cSespie };
15081a3ddf8cSespie int i;
15091a3ddf8cSespie if (name == NULL)
15101a3ddf8cSespie return NO_ENC;
15111a3ddf8cSespie for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
15121a3ddf8cSespie if (streqci(name, encodingNames[i]))
15131a3ddf8cSespie return i;
15141a3ddf8cSespie return UNKNOWN_ENC;
15151a3ddf8cSespie }
15161a3ddf8cSespie
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument is
   parenthesized so that the cast applies to the whole expression passed
   in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
15231a3ddf8cSespie
15241a3ddf8cSespie /* This is what detects the encoding. encodingTable maps from
15251a3ddf8cSespie encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
15261a3ddf8cSespie the external (protocol) specified encoding; state is
15271a3ddf8cSespie XML_CONTENT_STATE if we're parsing an external text entity, and
15281a3ddf8cSespie XML_PROLOG_STATE otherwise.
15291a3ddf8cSespie */
15301a3ddf8cSespie
15311a3ddf8cSespie static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)153228ce3119Sbluhm initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
153328ce3119Sbluhm int state, const char *ptr, const char *end, const char **nextTokPtr) {
15341a3ddf8cSespie const ENCODING **encPtr;
15351a3ddf8cSespie
1536525cdfc7Srpointel if (ptr >= end)
15371a3ddf8cSespie return XML_TOK_NONE;
15381a3ddf8cSespie encPtr = enc->encPtr;
15391a3ddf8cSespie if (ptr + 1 == end) {
15401a3ddf8cSespie /* only a single byte available for auto-detection */
15411a3ddf8cSespie #ifndef XML_DTD /* FIXME */
15421a3ddf8cSespie /* a well-formed document entity must have more than one byte */
15431a3ddf8cSespie if (state != XML_CONTENT_STATE)
15441a3ddf8cSespie return XML_TOK_PARTIAL;
15451a3ddf8cSespie #endif
15461a3ddf8cSespie /* so we're parsing an external text entity... */
15471a3ddf8cSespie /* if UTF-16 was externally specified, then we need at least 2 bytes */
15481a3ddf8cSespie switch (INIT_ENC_INDEX(enc)) {
15491a3ddf8cSespie case UTF_16_ENC:
15501a3ddf8cSespie case UTF_16LE_ENC:
15511a3ddf8cSespie case UTF_16BE_ENC:
15521a3ddf8cSespie return XML_TOK_PARTIAL;
15531a3ddf8cSespie }
15541a3ddf8cSespie switch ((unsigned char)*ptr) {
15551a3ddf8cSespie case 0xFE:
15561a3ddf8cSespie case 0xFF:
15571a3ddf8cSespie case 0xEF: /* possibly first byte of UTF-8 BOM */
155828ce3119Sbluhm if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15591a3ddf8cSespie break;
15601a3ddf8cSespie /* fall through */
15611a3ddf8cSespie case 0x00:
15621a3ddf8cSespie case 0x3C:
15631a3ddf8cSespie return XML_TOK_PARTIAL;
15641a3ddf8cSespie }
156528ce3119Sbluhm } else {
15661a3ddf8cSespie switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
15671a3ddf8cSespie case 0xFEFF:
156828ce3119Sbluhm if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15691a3ddf8cSespie break;
15701a3ddf8cSespie *nextTokPtr = ptr + 2;
15711a3ddf8cSespie *encPtr = encodingTable[UTF_16BE_ENC];
15721a3ddf8cSespie return XML_TOK_BOM;
15731a3ddf8cSespie /* 00 3C is handled in the default case */
15741a3ddf8cSespie case 0x3C00:
15751a3ddf8cSespie if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
15761a3ddf8cSespie || INIT_ENC_INDEX(enc) == UTF_16_ENC)
15771a3ddf8cSespie && state == XML_CONTENT_STATE)
15781a3ddf8cSespie break;
15791a3ddf8cSespie *encPtr = encodingTable[UTF_16LE_ENC];
15801a3ddf8cSespie return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
15811a3ddf8cSespie case 0xFFFE:
158228ce3119Sbluhm if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15831a3ddf8cSespie break;
15841a3ddf8cSespie *nextTokPtr = ptr + 2;
15851a3ddf8cSespie *encPtr = encodingTable[UTF_16LE_ENC];
15861a3ddf8cSespie return XML_TOK_BOM;
15871a3ddf8cSespie case 0xEFBB:
15881a3ddf8cSespie /* Maybe a UTF-8 BOM (EF BB BF) */
15891a3ddf8cSespie /* If there's an explicitly specified (external) encoding
15901a3ddf8cSespie of ISO-8859-1 or some flavour of UTF-16
15911a3ddf8cSespie and this is an external text entity,
15921a3ddf8cSespie don't look for the BOM,
15931a3ddf8cSespie because it might be a legal data.
15941a3ddf8cSespie */
15951a3ddf8cSespie if (state == XML_CONTENT_STATE) {
15961a3ddf8cSespie int e = INIT_ENC_INDEX(enc);
159728ce3119Sbluhm if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
159828ce3119Sbluhm || e == UTF_16_ENC)
15991a3ddf8cSespie break;
16001a3ddf8cSespie }
16011a3ddf8cSespie if (ptr + 2 == end)
16021a3ddf8cSespie return XML_TOK_PARTIAL;
16031a3ddf8cSespie if ((unsigned char)ptr[2] == 0xBF) {
16041a3ddf8cSespie *nextTokPtr = ptr + 3;
16051a3ddf8cSespie *encPtr = encodingTable[UTF_8_ENC];
16061a3ddf8cSespie return XML_TOK_BOM;
16071a3ddf8cSespie }
16081a3ddf8cSespie break;
16091a3ddf8cSespie default:
16101a3ddf8cSespie if (ptr[0] == '\0') {
16111a3ddf8cSespie /* 0 isn't a legal data character. Furthermore a document
16121a3ddf8cSespie entity can only start with ASCII characters. So the only
16131a3ddf8cSespie way this can fail to be big-endian UTF-16 if it it's an
16141a3ddf8cSespie external parsed general entity that's labelled as
16151a3ddf8cSespie UTF-16LE.
16161a3ddf8cSespie */
16171a3ddf8cSespie if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
16181a3ddf8cSespie break;
16191a3ddf8cSespie *encPtr = encodingTable[UTF_16BE_ENC];
16201a3ddf8cSespie return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
162128ce3119Sbluhm } else if (ptr[1] == '\0') {
16221a3ddf8cSespie /* We could recover here in the case:
16231a3ddf8cSespie - parsing an external entity
16241a3ddf8cSespie - second byte is 0
16251a3ddf8cSespie - no externally specified encoding
16261a3ddf8cSespie - no encoding declaration
16271a3ddf8cSespie by assuming UTF-16LE. But we don't, because this would mean when
16281a3ddf8cSespie presented just with a single byte, we couldn't reliably determine
16291a3ddf8cSespie whether we needed further bytes.
16301a3ddf8cSespie */
16311a3ddf8cSespie if (state == XML_CONTENT_STATE)
16321a3ddf8cSespie break;
16331a3ddf8cSespie *encPtr = encodingTable[UTF_16LE_ENC];
16341a3ddf8cSespie return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
16351a3ddf8cSespie }
16361a3ddf8cSespie break;
16371a3ddf8cSespie }
16381a3ddf8cSespie }
16391a3ddf8cSespie *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
16401a3ddf8cSespie return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
16411a3ddf8cSespie }
16421a3ddf8cSespie
16431a3ddf8cSespie #define NS(x) x
16441a3ddf8cSespie #define ns(x) x
1645b26ab0f8Smatthieu #define XML_TOK_NS_C
16461a3ddf8cSespie #include "xmltok_ns.c"
1647b26ab0f8Smatthieu #undef XML_TOK_NS_C
16481a3ddf8cSespie #undef NS
16491a3ddf8cSespie #undef ns
16501a3ddf8cSespie
16511a3ddf8cSespie #ifdef XML_NS
16521a3ddf8cSespie
16531a3ddf8cSespie # define NS(x) x##NS
16541a3ddf8cSespie # define ns(x) x##_ns
16551a3ddf8cSespie
1656b26ab0f8Smatthieu # define XML_TOK_NS_C
16571a3ddf8cSespie # include "xmltok_ns.c"
1658b26ab0f8Smatthieu # undef XML_TOK_NS_C
16591a3ddf8cSespie
16601a3ddf8cSespie # undef NS
16611a3ddf8cSespie # undef ns
16621a3ddf8cSespie
16631a3ddf8cSespie ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)166428ce3119Sbluhm XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
166528ce3119Sbluhm void *userData) {
16661a3ddf8cSespie ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
16671a3ddf8cSespie if (enc)
16681a3ddf8cSespie ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
16691a3ddf8cSespie return enc;
16701a3ddf8cSespie }
16711a3ddf8cSespie
16721a3ddf8cSespie #endif /* XML_NS */
1673