xref: /openbsd-src/lib/libexpat/lib/xmltok.c (revision bd8f1dc3b0e01803a74947836eef57849c13acb0)
12e724bc9Sbluhm /*
22e724bc9Sbluhm                             __  __            _
32e724bc9Sbluhm                          ___\ \/ /_ __   __ _| |_
42e724bc9Sbluhm                         / _ \\  /| '_ \ / _` | __|
52e724bc9Sbluhm                        |  __//  \| |_) | (_| | |_
62e724bc9Sbluhm                         \___/_/\_\ .__/ \__,_|\__|
72e724bc9Sbluhm                                  |_| XML parser
82e724bc9Sbluhm 
92e724bc9Sbluhm    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
1008819b41Sbluhm    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
1108819b41Sbluhm    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
1208819b41Sbluhm    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
1308819b41Sbluhm    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14253fd6bfSbluhm    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15*bd8f1dc3Sbluhm    Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
1608819b41Sbluhm    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
1708819b41Sbluhm    Copyright (c) 2016      Don Lewis <truckman@apache.org>
1808819b41Sbluhm    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
1908819b41Sbluhm    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
2008819b41Sbluhm    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
2108819b41Sbluhm    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
2208819b41Sbluhm    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23*bd8f1dc3Sbluhm    Copyright (c) 2021      Donghee Na <donghee.na@python.org>
249029d806Sbluhm    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25*bd8f1dc3Sbluhm    Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26*bd8f1dc3Sbluhm    Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
272e724bc9Sbluhm    Licensed under the MIT license:
282e724bc9Sbluhm 
292e724bc9Sbluhm    Permission is  hereby granted,  free of charge,  to any  person obtaining
302e724bc9Sbluhm    a  copy  of  this  software   and  associated  documentation  files  (the
312e724bc9Sbluhm    "Software"),  to  deal in  the  Software  without restriction,  including
322e724bc9Sbluhm    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
332e724bc9Sbluhm    distribute, sublicense, and/or sell copies of the Software, and to permit
342e724bc9Sbluhm    persons  to whom  the Software  is  furnished to  do so,  subject to  the
352e724bc9Sbluhm    following conditions:
362e724bc9Sbluhm 
372e724bc9Sbluhm    The above copyright  notice and this permission notice  shall be included
382e724bc9Sbluhm    in all copies or substantial portions of the Software.
392e724bc9Sbluhm 
402e724bc9Sbluhm    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
412e724bc9Sbluhm    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
422e724bc9Sbluhm    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
432e724bc9Sbluhm    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
442e724bc9Sbluhm    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
452e724bc9Sbluhm    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
462e724bc9Sbluhm    USE OR OTHER DEALINGS IN THE SOFTWARE.
471a3ddf8cSespie */
481a3ddf8cSespie 
49*bd8f1dc3Sbluhm #include "expat_config.h"
502c19dcf8Sbluhm 
517d36914fSalek #include <stddef.h>
529b8e2351Sbluhm #include <string.h> /* memcpy */
532e724bc9Sbluhm #include <stdbool.h>
549b8e2351Sbluhm 
552feb5d2aSbluhm #ifdef _WIN32
561a3ddf8cSespie #  include "winconfig.h"
577d36914fSalek #endif
5808819b41Sbluhm 
597d36914fSalek #include "expat_external.h"
601a3ddf8cSespie #include "internal.h"
611a3ddf8cSespie #include "xmltok.h"
621a3ddf8cSespie #include "nametab.h"
631a3ddf8cSespie 
/* With XML_DTD defined, the scanner list built by VTABLE1 gains one extra
   entry, PREFIX(ignoreSectionTok); otherwise the macro expands to nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment listing the PREFIX()-named tokenizer functions:
   first a braced group of scanners (prolog, content, CDATA section and,
   optionally, ignore section), then a braced group of literal scanners
   (attribute value, entity value), then the remaining helper functions.
   PREFIX must be defined by the including code before this is expanded. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
791a3ddf8cSespie 
/* Look up one bit of namingBitmap for the 16-bit character whose high byte
   is `hi` and low byte is `lo`: pages[hi] selects a group of eight bitmap
   words, the top 3 bits of `lo` select the word and the bottom 5 bits
   select the bit within it.  Non-zero means the bit is set. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))

/* A 2 byte UTF-8 representation splits the characters 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the characters 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
1031a3ddf8cSespie 
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned bytes positioned at the lead byte
   and yields non-zero when the sequence is invalid.  The argument is
   evaluated several times, so it must be free of side effects.  It is
   fully parenthesized, so pointer expressions such as `buf + 1` are safe.
*/

/* 2-byte sequence: lead byte below 0xC2 (overlong) or a second byte that
   is not in 0x80..0xBF. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3-byte sequence: bad third byte (with EF,BF,BE and EF,BF,BF rejected),
   or a second byte outside the range allowed for the given lead byte
   (E0 requires >= 0xA0, ED requires <= 0x9F, others 0x80..0xBF). */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4-byte sequence: bad third or fourth byte, or a second byte outside the
   range allowed for the given lead byte (F0 requires >= 0x90, F4 requires
   <= 0x8F, others 0x80..0xBF). */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
1331a3ddf8cSespie 
1341a3ddf8cSespie static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)13528ce3119Sbluhm isNever(const ENCODING *enc, const char *p) {
13628ce3119Sbluhm   UNUSED_P(enc);
13728ce3119Sbluhm   UNUSED_P(p);
1381a3ddf8cSespie   return 0;
1391a3ddf8cSespie }
1401a3ddf8cSespie 
1411a3ddf8cSespie static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)14228ce3119Sbluhm utf8_isName2(const ENCODING *enc, const char *p) {
14328ce3119Sbluhm   UNUSED_P(enc);
1441a3ddf8cSespie   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
1451a3ddf8cSespie }
1461a3ddf8cSespie 
1471a3ddf8cSespie static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)14828ce3119Sbluhm utf8_isName3(const ENCODING *enc, const char *p) {
14928ce3119Sbluhm   UNUSED_P(enc);
1501a3ddf8cSespie   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
1511a3ddf8cSespie }
1521a3ddf8cSespie 
1531a3ddf8cSespie #define utf8_isName4 isNever
1541a3ddf8cSespie 
1551a3ddf8cSespie static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)15628ce3119Sbluhm utf8_isNmstrt2(const ENCODING *enc, const char *p) {
15728ce3119Sbluhm   UNUSED_P(enc);
1581a3ddf8cSespie   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
1591a3ddf8cSespie }
1601a3ddf8cSespie 
1611a3ddf8cSespie static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)16228ce3119Sbluhm utf8_isNmstrt3(const ENCODING *enc, const char *p) {
16328ce3119Sbluhm   UNUSED_P(enc);
1641a3ddf8cSespie   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
1651a3ddf8cSespie }
1661a3ddf8cSespie 
1671a3ddf8cSespie #define utf8_isNmstrt4 isNever
1681a3ddf8cSespie 
1691a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)17028ce3119Sbluhm utf8_isInvalid2(const ENCODING *enc, const char *p) {
17128ce3119Sbluhm   UNUSED_P(enc);
1721a3ddf8cSespie   return UTF8_INVALID2((const unsigned char *)p);
1731a3ddf8cSespie }
1741a3ddf8cSespie 
1751a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)17628ce3119Sbluhm utf8_isInvalid3(const ENCODING *enc, const char *p) {
17728ce3119Sbluhm   UNUSED_P(enc);
1781a3ddf8cSespie   return UTF8_INVALID3((const unsigned char *)p);
1791a3ddf8cSespie }
1801a3ddf8cSespie 
1811a3ddf8cSespie static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)18228ce3119Sbluhm utf8_isInvalid4(const ENCODING *enc, const char *p) {
18328ce3119Sbluhm   UNUSED_P(enc);
1841a3ddf8cSespie   return UTF8_INVALID4((const unsigned char *)p);
1851a3ddf8cSespie }
1861a3ddf8cSespie 
1871a3ddf8cSespie struct normal_encoding {
1881a3ddf8cSespie   ENCODING enc;
1891a3ddf8cSespie   unsigned char type[256];
1901a3ddf8cSespie #ifdef XML_MIN_SIZE
1911a3ddf8cSespie   int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
1921a3ddf8cSespie   int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
1931a3ddf8cSespie   int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
1941a3ddf8cSespie   int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
1951a3ddf8cSespie   int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
1961a3ddf8cSespie #endif /* XML_MIN_SIZE */
1971a3ddf8cSespie   int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
1981a3ddf8cSespie   int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
1991a3ddf8cSespie   int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
2001a3ddf8cSespie   int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
2011a3ddf8cSespie   int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
2021a3ddf8cSespie   int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
2031a3ddf8cSespie   int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
2041a3ddf8cSespie   int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
2051a3ddf8cSespie   int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
2061a3ddf8cSespie };
2071a3ddf8cSespie 
2081a3ddf8cSespie #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
2091a3ddf8cSespie 
2101a3ddf8cSespie #ifdef XML_MIN_SIZE
2111a3ddf8cSespie 
2121a3ddf8cSespie #  define STANDARD_VTABLE(E)                                                   \
21328ce3119Sbluhm     E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
2141a3ddf8cSespie 
2151a3ddf8cSespie #else
2161a3ddf8cSespie 
2171a3ddf8cSespie #  define STANDARD_VTABLE(E) /* as nothing */
2181a3ddf8cSespie 
2191a3ddf8cSespie #endif
2201a3ddf8cSespie 
2211a3ddf8cSespie #define NORMAL_VTABLE(E)                                                       \
22228ce3119Sbluhm   E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
22328ce3119Sbluhm       E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
2241a3ddf8cSespie 
2252feb5d2aSbluhm #define NULL_VTABLE                                                            \
22628ce3119Sbluhm   /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
22728ce3119Sbluhm       /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
22828ce3119Sbluhm       /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
2292feb5d2aSbluhm 
230*bd8f1dc3Sbluhm static int FASTCALL checkCharRefNumber(int result);
2311a3ddf8cSespie 
2321a3ddf8cSespie #include "xmltok_impl.h"
2331a3ddf8cSespie #include "ascii.h"
2341a3ddf8cSespie 
/* In XML_MIN_SIZE builds the single-byte isNameMin / isNmstrtMin callbacks
   are aliases of isNever. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the byte at p, read from the encoding's type[] table.  The
   byte is converted to unsigned char first so that values above 0x7F index
   the table correctly even when plain char is signed. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
2491a3ddf8cSespie 
/* BYTE_TYPE(enc, p): byte type of the byte at p.  XML_MIN_SIZE builds go
   through the encoding's byteType callback (sb_byteType being the
   table-lookup implementation); other builds expand straight to the table
   lookup. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
2591a3ddf8cSespie 
/* BYTE_TO_ASCII(enc, p): value of the character at p.  XML_MIN_SIZE builds
   go through the encoding's byteToAscii callback (sb_byteToAscii simply
   returns *p); other builds read *p directly. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
2701a3ddf8cSespie 
/* Dispatch to the encoding's n-byte name / name-start callbacks
   (n is pasted onto the member name, e.g. isName2). */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* Dispatch to the encoding's n-byte validity callback.  The XML_MIN_SIZE
   variant first checks that the callback pointer is non-null and yields 0
   when it is null; the other variant calls it unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Minimum-width name tests: callbacks in XML_MIN_SIZE builds, constant 0
   otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
2911a3ddf8cSespie 
/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.
   XML_MIN_SIZE builds go through the encoding's charMatches callback
   (sb_charMatches compares *p with c); other builds compare inline. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
3041a3ddf8cSespie 
3051a3ddf8cSespie #define PREFIX(ident) normal_##ident
306b26ab0f8Smatthieu #define XML_TOK_IMPL_C
3071a3ddf8cSespie #include "xmltok_impl.c"
308b26ab0f8Smatthieu #undef XML_TOK_IMPL_C
3091a3ddf8cSespie 
3101a3ddf8cSespie #undef MINBPC
3111a3ddf8cSespie #undef BYTE_TYPE
3121a3ddf8cSespie #undef BYTE_TO_ASCII
3131a3ddf8cSespie #undef CHAR_MATCHES
3141a3ddf8cSespie #undef IS_NAME_CHAR
3151a3ddf8cSespie #undef IS_NAME_CHAR_MINBPC
3161a3ddf8cSespie #undef IS_NMSTRT_CHAR
3171a3ddf8cSespie #undef IS_NMSTRT_CHAR_MINBPC
3181a3ddf8cSespie #undef IS_INVALID_CHAR
3191a3ddf8cSespie 
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
3261a3ddf8cSespie 
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned backwards from its end.  `walked` counts the bytes
   already stepped over.  When a lead byte is found, the character is
   complete if the lead byte plus the walked bytes cover its full length;
   the limit is then placed just past that character.  Otherwise the lead
   byte is dropped as well and the scan continues.  An ASCII byte (top bit
   clear) ends the scan with the limit right after it.  An empty range is
   left unchanged; if no complete character is found the limit ends up
   equal to `from`.

   NOTE(review): after an incomplete lead byte is dropped, `walked` is reset
   to 0 but the loop increment makes it 1 before the next byte is examined.
   For well-formed input preceding the cut this only over-counts, which the
   `>=` comparisons tolerate. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    if ((prev & 0xf8u)
        == 0xf0u) { /* 4-byte character, lead by 0b11110xxx byte */
      if (walked + 1 >= 4) {
        fromLim += 4 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xf0u)
               == 0xe0u) { /* 3-byte character, lead by 0b1110xxxx byte */
      if (walked + 1 >= 3) {
        fromLim += 3 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0xe0u)
               == 0xc0u) { /* 2-byte character, lead by 0b110xxxxx byte */
      if (walked + 1 >= 2) {
        fromLim += 2 - 1;
        break;
      } else {
        walked = 0;
      }
    } else if ((prev & 0x80u)
               == 0x00u) { /* 1-byte character, matching 0b0xxxxxxx */
      break;
    }
    /* Otherwise prev is a continuation byte: keep walking backwards. */
  }
  *fromLimRef = fromLim;
}
3652feb5d2aSbluhm 
366525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)36728ce3119Sbluhm utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
36828ce3119Sbluhm             char **toP, const char *toLim) {
3692e724bc9Sbluhm   bool input_incomplete = false;
3702e724bc9Sbluhm   bool output_exhausted = false;
3712feb5d2aSbluhm 
3722e724bc9Sbluhm   /* Avoid copying partial characters (due to limited space). */
3732e724bc9Sbluhm   const ptrdiff_t bytesAvailable = fromLim - *fromP;
3742e724bc9Sbluhm   const ptrdiff_t bytesStorable = toLim - *toP;
37528ce3119Sbluhm   UNUSED_P(enc);
3762e724bc9Sbluhm   if (bytesAvailable > bytesStorable) {
3772e724bc9Sbluhm     fromLim = *fromP + bytesStorable;
3782e724bc9Sbluhm     output_exhausted = true;
3792e724bc9Sbluhm   }
3802e724bc9Sbluhm 
3812e724bc9Sbluhm   /* Avoid copying partial characters (from incomplete input). */
3829b8e2351Sbluhm   {
3832e724bc9Sbluhm     const char *const fromLimBefore = fromLim;
3849b8e2351Sbluhm     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
3852e724bc9Sbluhm     if (fromLim < fromLimBefore) {
3862e724bc9Sbluhm       input_incomplete = true;
3872e724bc9Sbluhm     }
3889b8e2351Sbluhm   }
3892feb5d2aSbluhm 
3909b8e2351Sbluhm   {
3912e724bc9Sbluhm     const ptrdiff_t bytesToCopy = fromLim - *fromP;
3929b8e2351Sbluhm     memcpy(*toP, *fromP, bytesToCopy);
3932e724bc9Sbluhm     *fromP += bytesToCopy;
3942e724bc9Sbluhm     *toP += bytesToCopy;
3959b8e2351Sbluhm   }
396525cdfc7Srpointel 
3979b8e2351Sbluhm   if (output_exhausted) /* needs to go first */
398525cdfc7Srpointel     return XML_CONVERT_OUTPUT_EXHAUSTED;
3992e724bc9Sbluhm   else if (input_incomplete)
4002e724bc9Sbluhm     return XML_CONVERT_INPUT_INCOMPLETE;
401525cdfc7Srpointel   else
4022feb5d2aSbluhm     return XML_CONVERT_COMPLETED;
4031a3ddf8cSespie }
4041a3ddf8cSespie 
405525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)40628ce3119Sbluhm utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
40728ce3119Sbluhm              unsigned short **toP, const unsigned short *toLim) {
408525cdfc7Srpointel   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
4091a3ddf8cSespie   unsigned short *to = *toP;
4101a3ddf8cSespie   const char *from = *fromP;
411525cdfc7Srpointel   while (from < fromLim && to < toLim) {
412*bd8f1dc3Sbluhm     switch (SB_BYTE_TYPE(enc, from)) {
4131a3ddf8cSespie     case BT_LEAD2:
414525cdfc7Srpointel       if (fromLim - from < 2) {
415525cdfc7Srpointel         res = XML_CONVERT_INPUT_INCOMPLETE;
4162feb5d2aSbluhm         goto after;
417525cdfc7Srpointel       }
4181a3ddf8cSespie       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
4191a3ddf8cSespie       from += 2;
4201a3ddf8cSespie       break;
4211a3ddf8cSespie     case BT_LEAD3:
422525cdfc7Srpointel       if (fromLim - from < 3) {
423525cdfc7Srpointel         res = XML_CONVERT_INPUT_INCOMPLETE;
4242feb5d2aSbluhm         goto after;
425525cdfc7Srpointel       }
42628ce3119Sbluhm       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
42728ce3119Sbluhm                                | (from[2] & 0x3f));
4281a3ddf8cSespie       from += 3;
4291a3ddf8cSespie       break;
43028ce3119Sbluhm     case BT_LEAD4: {
4311a3ddf8cSespie       unsigned long n;
432525cdfc7Srpointel       if (toLim - to < 2) {
433525cdfc7Srpointel         res = XML_CONVERT_OUTPUT_EXHAUSTED;
4341a3ddf8cSespie         goto after;
435525cdfc7Srpointel       }
436525cdfc7Srpointel       if (fromLim - from < 4) {
437525cdfc7Srpointel         res = XML_CONVERT_INPUT_INCOMPLETE;
438525cdfc7Srpointel         goto after;
439525cdfc7Srpointel       }
4401a3ddf8cSespie       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
4411a3ddf8cSespie           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
4421a3ddf8cSespie       n -= 0x10000;
4431a3ddf8cSespie       to[0] = (unsigned short)((n >> 10) | 0xD800);
4441a3ddf8cSespie       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
4451a3ddf8cSespie       to += 2;
4461a3ddf8cSespie       from += 4;
44728ce3119Sbluhm     } break;
4481a3ddf8cSespie     default:
4491a3ddf8cSespie       *to++ = *from++;
4501a3ddf8cSespie       break;
4511a3ddf8cSespie     }
4521a3ddf8cSespie   }
4532feb5d2aSbluhm   if (from < fromLim)
4542feb5d2aSbluhm     res = XML_CONVERT_OUTPUT_EXHAUSTED;
4551a3ddf8cSespie after:
4561a3ddf8cSespie   *fromP = from;
4571a3ddf8cSespie   *toP = to;
458525cdfc7Srpointel   return res;
4591a3ddf8cSespie }
4601a3ddf8cSespie 
4611a3ddf8cSespie #ifdef XML_NS
46228ce3119Sbluhm static const struct normal_encoding utf8_encoding_ns
46328ce3119Sbluhm     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4641a3ddf8cSespie        {
4651a3ddf8cSespie #  include "asciitab.h"
4661a3ddf8cSespie #  include "utf8tab.h"
4671a3ddf8cSespie        },
46828ce3119Sbluhm        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4691a3ddf8cSespie #endif
4701a3ddf8cSespie 
47128ce3119Sbluhm static const struct normal_encoding utf8_encoding
47228ce3119Sbluhm     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4731a3ddf8cSespie        {
4741a3ddf8cSespie #define BT_COLON BT_NMSTRT
4751a3ddf8cSespie #include "asciitab.h"
4761a3ddf8cSespie #undef BT_COLON
4771a3ddf8cSespie #include "utf8tab.h"
4781a3ddf8cSespie        },
47928ce3119Sbluhm        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4801a3ddf8cSespie 
4811a3ddf8cSespie #ifdef XML_NS
4821a3ddf8cSespie 
48328ce3119Sbluhm static const struct normal_encoding internal_utf8_encoding_ns
48428ce3119Sbluhm     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4851a3ddf8cSespie        {
4861a3ddf8cSespie #  include "iasciitab.h"
4871a3ddf8cSespie #  include "utf8tab.h"
4881a3ddf8cSespie        },
48928ce3119Sbluhm        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
4901a3ddf8cSespie 
4911a3ddf8cSespie #endif
4921a3ddf8cSespie 
49328ce3119Sbluhm static const struct normal_encoding internal_utf8_encoding
49428ce3119Sbluhm     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
4951a3ddf8cSespie        {
4961a3ddf8cSespie #define BT_COLON BT_NMSTRT
4971a3ddf8cSespie #include "iasciitab.h"
4981a3ddf8cSespie #undef BT_COLON
4991a3ddf8cSespie #include "utf8tab.h"
5001a3ddf8cSespie        },
50128ce3119Sbluhm        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
5021a3ddf8cSespie 
503525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)50428ce3119Sbluhm latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
50528ce3119Sbluhm               char **toP, const char *toLim) {
50628ce3119Sbluhm   UNUSED_P(enc);
5071a3ddf8cSespie   for (;;) {
5081a3ddf8cSespie     unsigned char c;
5091a3ddf8cSespie     if (*fromP == fromLim)
510525cdfc7Srpointel       return XML_CONVERT_COMPLETED;
5111a3ddf8cSespie     c = (unsigned char)**fromP;
5121a3ddf8cSespie     if (c & 0x80) {
5131a3ddf8cSespie       if (toLim - *toP < 2)
514525cdfc7Srpointel         return XML_CONVERT_OUTPUT_EXHAUSTED;
5151a3ddf8cSespie       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
5161a3ddf8cSespie       *(*toP)++ = (char)((c & 0x3f) | 0x80);
5171a3ddf8cSespie       (*fromP)++;
51828ce3119Sbluhm     } else {
5191a3ddf8cSespie       if (*toP == toLim)
520525cdfc7Srpointel         return XML_CONVERT_OUTPUT_EXHAUSTED;
5211a3ddf8cSespie       *(*toP)++ = *(*fromP)++;
5221a3ddf8cSespie     }
5231a3ddf8cSespie   }
5241a3ddf8cSespie }
5251a3ddf8cSespie 
526525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)52728ce3119Sbluhm latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
52828ce3119Sbluhm                unsigned short **toP, const unsigned short *toLim) {
52928ce3119Sbluhm   UNUSED_P(enc);
530525cdfc7Srpointel   while (*fromP < fromLim && *toP < toLim)
5311a3ddf8cSespie     *(*toP)++ = (unsigned char)*(*fromP)++;
532525cdfc7Srpointel 
533525cdfc7Srpointel   if ((*toP == toLim) && (*fromP < fromLim))
534525cdfc7Srpointel     return XML_CONVERT_OUTPUT_EXHAUSTED;
535525cdfc7Srpointel   else
536525cdfc7Srpointel     return XML_CONVERT_COMPLETED;
5371a3ddf8cSespie }
5381a3ddf8cSespie 
5391a3ddf8cSespie #ifdef XML_NS
5401a3ddf8cSespie 
54128ce3119Sbluhm static const struct normal_encoding latin1_encoding_ns
54228ce3119Sbluhm     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
5431a3ddf8cSespie        {
5441a3ddf8cSespie #  include "asciitab.h"
5451a3ddf8cSespie #  include "latin1tab.h"
5461a3ddf8cSespie        },
54728ce3119Sbluhm        STANDARD_VTABLE(sb_) NULL_VTABLE};
5481a3ddf8cSespie 
5491a3ddf8cSespie #endif
5501a3ddf8cSespie 
55128ce3119Sbluhm static const struct normal_encoding latin1_encoding
55228ce3119Sbluhm     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
5531a3ddf8cSespie        {
5541a3ddf8cSespie #define BT_COLON BT_NMSTRT
5551a3ddf8cSespie #include "asciitab.h"
5561a3ddf8cSespie #undef BT_COLON
5571a3ddf8cSespie #include "latin1tab.h"
5581a3ddf8cSespie        },
55928ce3119Sbluhm        STANDARD_VTABLE(sb_) NULL_VTABLE};
5601a3ddf8cSespie 
561525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)56228ce3119Sbluhm ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
56328ce3119Sbluhm              char **toP, const char *toLim) {
56428ce3119Sbluhm   UNUSED_P(enc);
565525cdfc7Srpointel   while (*fromP < fromLim && *toP < toLim)
5661a3ddf8cSespie     *(*toP)++ = *(*fromP)++;
567525cdfc7Srpointel 
568525cdfc7Srpointel   if ((*toP == toLim) && (*fromP < fromLim))
569525cdfc7Srpointel     return XML_CONVERT_OUTPUT_EXHAUSTED;
570525cdfc7Srpointel   else
571525cdfc7Srpointel     return XML_CONVERT_COMPLETED;
5721a3ddf8cSespie }
5731a3ddf8cSespie 
5741a3ddf8cSespie #ifdef XML_NS
5751a3ddf8cSespie 
57628ce3119Sbluhm static const struct normal_encoding ascii_encoding_ns
57728ce3119Sbluhm     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
5781a3ddf8cSespie        {
5791a3ddf8cSespie #  include "asciitab.h"
5801a3ddf8cSespie            /* BT_NONXML == 0 */
5811a3ddf8cSespie        },
58228ce3119Sbluhm        STANDARD_VTABLE(sb_) NULL_VTABLE};
5831a3ddf8cSespie 
5841a3ddf8cSespie #endif
5851a3ddf8cSespie 
58628ce3119Sbluhm static const struct normal_encoding ascii_encoding
58728ce3119Sbluhm     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
5881a3ddf8cSespie        {
5891a3ddf8cSespie #define BT_COLON BT_NMSTRT
5901a3ddf8cSespie #include "asciitab.h"
5911a3ddf8cSespie #undef BT_COLON
5921a3ddf8cSespie            /* BT_NONXML == 0 */
5931a3ddf8cSespie        },
59428ce3119Sbluhm        STANDARD_VTABLE(sb_) NULL_VTABLE};
5951a3ddf8cSespie 
5961a3ddf8cSespie static int PTRFASTCALL
unicode_byte_type(char hi,char lo)59728ce3119Sbluhm unicode_byte_type(char hi, char lo) {
5981a3ddf8cSespie   switch ((unsigned char)hi) {
5992a4a206eSbluhm   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
60028ce3119Sbluhm   case 0xD8:
60128ce3119Sbluhm   case 0xD9:
60228ce3119Sbluhm   case 0xDA:
60328ce3119Sbluhm   case 0xDB:
6041a3ddf8cSespie     return BT_LEAD4;
6052a4a206eSbluhm   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
60628ce3119Sbluhm   case 0xDC:
60728ce3119Sbluhm   case 0xDD:
60828ce3119Sbluhm   case 0xDE:
60928ce3119Sbluhm   case 0xDF:
6101a3ddf8cSespie     return BT_TRAIL;
6111a3ddf8cSespie   case 0xFF:
6121a3ddf8cSespie     switch ((unsigned char)lo) {
61328ce3119Sbluhm     case 0xFF: /* noncharacter-FFFF */
61428ce3119Sbluhm     case 0xFE: /* noncharacter-FFFE */
6151a3ddf8cSespie       return BT_NONXML;
6161a3ddf8cSespie     }
6171a3ddf8cSespie     break;
6181a3ddf8cSespie   }
6191a3ddf8cSespie   return BT_NONASCII;
6201a3ddf8cSespie }
6211a3ddf8cSespie 
/* Defines the function E##toUtf8, which converts 16-bit code units to UTF-8.
   The two bytes of each unit are read through GET_LO / GET_HI, which must be
   defined at the point where this macro is instantiated.

   fromLim is first rounded down so that only whole two-byte units are
   consumed.  Each unit is encoded according to its high byte:
     hi == 0 and lo < 0x80   -> 1 byte
     hi 0x00..0x07 otherwise -> 2 bytes
     hi 0xD8..0xDB           -> 4 bytes, consuming the following unit too
     anything else           -> 3 bytes
   Before writing, the remaining output space is checked; if it is too small
   the function stores the position of the unconverted unit in *fromP and
   returns XML_CONVERT_OUTPUT_EXHAUSTED.  If a 0xD8..0xDB unit has no
   following unit inside fromLim it returns XML_CONVERT_INPUT_INCOMPLETE the
   same way.  *toP is advanced as bytes are written.

   NOTE(review): the for loop only ends once from >= fromLim, so the final
   "from < fromLim" test looks like it can never be true -- confirm. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        /* values below 0x80 are emitted as a single byte */                   \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        /* two-byte form */                                                    \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        /* three-byte form; also reached for high bytes 0xDC..0xDF */          \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        /* four-byte form: needs room for 4 output bytes and a second   */     \
        /* input unit; output space is checked before input length      */     \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
6981a3ddf8cSespie 
/* Defines the function E##toUtf16, which copies 16-bit code units -- read
   byte-wise through GET_HI / GET_LO -- into an array of unsigned short.

   fromLim is first rounded down so that only whole two-byte units are read.
   If the input is longer than the output can hold and the last input unit
   has a high byte in 0xD8..0xDF, that last unit is excluded from the copy and
   the pending result becomes XML_CONVERT_INPUT_INCOMPLETE.  Both *fromP and
   *toP are advanced as units are copied.

   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is exactly full while
   input remains; otherwise the pending result (COMPLETED or
   INPUT_INCOMPLETE). */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    /* assemble each unit as (high byte << 8) | low byte */                    \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
7191a3ddf8cSespie 
/* Little-endian byte order: the low byte of a unit is at offset 0 and the
   high byte at offset 1.  Instantiates little2_toUtf8 and little2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian byte order: the high byte of a unit is at offset 0 and the low
   byte at offset 1.  Instantiates big2_toUtf8 and big2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
7371a3ddf8cSespie 
/* Per-character helpers for two-byte little-endian input: (p)[0] is the low
   byte and (p)[1] the high byte of the unit at p.

   LITTLE2_BYTE_TYPE       byte type of the unit: looked up with SB_BYTE_TYPE
                           when the high byte is 0, otherwise computed by
                           unicode_byte_type(high, low).
   LITTLE2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES    non-zero when the unit equals the character c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC
                           UCS2_GET_NAMING lookups in namePages / nmstrtPages
                           keyed by (high byte, low byte).

   Every use of the argument is parenthesized so that an expression such as
   "ptr + 2" can be passed safely. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
7461a3ddf8cSespie 
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the per-character helpers are provided as real functions
   that simply wrap the LITTLE2_* macros. */

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII; enc is not used. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Function form of LITTLE2_CHAR_MATCHES; enc is not used. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is not used. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is not used. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* The encoding tables below pick up the little2 converters through VTABLE. */
#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Without XML_MIN_SIZE, xmltok_impl.c is included with PREFIX producing
   little2_-prefixed names and the character macros mapped onto the LITTLE2_*
   helpers (presumably this generates the little2_ tokenizer functions --
   confirm against xmltok_impl.c).  The multi-byte IS_NAME_CHAR and
   IS_NMSTRT_CHAR variants are defined as constant 0 here. */
#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the per-encoding macro bindings again. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
8101a3ddf8cSespie 
#ifdef XML_NS

/* Encoding descriptor for two-byte little-endian input, compiled only when
   XML_NS is defined.  The last initializer of the inner struct is 1 only when
   BYTEORDER is 1234.  The byte-type table is asciitab.h followed by
   latin1tab.h, with BT_COLON left as is. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
8281a3ddf8cSespie 
/* Encoding descriptor for two-byte little-endian input.  The last
   initializer of the inner struct is 1 only when BYTEORDER is 1234.
   BT_COLON is defined as BT_NMSTRT while asciitab.h is included, so entries
   spelled BT_COLON there expand to BT_NMSTRT. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
8441a3ddf8cSespie 
/* "Internal" little-endian descriptors, omitted when BYTEORDER is 4321.
   They differ from little2_encoding[_ns] in two visible ways: the last
   initializer of the inner struct is always 1, and the ASCII part of the
   byte-type table comes from iasciitab.h instead of asciitab.h. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON mapped to BT_NMSTRT during the table include. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
8701a3ddf8cSespie 
/* Per-character helpers for two-byte big-endian input: (p)[0] is the high
   byte and (p)[1] the low byte of the unit at p.

   BIG2_BYTE_TYPE       byte type of the unit: looked up with SB_BYTE_TYPE on
                        the low byte when the high byte is 0, otherwise
                        computed by unicode_byte_type(high, low).
   BIG2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES    non-zero when the unit equals the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC
                        UCS2_GET_NAMING lookups in namePages / nmstrtPages
                        keyed by (high byte, low byte).

   Every use of the argument is parenthesized so that an arbitrary pointer
   expression can be passed safely. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
8791a3ddf8cSespie 
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the per-character helpers are provided as real functions
   that simply wrap the BIG2_* macros. */

/* Function form of BIG2_BYTE_TYPE. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII; enc is not used. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Function form of BIG2_CHAR_MATCHES; enc is not used. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not used. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not used. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* The encoding tables below pick up the big2 converters through VTABLE. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Without XML_MIN_SIZE, xmltok_impl.c is included with PREFIX producing
   big2_-prefixed names and the character macros mapped onto the BIG2_*
   helpers (presumably this generates the big2_ tokenizer functions --
   confirm against xmltok_impl.c).  The multi-byte IS_NAME_CHAR and
   IS_NMSTRT_CHAR variants are defined as constant 0 here. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Drop the per-encoding macro bindings again. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
9431a3ddf8cSespie 
#ifdef XML_NS

/* Encoding descriptor for two-byte big-endian input, compiled only when
   XML_NS is defined.  The last initializer of the inner struct is 1 only when
   BYTEORDER is 4321.  The byte-type table is asciitab.h followed by
   latin1tab.h, with BT_COLON left as is. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
9611a3ddf8cSespie 
/* Encoding descriptor for two-byte big-endian input.  The last initializer
   of the inner struct is 1 only when BYTEORDER is 4321.  BT_COLON is defined
   as BT_NMSTRT while asciitab.h is included, so entries spelled BT_COLON
   there expand to BT_NMSTRT. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
9771a3ddf8cSespie 
/* "Internal" big-endian descriptors, omitted when BYTEORDER is 1234.  They
   differ from big2_encoding[_ns] in two visible ways: the last initializer of
   the inner struct is always 1, and the ASCII part of the byte-type table
   comes from iasciitab.h instead of asciitab.h. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

/* Variant with BT_COLON mapped to BT_NMSTRT during the table include. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
10031a3ddf8cSespie 
10041a3ddf8cSespie #undef PREFIX
10051a3ddf8cSespie 
10061a3ddf8cSespie static int FASTCALL
streqci(const char * s1,const char * s2)100728ce3119Sbluhm streqci(const char *s1, const char *s2) {
10081a3ddf8cSespie   for (;;) {
10091a3ddf8cSespie     char c1 = *s1++;
10101a3ddf8cSespie     char c2 = *s2++;
10111a3ddf8cSespie     if (ASCII_a <= c1 && c1 <= ASCII_z)
10121a3ddf8cSespie       c1 += ASCII_A - ASCII_a;
10131a3ddf8cSespie     if (ASCII_a <= c2 && c2 <= ASCII_z)
10145837d4fcSbluhm       /* The following line will never get executed.  streqci() is
10155837d4fcSbluhm        * only called from two places, both of which guarantee to put
10165837d4fcSbluhm        * upper-case strings into s2.
10175837d4fcSbluhm        */
10185837d4fcSbluhm       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
10191a3ddf8cSespie     if (c1 != c2)
10201a3ddf8cSespie       return 0;
10211a3ddf8cSespie     if (! c1)
10221a3ddf8cSespie       break;
10231a3ddf8cSespie   }
10241a3ddf8cSespie   return 1;
10251a3ddf8cSespie }
10261a3ddf8cSespie 
/* Update pos for the bytes between ptr and end.  The enc argument is
   ignored; the work is delegated to normal_updatePosition using the
   utf8_encoding descriptor. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
10331a3ddf8cSespie 
10341a3ddf8cSespie static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)103528ce3119Sbluhm toAscii(const ENCODING *enc, const char *ptr, const char *end) {
10361a3ddf8cSespie   char buf[1];
10371a3ddf8cSespie   char *p = buf;
10381a3ddf8cSespie   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
10391a3ddf8cSespie   if (p == buf)
10401a3ddf8cSespie     return -1;
10411a3ddf8cSespie   else
10421a3ddf8cSespie     return buf[0];
10431a3ddf8cSespie }
10441a3ddf8cSespie 
10451a3ddf8cSespie static int FASTCALL
isSpace(int c)104628ce3119Sbluhm isSpace(int c) {
10471a3ddf8cSespie   switch (c) {
10481a3ddf8cSespie   case 0x20:
10491a3ddf8cSespie   case 0xD:
10501a3ddf8cSespie   case 0xA:
10511a3ddf8cSespie   case 0x9:
10521a3ddf8cSespie     return 1;
10531a3ddf8cSespie   }
10541a3ddf8cSespie   return 0;
10551a3ddf8cSespie }
10561a3ddf8cSespie 
10571a3ddf8cSespie /* Return 1 if there's just optional white space or there's an S
10581a3ddf8cSespie    followed by name=val.
10591a3ddf8cSespie */
10601a3ddf8cSespie static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)106128ce3119Sbluhm parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
106228ce3119Sbluhm                      const char **namePtr, const char **nameEndPtr,
106328ce3119Sbluhm                      const char **valPtr, const char **nextTokPtr) {
10641a3ddf8cSespie   int c;
10651a3ddf8cSespie   char open;
10661a3ddf8cSespie   if (ptr == end) {
10671a3ddf8cSespie     *namePtr = NULL;
10681a3ddf8cSespie     return 1;
10691a3ddf8cSespie   }
10701a3ddf8cSespie   if (! isSpace(toAscii(enc, ptr, end))) {
10711a3ddf8cSespie     *nextTokPtr = ptr;
10721a3ddf8cSespie     return 0;
10731a3ddf8cSespie   }
10741a3ddf8cSespie   do {
10751a3ddf8cSespie     ptr += enc->minBytesPerChar;
10761a3ddf8cSespie   } while (isSpace(toAscii(enc, ptr, end)));
10771a3ddf8cSespie   if (ptr == end) {
10781a3ddf8cSespie     *namePtr = NULL;
10791a3ddf8cSespie     return 1;
10801a3ddf8cSespie   }
10811a3ddf8cSespie   *namePtr = ptr;
10821a3ddf8cSespie   for (;;) {
10831a3ddf8cSespie     c = toAscii(enc, ptr, end);
10841a3ddf8cSespie     if (c == -1) {
10851a3ddf8cSespie       *nextTokPtr = ptr;
10861a3ddf8cSespie       return 0;
10871a3ddf8cSespie     }
10881a3ddf8cSespie     if (c == ASCII_EQUALS) {
10891a3ddf8cSespie       *nameEndPtr = ptr;
10901a3ddf8cSespie       break;
10911a3ddf8cSespie     }
10921a3ddf8cSespie     if (isSpace(c)) {
10931a3ddf8cSespie       *nameEndPtr = ptr;
10941a3ddf8cSespie       do {
10951a3ddf8cSespie         ptr += enc->minBytesPerChar;
10961a3ddf8cSespie       } while (isSpace(c = toAscii(enc, ptr, end)));
10971a3ddf8cSespie       if (c != ASCII_EQUALS) {
10981a3ddf8cSespie         *nextTokPtr = ptr;
10991a3ddf8cSespie         return 0;
11001a3ddf8cSespie       }
11011a3ddf8cSespie       break;
11021a3ddf8cSespie     }
11031a3ddf8cSespie     ptr += enc->minBytesPerChar;
11041a3ddf8cSespie   }
11051a3ddf8cSespie   if (ptr == *namePtr) {
11061a3ddf8cSespie     *nextTokPtr = ptr;
11071a3ddf8cSespie     return 0;
11081a3ddf8cSespie   }
11091a3ddf8cSespie   ptr += enc->minBytesPerChar;
11101a3ddf8cSespie   c = toAscii(enc, ptr, end);
11111a3ddf8cSespie   while (isSpace(c)) {
11121a3ddf8cSespie     ptr += enc->minBytesPerChar;
11131a3ddf8cSespie     c = toAscii(enc, ptr, end);
11141a3ddf8cSespie   }
11151a3ddf8cSespie   if (c != ASCII_QUOT && c != ASCII_APOS) {
11161a3ddf8cSespie     *nextTokPtr = ptr;
11171a3ddf8cSespie     return 0;
11181a3ddf8cSespie   }
11191a3ddf8cSespie   open = (char)c;
11201a3ddf8cSespie   ptr += enc->minBytesPerChar;
11211a3ddf8cSespie   *valPtr = ptr;
11221a3ddf8cSespie   for (;; ptr += enc->minBytesPerChar) {
11231a3ddf8cSespie     c = toAscii(enc, ptr, end);
11241a3ddf8cSespie     if (c == open)
11251a3ddf8cSespie       break;
112628ce3119Sbluhm     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
112728ce3119Sbluhm         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
112828ce3119Sbluhm         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
11291a3ddf8cSespie       *nextTokPtr = ptr;
11301a3ddf8cSespie       return 0;
11311a3ddf8cSespie     }
11321a3ddf8cSespie   }
11331a3ddf8cSespie   *nextTokPtr = ptr + enc->minBytesPerChar;
11341a3ddf8cSespie   return 1;
11351a3ddf8cSespie }
11361a3ddf8cSespie 
/* NUL-terminated keywords, spelled with the ASCII_* constants, that
   doParseXmlDecl compares against via XmlNameMatchesAscii. */

/* "version" */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* "encoding" */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* "standalone" */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* "yes" */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* "no" */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
11501a3ddf8cSespie 
11511a3ddf8cSespie static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)115228ce3119Sbluhm doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
11531a3ddf8cSespie                                                  const char *),
115428ce3119Sbluhm                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
115528ce3119Sbluhm                const char *end, const char **badPtr, const char **versionPtr,
115628ce3119Sbluhm                const char **versionEndPtr, const char **encodingName,
115728ce3119Sbluhm                const ENCODING **encoding, int *standalone) {
11581a3ddf8cSespie   const char *val = NULL;
11591a3ddf8cSespie   const char *name = NULL;
11601a3ddf8cSespie   const char *nameEnd = NULL;
11611a3ddf8cSespie   ptr += 5 * enc->minBytesPerChar;
11621a3ddf8cSespie   end -= 2 * enc->minBytesPerChar;
11631a3ddf8cSespie   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
11641a3ddf8cSespie       || ! name) {
11651a3ddf8cSespie     *badPtr = ptr;
11661a3ddf8cSespie     return 0;
11671a3ddf8cSespie   }
11681a3ddf8cSespie   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
11691a3ddf8cSespie     if (! isGeneralTextEntity) {
11701a3ddf8cSespie       *badPtr = name;
11711a3ddf8cSespie       return 0;
11721a3ddf8cSespie     }
117328ce3119Sbluhm   } else {
11741a3ddf8cSespie     if (versionPtr)
11751a3ddf8cSespie       *versionPtr = val;
11761a3ddf8cSespie     if (versionEndPtr)
11771a3ddf8cSespie       *versionEndPtr = ptr;
11781a3ddf8cSespie     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
11791a3ddf8cSespie       *badPtr = ptr;
11801a3ddf8cSespie       return 0;
11811a3ddf8cSespie     }
11821a3ddf8cSespie     if (! name) {
11831a3ddf8cSespie       if (isGeneralTextEntity) {
11841a3ddf8cSespie         /* a TextDecl must have an EncodingDecl */
11851a3ddf8cSespie         *badPtr = ptr;
11861a3ddf8cSespie         return 0;
11871a3ddf8cSespie       }
11881a3ddf8cSespie       return 1;
11891a3ddf8cSespie     }
11901a3ddf8cSespie   }
11911a3ddf8cSespie   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
11921a3ddf8cSespie     int c = toAscii(enc, val, end);
11931a3ddf8cSespie     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
11941a3ddf8cSespie       *badPtr = val;
11951a3ddf8cSespie       return 0;
11961a3ddf8cSespie     }
11971a3ddf8cSespie     if (encodingName)
11981a3ddf8cSespie       *encodingName = val;
11991a3ddf8cSespie     if (encoding)
12001a3ddf8cSespie       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
12011a3ddf8cSespie     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
12021a3ddf8cSespie       *badPtr = ptr;
12031a3ddf8cSespie       return 0;
12041a3ddf8cSespie     }
12051a3ddf8cSespie     if (! name)
12061a3ddf8cSespie       return 1;
12071a3ddf8cSespie   }
12081a3ddf8cSespie   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
12091a3ddf8cSespie       || isGeneralTextEntity) {
12101a3ddf8cSespie     *badPtr = name;
12111a3ddf8cSespie     return 0;
12121a3ddf8cSespie   }
12131a3ddf8cSespie   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
12141a3ddf8cSespie     if (standalone)
12151a3ddf8cSespie       *standalone = 1;
121628ce3119Sbluhm   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
12171a3ddf8cSespie     if (standalone)
12181a3ddf8cSespie       *standalone = 0;
121928ce3119Sbluhm   } else {
12201a3ddf8cSespie     *badPtr = val;
12211a3ddf8cSespie     return 0;
12221a3ddf8cSespie   }
12231a3ddf8cSespie   while (isSpace(toAscii(enc, ptr, end)))
12241a3ddf8cSespie     ptr += enc->minBytesPerChar;
12251a3ddf8cSespie   if (ptr != end) {
12261a3ddf8cSespie     *badPtr = ptr;
12271a3ddf8cSespie     return 0;
12281a3ddf8cSespie   }
12291a3ddf8cSespie   return 1;
12301a3ddf8cSespie }
12311a3ddf8cSespie 
12321a3ddf8cSespie static int FASTCALL
checkCharRefNumber(int result)123328ce3119Sbluhm checkCharRefNumber(int result) {
12341a3ddf8cSespie   switch (result >> 8) {
123528ce3119Sbluhm   case 0xD8:
123628ce3119Sbluhm   case 0xD9:
123728ce3119Sbluhm   case 0xDA:
123828ce3119Sbluhm   case 0xDB:
123928ce3119Sbluhm   case 0xDC:
124028ce3119Sbluhm   case 0xDD:
124128ce3119Sbluhm   case 0xDE:
124228ce3119Sbluhm   case 0xDF:
12431a3ddf8cSespie     return -1;
12441a3ddf8cSespie   case 0:
12451a3ddf8cSespie     if (latin1_encoding.type[result] == BT_NONXML)
12461a3ddf8cSespie       return -1;
12471a3ddf8cSespie     break;
12481a3ddf8cSespie   case 0xFF:
12491a3ddf8cSespie     if (result == 0xFFFE || result == 0xFFFF)
12501a3ddf8cSespie       return -1;
12511a3ddf8cSespie     break;
12521a3ddf8cSespie   }
12531a3ddf8cSespie   return result;
12541a3ddf8cSespie }
12551a3ddf8cSespie 
12561a3ddf8cSespie int FASTCALL
XmlUtf8Encode(int c,char * buf)125728ce3119Sbluhm XmlUtf8Encode(int c, char *buf) {
12581a3ddf8cSespie   enum {
12591a3ddf8cSespie     /* minN is minimum legal resulting value for N byte sequence */
12601a3ddf8cSespie     min2 = 0x80,
12611a3ddf8cSespie     min3 = 0x800,
12621a3ddf8cSespie     min4 = 0x10000
12631a3ddf8cSespie   };
12641a3ddf8cSespie 
12651a3ddf8cSespie   if (c < 0)
12665837d4fcSbluhm     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
12671a3ddf8cSespie   if (c < min2) {
12681a3ddf8cSespie     buf[0] = (char)(c | UTF8_cval1);
12691a3ddf8cSespie     return 1;
12701a3ddf8cSespie   }
12711a3ddf8cSespie   if (c < min3) {
12721a3ddf8cSespie     buf[0] = (char)((c >> 6) | UTF8_cval2);
12731a3ddf8cSespie     buf[1] = (char)((c & 0x3f) | 0x80);
12741a3ddf8cSespie     return 2;
12751a3ddf8cSespie   }
12761a3ddf8cSespie   if (c < min4) {
12771a3ddf8cSespie     buf[0] = (char)((c >> 12) | UTF8_cval3);
12781a3ddf8cSespie     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
12791a3ddf8cSespie     buf[2] = (char)((c & 0x3f) | 0x80);
12801a3ddf8cSespie     return 3;
12811a3ddf8cSespie   }
12821a3ddf8cSespie   if (c < 0x110000) {
12831a3ddf8cSespie     buf[0] = (char)((c >> 18) | UTF8_cval4);
12841a3ddf8cSespie     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
12851a3ddf8cSespie     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
12861a3ddf8cSespie     buf[3] = (char)((c & 0x3f) | 0x80);
12871a3ddf8cSespie     return 4;
12881a3ddf8cSespie   }
12895837d4fcSbluhm   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
12901a3ddf8cSespie }
12911a3ddf8cSespie 
12921a3ddf8cSespie int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)129328ce3119Sbluhm XmlUtf16Encode(int charNum, unsigned short *buf) {
12941a3ddf8cSespie   if (charNum < 0)
12951a3ddf8cSespie     return 0;
12961a3ddf8cSespie   if (charNum < 0x10000) {
12971a3ddf8cSespie     buf[0] = (unsigned short)charNum;
12981a3ddf8cSespie     return 1;
12991a3ddf8cSespie   }
13001a3ddf8cSespie   if (charNum < 0x110000) {
13011a3ddf8cSespie     charNum -= 0x10000;
13021a3ddf8cSespie     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
13031a3ddf8cSespie     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
13041a3ddf8cSespie     return 2;
13051a3ddf8cSespie   }
13061a3ddf8cSespie   return 0;
13071a3ddf8cSespie }
13081a3ddf8cSespie 
13091a3ddf8cSespie struct unknown_encoding {
13101a3ddf8cSespie   struct normal_encoding normal;
13117d36914fSalek   CONVERTER convert;
13121a3ddf8cSespie   void *userData;
13131a3ddf8cSespie   unsigned short utf16[256];
13141a3ddf8cSespie   char utf8[256][4];
13151a3ddf8cSespie };
13161a3ddf8cSespie 
13171a3ddf8cSespie #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
13181a3ddf8cSespie 
13191a3ddf8cSespie int
XmlSizeOfUnknownEncoding(void)132028ce3119Sbluhm XmlSizeOfUnknownEncoding(void) {
13211a3ddf8cSespie   return sizeof(struct unknown_encoding);
13221a3ddf8cSespie }
13231a3ddf8cSespie 
13241a3ddf8cSespie static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)132528ce3119Sbluhm unknown_isName(const ENCODING *enc, const char *p) {
13261a3ddf8cSespie   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13271a3ddf8cSespie   int c = uenc->convert(uenc->userData, p);
13281a3ddf8cSespie   if (c & ~0xFFFF)
13291a3ddf8cSespie     return 0;
13301a3ddf8cSespie   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
13311a3ddf8cSespie }
13321a3ddf8cSespie 
13331a3ddf8cSespie static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)133428ce3119Sbluhm unknown_isNmstrt(const ENCODING *enc, const char *p) {
13351a3ddf8cSespie   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13361a3ddf8cSespie   int c = uenc->convert(uenc->userData, p);
13371a3ddf8cSespie   if (c & ~0xFFFF)
13381a3ddf8cSespie     return 0;
13391a3ddf8cSespie   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
13401a3ddf8cSespie }
13411a3ddf8cSespie 
13421a3ddf8cSespie static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)134328ce3119Sbluhm unknown_isInvalid(const ENCODING *enc, const char *p) {
13441a3ddf8cSespie   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13451a3ddf8cSespie   int c = uenc->convert(uenc->userData, p);
13461a3ddf8cSespie   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
13471a3ddf8cSespie }
13481a3ddf8cSespie 
1349525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)135028ce3119Sbluhm unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
135128ce3119Sbluhm                char **toP, const char *toLim) {
13521a3ddf8cSespie   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
13531a3ddf8cSespie   char buf[XML_UTF8_ENCODE_MAX];
13541a3ddf8cSespie   for (;;) {
13551a3ddf8cSespie     const char *utf8;
13561a3ddf8cSespie     int n;
13571a3ddf8cSespie     if (*fromP == fromLim)
1358525cdfc7Srpointel       return XML_CONVERT_COMPLETED;
13591a3ddf8cSespie     utf8 = uenc->utf8[(unsigned char)**fromP];
13601a3ddf8cSespie     n = *utf8++;
13611a3ddf8cSespie     if (n == 0) {
13621a3ddf8cSespie       int c = uenc->convert(uenc->userData, *fromP);
13631a3ddf8cSespie       n = XmlUtf8Encode(c, buf);
13641a3ddf8cSespie       if (n > toLim - *toP)
1365525cdfc7Srpointel         return XML_CONVERT_OUTPUT_EXHAUSTED;
13661a3ddf8cSespie       utf8 = buf;
13671a3ddf8cSespie       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
13681a3ddf8cSespie                  - (BT_LEAD2 - 2));
136928ce3119Sbluhm     } else {
13701a3ddf8cSespie       if (n > toLim - *toP)
1371525cdfc7Srpointel         return XML_CONVERT_OUTPUT_EXHAUSTED;
13721a3ddf8cSespie       (*fromP)++;
13731a3ddf8cSespie     }
13749b8e2351Sbluhm     memcpy(*toP, utf8, n);
13759b8e2351Sbluhm     *toP += n;
13761a3ddf8cSespie   }
13771a3ddf8cSespie }
13781a3ddf8cSespie 
1379525cdfc7Srpointel static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)138028ce3119Sbluhm unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
138128ce3119Sbluhm                 unsigned short **toP, const unsigned short *toLim) {
13821a3ddf8cSespie   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383525cdfc7Srpointel   while (*fromP < fromLim && *toP < toLim) {
13841a3ddf8cSespie     unsigned short c = uenc->utf16[(unsigned char)**fromP];
13851a3ddf8cSespie     if (c == 0) {
138628ce3119Sbluhm       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
13871a3ddf8cSespie       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
13881a3ddf8cSespie                  - (BT_LEAD2 - 2));
138928ce3119Sbluhm     } else
13901a3ddf8cSespie       (*fromP)++;
13911a3ddf8cSespie     *(*toP)++ = c;
13921a3ddf8cSespie   }
1393525cdfc7Srpointel 
1394525cdfc7Srpointel   if ((*toP == toLim) && (*fromP < fromLim))
1395525cdfc7Srpointel     return XML_CONVERT_OUTPUT_EXHAUSTED;
1396525cdfc7Srpointel   else
1397525cdfc7Srpointel     return XML_CONVERT_COMPLETED;
13981a3ddf8cSespie }
13991a3ddf8cSespie 
14001a3ddf8cSespie ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)140128ce3119Sbluhm XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
140228ce3119Sbluhm                        void *userData) {
14031a3ddf8cSespie   int i;
14041a3ddf8cSespie   struct unknown_encoding *e = (struct unknown_encoding *)mem;
140528ce3119Sbluhm   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
14061a3ddf8cSespie   for (i = 0; i < 128; i++)
14071a3ddf8cSespie     if (latin1_encoding.type[i] != BT_OTHER
140828ce3119Sbluhm         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
14091a3ddf8cSespie       return 0;
14101a3ddf8cSespie   for (i = 0; i < 256; i++) {
14111a3ddf8cSespie     int c = table[i];
14121a3ddf8cSespie     if (c == -1) {
14131a3ddf8cSespie       e->normal.type[i] = BT_MALFORM;
14141a3ddf8cSespie       /* This shouldn't really get used. */
14151a3ddf8cSespie       e->utf16[i] = 0xFFFF;
14161a3ddf8cSespie       e->utf8[i][0] = 1;
14171a3ddf8cSespie       e->utf8[i][1] = 0;
141828ce3119Sbluhm     } else if (c < 0) {
14191a3ddf8cSespie       if (c < -4)
14201a3ddf8cSespie         return 0;
14215837d4fcSbluhm       /* Multi-byte sequences need a converter function */
14225837d4fcSbluhm       if (! convert)
14235837d4fcSbluhm         return 0;
14241a3ddf8cSespie       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
14251a3ddf8cSespie       e->utf8[i][0] = 0;
14261a3ddf8cSespie       e->utf16[i] = 0;
142728ce3119Sbluhm     } else if (c < 0x80) {
14281a3ddf8cSespie       if (latin1_encoding.type[c] != BT_OTHER
142928ce3119Sbluhm           && latin1_encoding.type[c] != BT_NONXML && c != i)
14301a3ddf8cSespie         return 0;
14311a3ddf8cSespie       e->normal.type[i] = latin1_encoding.type[c];
14321a3ddf8cSespie       e->utf8[i][0] = 1;
14331a3ddf8cSespie       e->utf8[i][1] = (char)c;
14341a3ddf8cSespie       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
143528ce3119Sbluhm     } else if (checkCharRefNumber(c) < 0) {
14361a3ddf8cSespie       e->normal.type[i] = BT_NONXML;
14371a3ddf8cSespie       /* This shouldn't really get used. */
14381a3ddf8cSespie       e->utf16[i] = 0xFFFF;
14391a3ddf8cSespie       e->utf8[i][0] = 1;
14401a3ddf8cSespie       e->utf8[i][1] = 0;
144128ce3119Sbluhm     } else {
14421a3ddf8cSespie       if (c > 0xFFFF)
14431a3ddf8cSespie         return 0;
14441a3ddf8cSespie       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
14451a3ddf8cSespie         e->normal.type[i] = BT_NMSTRT;
14461a3ddf8cSespie       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
14471a3ddf8cSespie         e->normal.type[i] = BT_NAME;
14481a3ddf8cSespie       else
14491a3ddf8cSespie         e->normal.type[i] = BT_OTHER;
14501a3ddf8cSespie       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
14511a3ddf8cSespie       e->utf16[i] = (unsigned short)c;
14521a3ddf8cSespie     }
14531a3ddf8cSespie   }
14541a3ddf8cSespie   e->userData = userData;
14551a3ddf8cSespie   e->convert = convert;
14561a3ddf8cSespie   if (convert) {
14571a3ddf8cSespie     e->normal.isName2 = unknown_isName;
14581a3ddf8cSespie     e->normal.isName3 = unknown_isName;
14591a3ddf8cSespie     e->normal.isName4 = unknown_isName;
14601a3ddf8cSespie     e->normal.isNmstrt2 = unknown_isNmstrt;
14611a3ddf8cSespie     e->normal.isNmstrt3 = unknown_isNmstrt;
14621a3ddf8cSespie     e->normal.isNmstrt4 = unknown_isNmstrt;
14631a3ddf8cSespie     e->normal.isInvalid2 = unknown_isInvalid;
14641a3ddf8cSespie     e->normal.isInvalid3 = unknown_isInvalid;
14651a3ddf8cSespie     e->normal.isInvalid4 = unknown_isInvalid;
14661a3ddf8cSespie   }
14671a3ddf8cSespie   e->normal.enc.utf8Convert = unknown_toUtf8;
14681a3ddf8cSespie   e->normal.enc.utf16Convert = unknown_toUtf16;
14691a3ddf8cSespie   return &(e->normal.enc);
14701a3ddf8cSespie }
14711a3ddf8cSespie 
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
14851a3ddf8cSespie 
148628ce3119Sbluhm static const char KW_ISO_8859_1[]
148728ce3119Sbluhm     = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
148828ce3119Sbluhm        ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
148928ce3119Sbluhm static const char KW_US_ASCII[]
149028ce3119Sbluhm     = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
149128ce3119Sbluhm        ASCII_C, ASCII_I, ASCII_I,     '\0'};
149228ce3119Sbluhm static const char KW_UTF_8[]
149328ce3119Sbluhm     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
149428ce3119Sbluhm static const char KW_UTF_16[]
149528ce3119Sbluhm     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
149628ce3119Sbluhm static const char KW_UTF_16BE[]
149728ce3119Sbluhm     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
149828ce3119Sbluhm        ASCII_6, ASCII_B, ASCII_E, '\0'};
149928ce3119Sbluhm static const char KW_UTF_16LE[]
150028ce3119Sbluhm     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
150128ce3119Sbluhm        ASCII_6, ASCII_L, ASCII_E, '\0'};
15021a3ddf8cSespie 
15031a3ddf8cSespie static int FASTCALL
getEncodingIndex(const char * name)150428ce3119Sbluhm getEncodingIndex(const char *name) {
15057d36914fSalek   static const char *const encodingNames[] = {
150628ce3119Sbluhm       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
15071a3ddf8cSespie   };
15081a3ddf8cSespie   int i;
15091a3ddf8cSespie   if (name == NULL)
15101a3ddf8cSespie     return NO_ENC;
15111a3ddf8cSespie   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
15121a3ddf8cSespie     if (streqci(name, encodingNames[i]))
15131a3ddf8cSespie       return i;
15141a3ddf8cSespie   return UNKNOWN_ENC;
15151a3ddf8cSespie }
15161a3ddf8cSespie 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
/* `i` is parenthesized so the cast applies to the whole argument
   expression rather than to its first operand only */
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
15231a3ddf8cSespie 
15241a3ddf8cSespie /* This is what detects the encoding.  encodingTable maps from
15251a3ddf8cSespie    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
15261a3ddf8cSespie    the external (protocol) specified encoding; state is
15271a3ddf8cSespie    XML_CONTENT_STATE if we're parsing an external text entity, and
15281a3ddf8cSespie    XML_PROLOG_STATE otherwise.
15291a3ddf8cSespie */
15301a3ddf8cSespie 
15311a3ddf8cSespie static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)153228ce3119Sbluhm initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
153328ce3119Sbluhm          int state, const char *ptr, const char *end, const char **nextTokPtr) {
15341a3ddf8cSespie   const ENCODING **encPtr;
15351a3ddf8cSespie 
1536525cdfc7Srpointel   if (ptr >= end)
15371a3ddf8cSespie     return XML_TOK_NONE;
15381a3ddf8cSespie   encPtr = enc->encPtr;
15391a3ddf8cSespie   if (ptr + 1 == end) {
15401a3ddf8cSespie     /* only a single byte available for auto-detection */
15411a3ddf8cSespie #ifndef XML_DTD /* FIXME */
15421a3ddf8cSespie     /* a well-formed document entity must have more than one byte */
15431a3ddf8cSespie     if (state != XML_CONTENT_STATE)
15441a3ddf8cSespie       return XML_TOK_PARTIAL;
15451a3ddf8cSespie #endif
15461a3ddf8cSespie     /* so we're parsing an external text entity... */
15471a3ddf8cSespie     /* if UTF-16 was externally specified, then we need at least 2 bytes */
15481a3ddf8cSespie     switch (INIT_ENC_INDEX(enc)) {
15491a3ddf8cSespie     case UTF_16_ENC:
15501a3ddf8cSespie     case UTF_16LE_ENC:
15511a3ddf8cSespie     case UTF_16BE_ENC:
15521a3ddf8cSespie       return XML_TOK_PARTIAL;
15531a3ddf8cSespie     }
15541a3ddf8cSespie     switch ((unsigned char)*ptr) {
15551a3ddf8cSespie     case 0xFE:
15561a3ddf8cSespie     case 0xFF:
15571a3ddf8cSespie     case 0xEF: /* possibly first byte of UTF-8 BOM */
155828ce3119Sbluhm       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15591a3ddf8cSespie         break;
15601a3ddf8cSespie       /* fall through */
15611a3ddf8cSespie     case 0x00:
15621a3ddf8cSespie     case 0x3C:
15631a3ddf8cSespie       return XML_TOK_PARTIAL;
15641a3ddf8cSespie     }
156528ce3119Sbluhm   } else {
15661a3ddf8cSespie     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
15671a3ddf8cSespie     case 0xFEFF:
156828ce3119Sbluhm       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15691a3ddf8cSespie         break;
15701a3ddf8cSespie       *nextTokPtr = ptr + 2;
15711a3ddf8cSespie       *encPtr = encodingTable[UTF_16BE_ENC];
15721a3ddf8cSespie       return XML_TOK_BOM;
15731a3ddf8cSespie     /* 00 3C is handled in the default case */
15741a3ddf8cSespie     case 0x3C00:
15751a3ddf8cSespie       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
15761a3ddf8cSespie            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
15771a3ddf8cSespie           && state == XML_CONTENT_STATE)
15781a3ddf8cSespie         break;
15791a3ddf8cSespie       *encPtr = encodingTable[UTF_16LE_ENC];
15801a3ddf8cSespie       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
15811a3ddf8cSespie     case 0xFFFE:
158228ce3119Sbluhm       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
15831a3ddf8cSespie         break;
15841a3ddf8cSespie       *nextTokPtr = ptr + 2;
15851a3ddf8cSespie       *encPtr = encodingTable[UTF_16LE_ENC];
15861a3ddf8cSespie       return XML_TOK_BOM;
15871a3ddf8cSespie     case 0xEFBB:
15881a3ddf8cSespie       /* Maybe a UTF-8 BOM (EF BB BF) */
15891a3ddf8cSespie       /* If there's an explicitly specified (external) encoding
15901a3ddf8cSespie          of ISO-8859-1 or some flavour of UTF-16
15911a3ddf8cSespie          and this is an external text entity,
15921a3ddf8cSespie          don't look for the BOM,
15931a3ddf8cSespie          because it might be a legal data.
15941a3ddf8cSespie       */
15951a3ddf8cSespie       if (state == XML_CONTENT_STATE) {
15961a3ddf8cSespie         int e = INIT_ENC_INDEX(enc);
159728ce3119Sbluhm         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
159828ce3119Sbluhm             || e == UTF_16_ENC)
15991a3ddf8cSespie           break;
16001a3ddf8cSespie       }
16011a3ddf8cSespie       if (ptr + 2 == end)
16021a3ddf8cSespie         return XML_TOK_PARTIAL;
16031a3ddf8cSespie       if ((unsigned char)ptr[2] == 0xBF) {
16041a3ddf8cSespie         *nextTokPtr = ptr + 3;
16051a3ddf8cSespie         *encPtr = encodingTable[UTF_8_ENC];
16061a3ddf8cSespie         return XML_TOK_BOM;
16071a3ddf8cSespie       }
16081a3ddf8cSespie       break;
16091a3ddf8cSespie     default:
16101a3ddf8cSespie       if (ptr[0] == '\0') {
16111a3ddf8cSespie         /* 0 isn't a legal data character. Furthermore a document
16121a3ddf8cSespie            entity can only start with ASCII characters.  So the only
16131a3ddf8cSespie            way this can fail to be big-endian UTF-16 if it it's an
16141a3ddf8cSespie            external parsed general entity that's labelled as
16151a3ddf8cSespie            UTF-16LE.
16161a3ddf8cSespie         */
16171a3ddf8cSespie         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
16181a3ddf8cSespie           break;
16191a3ddf8cSespie         *encPtr = encodingTable[UTF_16BE_ENC];
16201a3ddf8cSespie         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
162128ce3119Sbluhm       } else if (ptr[1] == '\0') {
16221a3ddf8cSespie         /* We could recover here in the case:
16231a3ddf8cSespie             - parsing an external entity
16241a3ddf8cSespie             - second byte is 0
16251a3ddf8cSespie             - no externally specified encoding
16261a3ddf8cSespie             - no encoding declaration
16271a3ddf8cSespie            by assuming UTF-16LE.  But we don't, because this would mean when
16281a3ddf8cSespie            presented just with a single byte, we couldn't reliably determine
16291a3ddf8cSespie            whether we needed further bytes.
16301a3ddf8cSespie         */
16311a3ddf8cSespie         if (state == XML_CONTENT_STATE)
16321a3ddf8cSespie           break;
16331a3ddf8cSespie         *encPtr = encodingTable[UTF_16LE_ENC];
16341a3ddf8cSespie         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
16351a3ddf8cSespie       }
16361a3ddf8cSespie       break;
16371a3ddf8cSespie     }
16381a3ddf8cSespie   }
16391a3ddf8cSespie   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
16401a3ddf8cSespie   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
16411a3ddf8cSespie }
16421a3ddf8cSespie 
16431a3ddf8cSespie #define NS(x) x
16441a3ddf8cSespie #define ns(x) x
1645b26ab0f8Smatthieu #define XML_TOK_NS_C
16461a3ddf8cSespie #include "xmltok_ns.c"
1647b26ab0f8Smatthieu #undef XML_TOK_NS_C
16481a3ddf8cSespie #undef NS
16491a3ddf8cSespie #undef ns
16501a3ddf8cSespie 
#ifdef XML_NS

/* Include xmltok_ns.c a second time, with NS(x) appending "NS" and
   ns(x) appending "_ns" to each name. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns

/* Same as XmlInitUnknownEncoding, except that on success the byte type
   of ASCII_COLON is set to BT_COLON.  Returns whatever
   XmlInitUnknownEncoding returned. */
ENCODING *
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
                         void *userData) {
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (enc)
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  return enc;
}

#endif /* XML_NS */
1673