/* Make ucnid.h from various sources.
   Copyright (C) 2005, 2009 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */

/* Run this program as
   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
       > ucnid.h
*/

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>

/* Per-code-point property bits stored in flags[].  */
enum {
  C99 = 1,
  CXX = 2,
  digit = 4,
  not_NFC = 8,
  not_NFKC = 16,
  maybe_not_NFC = 32
};

/* Only code points 0 .. MAX_CODE_POINT are tabulated.  */
enum {
  NUM_CODE_POINTS = 0x10000,
  MAX_CODE_POINT = NUM_CODE_POINTS - 1
};

static unsigned flags[NUM_CODE_POINTS];
static unsigned short decomp[NUM_CODE_POINTS][2];
static unsigned char combining_value[NUM_CODE_POINTS];

/* Die!  Print S on stderr and exit with status 1.  */

static void
fail (const char *s)
{
  fprintf (stderr, "%s\n", s);
  exit (1);
}

/* Read ucnid.tab (FNAME) and set the C99 and CXX flags in flags[].
   A "[C99]" or "[CXX]" line selects the flag applied to the hexadecimal
   code points and START-END ranges on the lines that follow; entries
   seen before either marker set no flag.  Any parse error is fatal.  */

static void
read_ucnid (const char *fname)
{
  FILE *f = fopen (fname, "r");
  unsigned fl = 0;

  if (!f)
    fail ("opening ucnid.tab");
  for (;;)
    {
      char line[256];

      if (!fgets (line, sizeof (line), f))
        break;
      if (strcmp (line, "[C99]\n") == 0)
        fl = C99;
      else if (strcmp (line, "[CXX]\n") == 0)
        fl = CXX;
      else if (isxdigit ((unsigned char) line[0]))
        {
          char *l = line;
          while (*l)
            {
              unsigned long start, end;
              char *endptr;

              start = strtoul (l, &endptr, 16);
              if (endptr == l
                  || (*endptr != '-' && ! isspace ((unsigned char) *endptr)))
                fail ("parsing ucnid.tab [1]");
              l = endptr;
              if (*l != '-')
                end = start;
              else
                {
                  end = strtoul (l + 1, &endptr, 16);
                  /* strtoul leaves ENDPTR at its input when it converts
                     nothing; do not mistake that for an end of zero.  */
                  if (endptr == l + 1)
                    fail ("parsing ucnid.tab, missing end of range");
                  if (end < start)
                    fail ("parsing ucnid.tab, end before start");
                  l = endptr;
                  if (! isspace ((unsigned char) *l))
                    fail ("parsing ucnid.tab, junk after range");
                }
              while (isspace ((unsigned char) *l))
                l++;
              if (end > MAX_CODE_POINT)
                fail ("parsing ucnid.tab, end too large");
              while (start <= end)
                flags[start++] |= fl;
            }
        }
    }
  if (ferror (f))
    fail ("reading ucnid.tab");
  fclose (f);
}

/* Return a pointer just past the next ';' at or after L, i.e. the start
   of the following field.  Die if the line ends before a ';' is found,
   rather than scanning past the terminating NUL.  */

static char *
next_field (char *l)
{
  while (*l != ';')
    {
      if (*l == '\0')
        fail ("parsing UnicodeData.txt, too few fields");
      l++;
    }
  return l + 1;
}

/* Read UnicodeData.txt (FNAME) and set the 'digit' flag, and
   also fill in the 'decomp' table to be the decompositions of
   characters for which both the character decomposed and all the code
   points in the decomposition are either C99 or CXX.  The canonical
   combining class of each such character is recorded in
   combining_value[].  Lines for code points above MAX_CODE_POINT, or
   that are neither C99 nor CXX, are skipped.  */

static void
read_table (const char *fname)
{
  FILE *f = fopen (fname, "r");

  if (!f)
    fail ("opening UnicodeData.txt");
  for (;;)
    {
      char line[256];
      unsigned long codepoint, combining, this_decomp[4];
      char *l;
      int i;
      unsigned decomp_useful;

      if (!fgets (line, sizeof (line), f))
        break;
      codepoint = strtoul (line, &l, 16);
      if (l == line || *l != ';')
        fail ("parsing UnicodeData.txt, reading code point");
      if (codepoint > MAX_CODE_POINT || ! (flags[codepoint] & (C99 | CXX)))
        continue;

      /* Skip over the character name.  */
      l = next_field (l + 1);

      /* Category value; things starting with 'N' are numbers of some
         kind.  */
      if (*l == 'N')
        flags[codepoint] |= digit;
      l = next_field (l);

      /* Canonical combining class; in NFC/NFKC, they must be increasing
         (or zero).  */
      if (! isdigit ((unsigned char) *l))
        fail ("parsing UnicodeData.txt, combining class not number");
      combining = strtoul (l, &l, 10);
      if (*l++ != ';')
        fail ("parsing UnicodeData.txt, junk after combining class");
      /* combining_value[] holds unsigned char; refuse to truncate.  */
      if (combining > 255)
        fail ("parsing UnicodeData.txt, combining class too large");
      combining_value[codepoint] = (unsigned char) combining;

      /* Skip over bidi value.  */
      l = next_field (l);

      /* Decomposition mapping.  */
      decomp_useful = flags[codepoint];
      if (*l == '<')  /* Compatibility mapping.  */
        continue;
      for (i = 0; i < 4; i++)
        {
          if (*l == ';')
            break;
          if (!isxdigit ((unsigned char) *l))
            fail ("parsing UnicodeData.txt, decomposition format");
          this_decomp[i] = strtoul (l, &l, 16);
          /* A component outside the table has no flags and would not
             fit in decomp[]; such a decomposition is never useful.  */
          if (this_decomp[i] > MAX_CODE_POINT)
            decomp_useful = 0;
          else
            decomp_useful &= flags[this_decomp[i]];
          while (isspace ((unsigned char) *l))
            l++;
        }
      if (i > 2)  /* Decomposition too long.  */
        fail ("parsing UnicodeData.txt, decomposition too long");
      if (decomp_useful)
        while (--i >= 0)
          decomp[codepoint][i] = (unsigned short) this_decomp[i];
    }
  if (ferror (f))
    fail ("reading UnicodeData.txt");
  fclose (f);
}

/* Read DerivedNormalizationProps.txt (FNAME) and set the flags that say
   whether a character is in NFC, NFKC, or is context-dependent.  Only
   lines containing "; NFC_QC; N", "; NFKC_QC; N" or "; NFC_QC; M" are
   used; each starts with a code point or a START..END range.  */

static void
read_derived (const char *fname)
{
  FILE *f = fopen (fname, "r");

  if (!f)
    fail ("opening DerivedNormalizationProps.txt");
  for (;;)
    {
      char line[256];
      unsigned long start, end;
      char *l;
      bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
      unsigned new_flags;

      if (!fgets (line, sizeof (line), f))
        break;
      not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
      not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
      maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
      if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
        continue;

      start = strtoul (line, &l, 16);
      if (l == line)
        fail ("parsing DerivedNormalizationProps.txt, reading start");
      if (start > MAX_CODE_POINT)
        continue;
      if (*l == '.' && l[1] == '.')
        {
          char *range_end = l + 2;

          end = strtoul (range_end, &l, 16);
          if (l == range_end)
            fail ("parsing DerivedNormalizationProps.txt, reading end");
        }
      else
        end = start;

      /* Ranges may extend beyond the tabulated code points; only the
         part that lies inside flags[] can be recorded.  */
      if (end > MAX_CODE_POINT)
        end = MAX_CODE_POINT;

      new_flags = ((not_NFC_p ? not_NFC : 0)
                   | (not_NFKC_p ? not_NFKC : 0)
                   | (maybe_not_NFC_p ? maybe_not_NFC : 0));
      while (start <= end)
        flags[start++] |= new_flags;
    }
  if (ferror (f))
    fail ("reading DerivedNormalizationProps.txt");
  fclose (f);
}

/* Write out the table.
   Each entry describes a run of consecutive code points and consists of
   the flags for the run, its canonical combining class, and the last
   code point of the run.  A new run starts whenever the combining class
   changes, whenever the presence of a recorded decomposition changes,
   or whenever the flags change and either side is C99 or CXX.  */

static void
write_table (void)
{
  unsigned i;
  unsigned last_flag = flags[0];
  bool really_safe = decomp[0][0] == 0;
  unsigned char last_combine = combining_value[0];

  for (i = 1; i <= NUM_CODE_POINTS; i++)
    if (i == NUM_CODE_POINTS
        || (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
        || really_safe != (decomp[i][0] == 0)
        || combining_value[i] != last_combine)
      {
        printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
                last_flag & C99 ? "C99" : " 0",
                last_flag & digit ? "DIG" : " 0",
                last_flag & CXX ? "CXX" : " 0",
                really_safe ? "CID" : " 0",
                last_flag & not_NFC ? " 0" : "NFC",
                last_flag & not_NFKC ? " 0" : "NKC",
                last_flag & maybe_not_NFC ? "CTX" : " 0",
                last_combine,
                i - 1);

        /* The final entry has been written; index NUM_CODE_POINTS is
           one past the end of every table, so do not read it.  */
        if (i == NUM_CODE_POINTS)
          break;

        /* Start the next run from code point I itself, so that every
           run has a single combining class.  */
        last_flag = flags[i];
        last_combine = combining_value[i];
        really_safe = decomp[i][0] == 0;
      }
}

/* Print out the huge copyright notice.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n"
    "\n"
    "This program is free software; you can redistribute it and/or modify it\n"
    "under the terms of the GNU General Public License as published by the\n"
    "Free Software Foundation; either version 3, or (at your option) any\n"
    "later version.\n"
    "\n"
    "This program is distributed in the hope that it will be useful,\n"
    "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    "GNU General Public License for more details.\n"
    "\n"
    "You should have received a copy of the GNU General Public License\n"
    "along with this program; see the file COPYING3. If not see\n"
    "<http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    "Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n"
    "Distributed under the Terms of Use in\n"
    "http://www.unicode.org/copyright.html.\n"
    "\n"
    "Permission is hereby granted, free of charge, to any person\n"
    "obtaining a copy of the Unicode data files and any associated\n"
    "documentation (the \"Data Files\") or Unicode software and any\n"
    "associated documentation (the \"Software\") to deal in the Data Files\n"
    "or Software without restriction, including without limitation the\n"
    "rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "sell copies of the Data Files or Software, and to permit persons to\n"
    "whom the Data Files or Software are furnished to do so, provided\n"
    "that (a) the above copyright notice(s) and this permission notice\n"
    "appear with all copies of the Data Files or Software, (b) both the\n"
    "above copyright notice(s) and this permission notice appear in\n"
    "associated documentation, and (c) there is clear notice in each\n"
    "modified Data File or in the Software as well as in the\n"
    "documentation associated with the Data File(s) or Software that the\n"
    "data or software has been modified.\n"
    "\n"
    "THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "Except as contained in this notice, the name of a copyright holder\n"
    "shall not be used in advertising or otherwise to promote the sale,\n"
    "use or other dealings in these Data Files or Software without prior\n"
    "written authorization of the copyright holder. */\n";

  puts (copyright);
}

/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in that
   order.  The generated table is written to standard output.  */

int
main (int argc, char **argv)
{
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();

  /* The output is normally redirected into ucnid.h; do not report
     success if it could not be written completely.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing ucnid.h");
  return 0;
}