xref: /netbsd-src/external/gpl3/gcc/dist/libcpp/makeucnid.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Make ucnid.h from various sources.
2    Copyright (C) 2005-2022 Free Software Foundation, Inc.
3 
4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published by the
6 Free Software Foundation; either version 3, or (at your option) any
7 later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; see the file COPYING3.  If not see
16 <http://www.gnu.org/licenses/>.  */
17 
18 /* Run this program as
19    ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
20       DerivedCoreProperties.txt > ucnid.h
21 */
22 
23 #include <stdio.h>
24 #include <string.h>
25 #include <ctype.h>
26 #include <stdbool.h>
27 #include <stdlib.h>
28 
/* Bits stored in flags[] for each code point.  C99, CXX and C11 come
   from the like-named sections of ucnid.tab; N99 and N11 accompany C99
   and C11 for the [C99DIG] and [C11NOSTART] sections.  CXX23 is set for
   XID_Start and XID_Continue code points, with NXX23 added for those
   that are XID_Continue only.  not_NFC, not_NFKC and maybe_not_NFC
   record NFC_QC=N, NFKC_QC=N and NFC_QC=M respectively.  */
enum {
  C99 = 1 << 0,
  CXX = 1 << 1,
  N99 = 1 << 2,
  C11 = 1 << 3,
  N11 = 1 << 4,
  CXX23 = 1 << 5,
  NXX23 = 1 << 6,
  all_languages = C99 | CXX | C11 | CXX23 | NXX23,
  not_NFC = 1 << 7,
  not_NFKC = 1 << 8,
  maybe_not_NFC = 1 << 9
};

/* Largest valid code point, and the number of code points (which is
   the size of every per-code-point table below).  */
#define MAX_CODE_POINT 0x10ffff
#define NUM_CODE_POINTS (MAX_CODE_POINT + 1)

/* Flag bits from the enum above, indexed by code point.  */
static unsigned int flags[NUM_CODE_POINTS];

/* Canonical decomposition (at most two code points, zero when absent)
   of every code point that has one.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];

/* As all_decomp, but only filled in when the code point and its whole
   decomposition are usable in identifiers.  */
static unsigned int decomp[NUM_CODE_POINTS][2];

/* Canonical combining class, indexed by code point.  */
static unsigned char combining_value[NUM_CODE_POINTS];
50 
/* Print the message S and a newline on stderr, then terminate the
   program with exit status 1.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
59 
60 /* Read ucnid.tab and set the flags for language versions in header[].  */
61 
62 static void
read_ucnid(const char * fname)63 read_ucnid (const char *fname)
64 {
65   FILE *f = fopen (fname, "r");
66   unsigned fl = 0;
67 
68   if (!f)
69     fail ("opening ucnid.tab");
70   for (;;)
71     {
72       char line[256];
73 
74       if (!fgets (line, sizeof (line), f))
75 	break;
76       if (strcmp (line, "[C99]\n") == 0)
77 	fl = C99;
78       else if (strcmp (line, "[C99DIG]\n") == 0)
79 	fl = C99|N99;
80       else if (strcmp (line, "[CXX]\n") == 0)
81 	fl = CXX;
82       else if (strcmp (line, "[C11]\n") == 0)
83 	fl = C11;
84       else if (strcmp (line, "[C11NOSTART]\n") == 0)
85 	fl = C11|N11;
86       else if (isxdigit (line[0]))
87 	{
88 	  char *l = line;
89 	  while (*l)
90 	    {
91 	      unsigned long start, end;
92 	      char *endptr;
93 	      start = strtoul (l, &endptr, 16);
94 	      if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
95 		fail ("parsing ucnid.tab [1]");
96 	      l = endptr;
97 	      if (*l != '-')
98 		end = start;
99 	      else
100 		{
101 		  end = strtoul (l + 1, &endptr, 16);
102 		  if (end < start)
103 		    fail ("parsing ucnid.tab, end before start");
104 		  l = endptr;
105 		  if (! isspace (*l))
106 		    fail ("parsing ucnid.tab, junk after range");
107 		}
108 	      while (isspace (*l))
109 		l++;
110 	      if (end > MAX_CODE_POINT)
111 		fail ("parsing ucnid.tab, end too large");
112 	      while (start <= end)
113 		flags[start++] |= fl;
114 	    }
115 	}
116     }
117   if (ferror (f))
118     fail ("reading ucnid.tab");
119   fclose (f);
120 }
121 
122 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
123    decompositions of characters for which both the character
124    decomposed and all the code points in the decomposition are valid
125    for some supported language version, and the 'all_decomp' table to
126    be the decompositions of all characters without those
127    constraints.  */
128 
129 static void
read_table(char * fname)130 read_table (char *fname)
131 {
132   FILE * f = fopen (fname, "r");
133 
134   if (!f)
135     fail ("opening UnicodeData.txt");
136   for (;;)
137     {
138       char line[256];
139       unsigned long codepoint, this_decomp[4];
140       char *l;
141       int i, j;
142       int decomp_useful;
143 
144       if (!fgets (line, sizeof (line), f))
145 	break;
146       codepoint = strtoul (line, &l, 16);
147       if (l == line || *l != ';')
148 	fail ("parsing UnicodeData.txt, reading code point");
149       if (codepoint > MAX_CODE_POINT)
150 	fail ("parsing UnicodeData.txt, code point too large");
151 
152       do {
153 	l++;
154       } while (*l != ';');
155       /* Category value.  */
156       do {
157 	l++;
158       } while (*l != ';');
159       /* Canonical combining class; in NFC/NFKC, they must be increasing
160 	 (or zero).  */
161       if (! isdigit (*++l))
162 	fail ("parsing UnicodeData.txt, combining class not number");
163       combining_value[codepoint] = strtoul (l, &l, 10);
164       if (*l++ != ';')
165 	fail ("parsing UnicodeData.txt, junk after combining class");
166 
167       /* Skip over bidi value.  */
168       do {
169 	l++;
170       } while (*l != ';');
171 
172       /* Decomposition mapping.  */
173       decomp_useful = flags[codepoint];
174       if (*++l == '<')  /* Compatibility mapping. */
175 	continue;
176       for (i = 0; i < 4; i++)
177 	{
178 	  if (*l == ';')
179 	    break;
180 	  if (!isxdigit (*l))
181 	    fail ("parsing UnicodeData.txt, decomposition format");
182 	  this_decomp[i] = strtoul (l, &l, 16);
183 	  decomp_useful &= flags[this_decomp[i]];
184 	  while (isspace (*l))
185 	    l++;
186 	}
187       if (i > 2)  /* Decomposition too long.  */
188 	fail ("parsing UnicodeData.txt, decomposition too long");
189       for (j = 0; j < i; j++)
190 	all_decomp[codepoint][j] = this_decomp[j];
191       if ((flags[codepoint] & all_languages) && decomp_useful)
192 	while (--i >= 0)
193 	  decomp[codepoint][i] = this_decomp[i];
194     }
195   if (ferror (f))
196     fail ("reading UnicodeData.txt");
197   fclose (f);
198 }
199 
200 /* Read DerivedNormalizationProps.txt and set the flags that say whether
201    a character is in NFC, NFKC, or is context-dependent.  */
202 
203 static void
read_derived(const char * fname)204 read_derived (const char *fname)
205 {
206   FILE * f = fopen (fname, "r");
207 
208   if (!f)
209     fail ("opening DerivedNormalizationProps.txt");
210   for (;;)
211     {
212       char line[256];
213       unsigned long start, end;
214       char *l;
215       bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
216 
217       if (!fgets (line, sizeof (line), f))
218 	break;
219       not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
220       not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
221       maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
222       if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
223 	continue;
224 
225       start = strtoul (line, &l, 16);
226       if (l == line)
227 	fail ("parsing DerivedNormalizationProps.txt, reading start");
228       if (start > MAX_CODE_POINT)
229 	fail ("parsing DerivedNormalizationProps.txt, code point too large");
230       if (*l == '.' && l[1] == '.')
231 	end = strtoul (l + 2, &l, 16);
232       else
233 	end = start;
234 
235       while (start <= end)
236 	flags[start++] |= ((not_NFC_p ? not_NFC : 0)
237 			   | (not_NFKC_p ? not_NFKC : 0)
238 			   | (maybe_not_NFC_p ? maybe_not_NFC : 0)
239 			   );
240     }
241   if (ferror (f))
242     fail ("reading DerivedNormalizationProps.txt");
243   fclose (f);
244 }
245 
246 /* Read DerivedCoreProperties.txt and fill in languages version in
247    flags from the XID_Start and XID_Continue properties.  */
248 
249 static void
read_derivedcore(char * fname)250 read_derivedcore (char *fname)
251 {
252   FILE * f = fopen (fname, "r");
253 
254   if (!f)
255     fail ("opening DerivedCoreProperties.txt");
256   for (;;)
257     {
258       char line[256];
259       unsigned long codepoint_start, codepoint_end;
260       char *l;
261       int i, j;
262 
263       if (!fgets (line, sizeof (line), f))
264 	break;
265       if (line[0] == '#' || line[0] == '\n' || line[0] == '\r')
266 	continue;
267       codepoint_start = strtoul (line, &l, 16);
268       if (l == line)
269 	fail ("parsing DerivedCoreProperties.txt, reading code point");
270       if (codepoint_start > MAX_CODE_POINT)
271 	fail ("parsing DerivedCoreProperties.txt, code point too large");
272 
273       if (*l == '.' && l[1] == '.')
274 	{
275 	  char *l2 = l + 2;
276 	  codepoint_end = strtoul (l + 2, &l, 16);
277 	  if (l == l2 || codepoint_end < codepoint_start)
278 	    fail ("parsing DerivedCoreProperties.txt, reading code point");
279 	  if (codepoint_end > MAX_CODE_POINT)
280 	    fail ("parsing DerivedCoreProperties.txt, code point too large");
281 	}
282       else
283 	codepoint_end = codepoint_start;
284 
285       while (*l == ' ')
286 	l++;
287       if (*l++ != ';')
288 	fail ("parsing DerivedCoreProperties.txt, reading code point");
289 
290       while (*l == ' ')
291 	l++;
292 
293       if (codepoint_end < 0x80)
294         continue;
295 
296       if (strncmp (l, "XID_Start ", 10) == 0)
297 	{
298 	  for (; codepoint_start <= codepoint_end; codepoint_start++)
299 	    flags[codepoint_start]
300 	      = (flags[codepoint_start] | CXX23) & ~NXX23;
301 	}
302       else if (strncmp (l, "XID_Continue ", 13) == 0)
303 	{
304 	  for (; codepoint_start <= codepoint_end; codepoint_start++)
305 	    if ((flags[codepoint_start] & CXX23) == 0)
306 	      flags[codepoint_start] |= CXX23 | NXX23;
307 	}
308     }
309   if (ferror (f))
310     fail ("reading DerivedCoreProperties.txt");
311   fclose (f);
312 }
313 
314 /* Write out the table.
315    The table consists of two words per entry.  The first word is the flags
316    for the unicode code points up to and including the second word.  */
317 
318 static void
write_table(void)319 write_table (void)
320 {
321   unsigned i;
322   unsigned last_flag = flags[0];
323   bool really_safe = decomp[0][0] == 0;
324   unsigned char last_combine = combining_value[0];
325 
326   printf ("static const struct ucnrange ucnranges[] = {\n");
327 
328   for (i = 1; i <= NUM_CODE_POINTS; i++)
329     if (i == NUM_CODE_POINTS
330 	|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
331 	|| really_safe != (decomp[i][0] == 0)
332 	|| combining_value[i] != last_combine)
333       {
334 	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
335 		last_flag & C99 ? "C99" : "  0",
336 		last_flag & N99 ? "N99" : "  0",
337 		last_flag & CXX ? "CXX" : "  0",
338 		last_flag & C11 ? "C11" : "  0",
339 		last_flag & N11 ? "N11" : "  0",
340 		last_flag & CXX23 ? "CXX23" : "    0",
341 		last_flag & NXX23 ? "NXX23" : "    0",
342 		really_safe ? "CID" : "  0",
343 		last_flag & not_NFC ? "  0" : "NFC",
344 		last_flag & not_NFKC ? "  0" : "NKC",
345 		last_flag & maybe_not_NFC ? "CTX" : "  0",
346 		combining_value[i - 1],
347 		i - 1);
348 	last_flag = flags[i];
349 	last_combine = combining_value[i];
350 	really_safe = decomp[i][0] == 0;
351       }
352 
353   printf ("};\n");
354 }
355 
356 /* Return whether a given character is valid in an identifier for some
357    supported language, either as itself or as a UCN.  */
358 
359 static bool
char_id_valid(unsigned int c)360 char_id_valid (unsigned int c)
361 {
362   return ((flags[c] & all_languages)
363 	  || (c == 0x24)
364 	  || (c >= 0x30 && c <= 0x39)
365 	  || (c >= 0x41 && c <= 0x5a)
366 	  || (c >= 0x61 && c <= 0x7a));
367 }
368 
369 /* Write out the switch statement over characters for which it is
370    context-dependent whether they are in NFC.  */
371 
372 static void
write_context_switch(void)373 write_context_switch (void)
374 {
375   unsigned i;
376   printf ("static bool\n"
377 	  "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
378 	  "{\n"
379 	  "  switch (c)\n"
380 	  "    {\n");
381   for (i = 0; i < NUM_CODE_POINTS; i++)
382     {
383       bool found_case = false;
384       unsigned j;
385       if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
386 	continue;
387       if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
388 	continue; /* Hangul handled algorithmically.  */
389       printf ("    case %#06x:\n"
390 	      "      switch (p)\n"
391 	      "\t{\n", i);
392       /* If an NFC starter character decomposes with this character I
393 	 as the second character and an NFC starter character S as the
394 	 first character, that latter character as a previous
395 	 character means this character is not NFC.  Furthermore, any
396 	 NFC starter character K made by a series of compositions of S
397 	 with combining characters whose combining class is greater
398 	 than that of I also means this character is not NFC.  */
399       for (j = 0; j < NUM_CODE_POINTS; j++)
400 	{
401 	  unsigned s, k;
402 	  if (all_decomp[j][1] != i)
403 	    continue;
404 	  s = all_decomp[j][0];
405 	  if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
406 	    continue;
407 	  if (char_id_valid (s))
408 	    {
409 	      found_case = true;
410 	      printf ("\tcase %#06x:\n", s);
411 	    }
412 	  for (k = 0; k < NUM_CODE_POINTS; k++)
413 	    {
414 	      unsigned t = k;
415 	      if (k == s || !char_id_valid (k))
416 		continue;
417 	      while (all_decomp[t][1] != 0
418 		     && combining_value[all_decomp[t][1]] > combining_value[i])
419 		{
420 		  if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
421 		    break;
422 		  t = all_decomp[t][0];
423 		}
424 	      if (t == s)
425 		{
426 		  found_case = true;
427 		  printf ("\tcase %#06x:\n", k);
428 		}
429 	    }
430 	}
431       if (found_case)
432 	printf ("\t  return false;\n");
433       else
434 	printf ("\t/* Non-NFC cases not applicable to C/C++.  */\n");
435       printf ("\tdefault:\n"
436 	      "\t  return true;\n"
437 	      "\t}\n\n");
438     }
439   printf ("    default:\n"
440 	  "      cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
441 	  "      return true;\n"
442 	  "  }\n"
443 	  "}\n");
444 }
445 
/* Write the copyright and license comment that heads the generated
   file on stdout, followed by an empty line.  */

static void
write_copyright (void)
{
  static const char notice[] =
    "/* Unicode characters and various properties.\n"
    "   Copyright (C) 2003-2022 Free Software Foundation, Inc.\n"
    "\n"
    "   This program is free software; you can redistribute it and/or modify it\n"
    "   under the terms of the GNU General Public License as published by the\n"
    "   Free Software Foundation; either version 3, or (at your option) any\n"
    "   later version.\n"
    "\n"
    "   This program is distributed in the hope that it will be useful,\n"
    "   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
    "   GNU General Public License for more details.\n"
    "\n"
    "   You should have received a copy of the GNU General Public License\n"
    "   along with this program; see the file COPYING3.  If not see\n"
    "   <http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    "   Copyright (C) 1991-2005 Unicode, Inc.  All rights reserved.\n"
    "   Distributed under the Terms of Use in\n"
    "   http://www.unicode.org/copyright.html.\n"
    "\n"
    "   Permission is hereby granted, free of charge, to any person\n"
    "   obtaining a copy of the Unicode data files and any associated\n"
    "   documentation (the \"Data Files\") or Unicode software and any\n"
    "   associated documentation (the \"Software\") to deal in the Data Files\n"
    "   or Software without restriction, including without limitation the\n"
    "   rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "   sell copies of the Data Files or Software, and to permit persons to\n"
    "   whom the Data Files or Software are furnished to do so, provided\n"
    "   that (a) the above copyright notice(s) and this permission notice\n"
    "   appear with all copies of the Data Files or Software, (b) both the\n"
    "   above copyright notice(s) and this permission notice appear in\n"
    "   associated documentation, and (c) there is clear notice in each\n"
    "   modified Data File or in the Software as well as in the\n"
    "   documentation associated with the Data File(s) or Software that the\n"
    "   data or software has been modified.\n"
    "\n"
    "   THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "   OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "   WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "   NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "   COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "   ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "   DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "   WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "   ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "   OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "   Except as contained in this notice, the name of a copyright holder\n"
    "   shall not be used in advertising or otherwise to promote the sale,\n"
    "   use or other dealings in these Data Files or Software without prior\n"
    "   written authorization of the copyright holder.  */\n";

  fputs (notice, stdout);
  putchar ('\n');
}
508 
/* Main program.  Takes exactly four arguments, the paths of ucnid.tab,
   UnicodeData.txt, DerivedNormalizationProps.txt and
   DerivedCoreProperties.txt, reads them in that order and writes the
   generated header on stdout.  Returns 0 on success; every error
   terminates the program through fail.  */

int
main (int argc, char **argv)
{
  /* Covers both too few and too many arguments.  */
  if (argc != 5)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt DerivedCoreProperties.txt");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);
  read_derivedcore (argv[4]);

  write_copyright ();
  write_table ();
  write_context_switch ();

  /* Do not report success if part of the output was lost.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing ucnid.h");
  return 0;
}
526