xref: /netbsd-src/external/gpl2/gettext/dist/gnulib-local/lib/gen-lbrkprop.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Generate a Unicode conforming Line Break Properties tables from a
2    UnicodeData file.
3    Written by Bruno Haible <bruno@clisp.org>, 2000-2004.
4 
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 /* Usage example:
20      $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \
21 		    Combining.txt \
22 		    /usr/local/share/Unidata/EastAsianWidth.txt \
23 		    /usr/local/share/Unidata/LineBreak.txt \
24 		    3.1.0
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <stdbool.h>
30 #include <stdint.h>
31 #include <string.h>
32 #include <time.h>
33 
/* This structure represents one line in the UnicodeData.txt file.
   Once fill_attribute has stored an entry, none of its string members
   is NULL; a field that was empty in the file is stored as "".  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file.  */
struct unicode_attribute unicode_attributes [0x110000];

/* Returns a private copy of FIELD, or the shared literal "" when FIELD is
   empty.  The copy is needed because the caller's buffer is reused for the
   next line.  Exits with an error message when memory is exhausted, so the
   result is never NULL.  */
static const char *
dup_field (const char *field)
{
  size_t size;
  char *copy;

  if (field[0] == '\0')
    return "";

  size = strlen (field) + 1;
  copy = malloc (size);
  if (copy == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  memcpy (copy, field, size);
  return copy;
}

/* Converts a hexadecimal case mapping FIELD to a code point.
   Returns NONE when FIELD is empty.  */
static unsigned int
parse_mapping (const char *field)
{
  if (field[0] == '\0')
    return NONE;
  return (unsigned int) strtoul (field, NULL, 16);
}

/* Stores in unicode_attributes[i] the values from the given fields.
   FIELD1 ... FIELD14 are the fields of one UnicodeData.txt line that follow
   the code point, in file order.  FIELD9 is reduced to a flag that is set
   when it starts with 'Y'; FIELD12 ... FIELD14 are parsed as hexadecimal.
   Exits with an error message if I is not below 0x110000.  */
static void
fill_attribute (unsigned int i,
                const char *field1, const char *field2,
                const char *field3, const char *field4,
                const char *field5, const char *field6,
                const char *field7, const char *field8,
                const char *field9, const char *field10,
                const char *field11, const char *field12,
                const char *field13, const char *field14)
{
  struct unicode_attribute * uni;

  if (i >= 0x110000)
    {
      fprintf (stderr, "index too large\n");
      exit (1);
    }
  uni = &unicode_attributes[i];
  /* Copy the strings.  */
  uni->name          = dup_field (field1);
  uni->category      = dup_field (field2);
  uni->combining     = dup_field (field3);
  uni->bidi          = dup_field (field4);
  uni->decomposition = dup_field (field5);
  uni->decdigit      = dup_field (field6);
  uni->digit         = dup_field (field7);
  uni->numeric       = dup_field (field8);
  uni->mirrored      = (field9[0] == 'Y');
  uni->oldname       = dup_field (field10);
  uni->comment       = dup_field (field11);
  /* Convert the case mappings.  */
  uni->upper = parse_mapping (field12);
  uni->lower = parse_mapping (field13);
  uni->title = parse_mapping (field14);
}
95 
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; DELIM itself is consumed.
   Carriage returns are dropped wherever they occur.
   BUFFER is always NUL-terminated on return, also when EOF is reached
   before DELIM; in that case it holds whatever was read so far.
   Exits with an error message if the field does not fit.
   Returns 1 when a field was successfully read (DELIM was found),
   otherwise 0.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer, keeping room for the terminating NUL.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate unconditionally so that the buffer never holds an
     unterminated string, even for a truncated last line.  */
  *buffer = '\0';

  return c != EOF;
}
130 
131 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
132    file.  */
133 static void
fill_attributes(const char * unicodedata_filename)134 fill_attributes (const char *unicodedata_filename)
135 {
136   unsigned int i, j;
137   FILE *stream;
138   char field0[FIELDLEN];
139   char field1[FIELDLEN];
140   char field2[FIELDLEN];
141   char field3[FIELDLEN];
142   char field4[FIELDLEN];
143   char field5[FIELDLEN];
144   char field6[FIELDLEN];
145   char field7[FIELDLEN];
146   char field8[FIELDLEN];
147   char field9[FIELDLEN];
148   char field10[FIELDLEN];
149   char field11[FIELDLEN];
150   char field12[FIELDLEN];
151   char field13[FIELDLEN];
152   char field14[FIELDLEN];
153   int lineno = 0;
154 
155   for (i = 0; i < 0x110000; i++)
156     unicode_attributes[i].name = NULL;
157 
158   stream = fopen (unicodedata_filename, "r");
159   if (stream == NULL)
160     {
161       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
162       exit (1);
163     }
164 
165   for (;;)
166     {
167       int n;
168 
169       lineno++;
170       n = getfield (stream, field0, ';');
171       n += getfield (stream, field1, ';');
172       n += getfield (stream, field2, ';');
173       n += getfield (stream, field3, ';');
174       n += getfield (stream, field4, ';');
175       n += getfield (stream, field5, ';');
176       n += getfield (stream, field6, ';');
177       n += getfield (stream, field7, ';');
178       n += getfield (stream, field8, ';');
179       n += getfield (stream, field9, ';');
180       n += getfield (stream, field10, ';');
181       n += getfield (stream, field11, ';');
182       n += getfield (stream, field12, ';');
183       n += getfield (stream, field13, ';');
184       n += getfield (stream, field14, '\n');
185       if (n == 0)
186 	break;
187       if (n != 15)
188 	{
189 	  fprintf (stderr, "short line in'%s':%d\n",
190 		   unicodedata_filename, lineno);
191 	  exit (1);
192 	}
193       i = strtoul (field0, NULL, 16);
194       if (field1[0] == '<'
195 	  && strlen (field1) >= 9
196 	  && !strcmp (field1 + strlen(field1) - 8, ", First>"))
197 	{
198 	  /* Deal with a range. */
199 	  lineno++;
200 	  n = getfield (stream, field0, ';');
201 	  n += getfield (stream, field1, ';');
202 	  n += getfield (stream, field2, ';');
203 	  n += getfield (stream, field3, ';');
204 	  n += getfield (stream, field4, ';');
205 	  n += getfield (stream, field5, ';');
206 	  n += getfield (stream, field6, ';');
207 	  n += getfield (stream, field7, ';');
208 	  n += getfield (stream, field8, ';');
209 	  n += getfield (stream, field9, ';');
210 	  n += getfield (stream, field10, ';');
211 	  n += getfield (stream, field11, ';');
212 	  n += getfield (stream, field12, ';');
213 	  n += getfield (stream, field13, ';');
214 	  n += getfield (stream, field14, '\n');
215 	  if (n != 15)
216 	    {
217 	      fprintf (stderr, "missing end range in '%s':%d\n",
218 		       unicodedata_filename, lineno);
219 	      exit (1);
220 	    }
221 	  if (!(field1[0] == '<'
222 		&& strlen (field1) >= 8
223 		&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224 	    {
225 	      fprintf (stderr, "missing end range in '%s':%d\n",
226 		       unicodedata_filename, lineno);
227 	      exit (1);
228 	    }
229 	  field1[strlen (field1) - 7] = '\0';
230 	  j = strtoul (field0, NULL, 16);
231 	  for (; i <= j; i++)
232 	    fill_attribute (i, field1+1, field2, field3, field4, field5,
233 			       field6, field7, field8, field9, field10,
234 			       field11, field12, field13, field14);
235 	}
236       else
237 	{
238 	  /* Single character line */
239 	  fill_attribute (i, field1, field2, field3, field4, field5,
240 			     field6, field7, field8, field9, field10,
241 			     field11, field12, field13, field14);
242 	}
243     }
244   if (ferror (stream) || fclose (stream))
245     {
246       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
247       exit (1);
248     }
249 }
250 
/* The combining property from the PropList.txt file.
   An element is 1 if the code point has the property, otherwise 0.  */
char unicode_combining[0x110000];

/* Reports a malformed entry in the combining property section of
   PROPLIST_FILENAME and terminates the program.  */
static void
combining_parse_error (const char *proplist_filename)
{
  fprintf (stderr, "parse error in combining property in '%s'\n",
           proplist_filename);
  exit (1);
}

/* Stores in unicode_combining[] the Combining property from the
   Unicode 3.0 PropList.txt file.
   The section starts after the first line containing "(Combining)" and
   ends at the first line starting with '*'.  Each line in between starts
   with either "XXXX..YYYY" (a range) or "XXXX" (a single code point), in
   hexadecimal with at most four digits.
   Lines are read in pieces of at most 100 characters, and the whitespace
   that follows a piece (including blank lines and the indentation of the
   next line) is skipped.
   Exits with an error message if the file cannot be opened or read, or if
   the section is missing, unterminated or malformed.  */
static void
fill_combining (const char *proplist_filename)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  for (i = 0; i < 0x110000; i++)
    unicode_combining[i] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: 0x20000004 (Combining)" line.  */
  do
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no combining property found in '%s'\n",
                   proplist_filename);
          exit (1);
        }
    }
  while (strstr (buf, "(Combining)") == NULL);

  for (;;)
    {
      unsigned int i1 = 0;
      unsigned int i2 = 0;
      size_t len;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "premature end of combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (buf[0] == '*')
        /* Start of the next section.  */
        break;
      len = strlen (buf);
      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          /* Range "XXXX..YYYY".  */
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            combining_parse_error (proplist_filename);
        }
      else if (len >= 4)
        {
          /* Single code point "XXXX".  */
          if (sscanf (buf, "%4X", &i1) < 1)
            combining_parse_error (proplist_filename);
          i2 = i1;
        }
      else
        combining_parse_error (proplist_filename);
      /* Both bounds have at most four hex digits, so they are valid
         indices.  */
      for (i = i1; i <= i2; i++)
        unicode_combining[i] = 1;
    }
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
331 
332 /* The width property from the EastAsianWidth.txt file.
333    Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
334 const char * unicode_width[0x110000];
335 
336 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
337    file.  */
338 static void
fill_width(const char * width_filename)339 fill_width (const char *width_filename)
340 {
341   unsigned int i, j;
342   FILE *stream;
343   char field0[FIELDLEN];
344   char field1[FIELDLEN];
345   char field2[FIELDLEN];
346   int lineno = 0;
347 
348   for (i = 0; i < 0x110000; i++)
349     unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
350 
351   stream = fopen (width_filename, "r");
352   if (stream == NULL)
353     {
354       fprintf (stderr, "error during fopen of '%s'\n", width_filename);
355       exit (1);
356     }
357 
358   for (;;)
359     {
360       int n;
361       int c;
362 
363       lineno++;
364       c = getc (stream);
365       if (c == EOF)
366 	break;
367       if (c == '#')
368 	{
369 	  do c = getc (stream); while (c != EOF && c != '\n');
370 	  continue;
371 	}
372       ungetc (c, stream);
373       n = getfield (stream, field0, ';');
374       n += getfield (stream, field1, ' ');
375       n += getfield (stream, field2, '\n');
376       if (n == 0)
377 	break;
378       if (n != 3)
379 	{
380 	  fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
381 	  exit (1);
382 	}
383       i = strtoul (field0, NULL, 16);
384       if (strstr (field0, "..") != NULL)
385 	{
386 	  /* Deal with a range.  */
387 	  j = strtoul (strstr (field0, "..") + 2, NULL, 16);
388 	  for (; i <= j; i++)
389 	    unicode_width[i] = strdup (field1);
390 	}
391       else
392 	{
393 	  /* Single character line.  */
394 	  unicode_width[i] = strdup (field1);
395 	}
396     }
397   if (ferror (stream) || fclose (stream))
398     {
399       fprintf (stderr, "error reading from '%s'\n", width_filename);
400       exit (1);
401     }
402 }
403 
/* Line breaking classification.
   The constants are listed in the order of the line breaking classes, not
   in numeric order.  get_lbp uses each value as a bit position, so all of
   them must stay below the width of 'int'.  */

enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
439 
440 /* Returns the line breaking classification for ch, as a bit mask.  */
441 static int
get_lbp(unsigned int ch)442 get_lbp (unsigned int ch)
443 {
444   int attr = 0;
445 
446   if (unicode_attributes[ch].name != NULL)
447     {
448       /* mandatory break */
449       if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
450 	  || ch == 0x000C /* form feed */
451 	  || ch == 0x2028 /* LINE SEPARATOR */
452 	  || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
453 	attr |= 1 << LBP_BK;
454 
455       /* zero width space */
456       if (ch == 0x200B /* ZERO WIDTH SPACE */)
457 	attr |= 1 << LBP_ZW;
458 
459       /* inseparable */
460       if (ch == 0x2024 /* ONE DOT LEADER */
461 	  || ch == 0x2025 /* TWO DOT LEADER */
462 	  || ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
463 	attr |= 1 << LBP_IN;
464 
465       /* non-breaking (glue) */
466       if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
467 	  || ch == 0x00A0 /* NO-BREAK SPACE */
468 	  || ch == 0x202F /* NARROW NO-BREAK SPACE */
469 	  || ch == 0x2007 /* FIGURE SPACE */
470 	  || ch == 0x2011 /* NON-BREAKING HYPHEN */
471 	  || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
472 	attr |= 1 << LBP_GL;
473 
474       /* contingent break opportunity */
475       if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
476 	attr |= 1 << LBP_CB;
477 
478       /* space */
479       if (ch == 0x0020 /* SPACE */)
480 	attr |= 1 << LBP_SP;
481 
482       /* break opportunity after */
483       if (ch == 0x2000 /* EN QUAD */
484 	  || ch == 0x2001 /* EM QUAD */
485 	  || ch == 0x2002 /* EN SPACE */
486 	  || ch == 0x2003 /* EM SPACE */
487 	  || ch == 0x2004 /* THREE-PER-EM SPACE */
488 	  || ch == 0x2005 /* FOUR-PER-EM SPACE */
489 	  || ch == 0x2006 /* SIX-PER-EM SPACE */
490 	  || ch == 0x2008 /* PUNCTUATION SPACE */
491 	  || ch == 0x2009 /* THIN SPACE */
492 	  || ch == 0x200A /* HAIR SPACE */
493 	  || ch == 0x0009 /* tab */
494 	  || ch == 0x058A /* ARMENIAN HYPHEN */
495 	  || ch == 0x2010 /* HYPHEN */
496 	  || ch == 0x2012 /* FIGURE DASH */
497 	  || ch == 0x2013 /* EN DASH */
498 	  || ch == 0x00AD /* SOFT HYPHEN */
499 	  || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
500 	  || ch == 0x1361 /* ETHIOPIC WORDSPACE */
501 	  || ch == 0x1680 /* OGHAM SPACE MARK */
502 	  || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
503 	  || ch == 0x2027 /* HYPHENATION POINT */
504 	  || ch == 0x007C /* VERTICAL LINE */)
505 	attr |= 1 << LBP_BA;
506 
507       /* break opportunity before */
508       if (ch == 0x00B4 /* ACUTE ACCENT */
509 	  || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
510 	  || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
511 	  || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
512 	attr |= 1 << LBP_BB;
513 
514       /* break opportunity before and after */
515       if (ch == 0x2014 /* EM DASH */)
516 	attr |= 1 << LBP_B2;
517 
518       /* hyphen */
519       if (ch == 0x002D /* HYPHEN-MINUS */)
520 	attr |= 1 << LBP_HY;
521 
522       /* exclamation/interrogation */
523       if (ch == 0x0021 /* EXCLAMATION MARK */
524 	  || ch == 0x003F /* QUESTION MARK */
525 	  || ch == 0xFE56 /* SMALL QUESTION MARK */
526 	  || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
527 	  || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
528 	  || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
529 	attr |= 1 << LBP_EX;
530 
531       /* opening punctuation */
532       if (unicode_attributes[ch].category[0] == 'P'
533 	  && unicode_attributes[ch].category[1] == 's')
534 	attr |= 1 << LBP_OP;
535 
536       /* closing punctuation */
537       if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
538 	  || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
539 	  || ch == 0xFE50 /* SMALL COMMA */
540 	  || ch == 0xFE52 /* SMALL FULL STOP */
541 	  || ch == 0xFF0C /* FULLWIDTH COMMA */
542 	  || ch == 0xFF0E /* FULLWIDTH FULL STOP */
543 	  || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
544 	  || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
545 	  || (unicode_attributes[ch].category[0] == 'P'
546 	      && unicode_attributes[ch].category[1] == 'e'))
547 	attr |= 1 << LBP_CL;
548 
549       /* ambiguous quotation */
550       if (ch == 0x0022 /* QUOTATION MARK */
551 	  || ch == 0x0027 /* APOSTROPHE */
552 	  || (unicode_attributes[ch].category[0] == 'P'
553 	      && (unicode_attributes[ch].category[1] == 'f'
554 		  || unicode_attributes[ch].category[1] == 'i')))
555 	attr |= 1 << LBP_QU;
556 
557       /* attached characters and combining marks */
558       if ((unicode_attributes[ch].category[0] == 'M'
559 	   && (unicode_attributes[ch].category[1] == 'n'
560 	       || unicode_attributes[ch].category[1] == 'c'
561 	       || unicode_attributes[ch].category[1] == 'e'))
562 	  || (ch >= 0x1160 && ch <= 0x11F9)
563 	  || (unicode_attributes[ch].category[0] == 'C'
564 	      && (unicode_attributes[ch].category[1] == 'c'
565 		  || unicode_attributes[ch].category[1] == 'f')))
566 	if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
567 	  attr |= 1 << LBP_CM;
568 
569       /* non starter */
570       if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
571 	  || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
572 	  || ch == 0x17D4 /* KHMER SIGN KHAN */
573 	  || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
574 	  || ch == 0x17D7 /* KHMER SIGN LEK TOO */
575 	  || ch == 0x17D8 /* KHMER SIGN BEYYAL */
576 	  || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
577 	  || ch == 0x17DA /* KHMER SIGN KOOMUUT */
578 	  || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
579 	  || ch == 0x2044 /* FRACTION SLASH */
580 	  || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
581 	  || ch == 0x301C /* WAVE DASH */
582 	  || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
583 	  || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
584 	  || ch == 0x309D /* HIRAGANA ITERATION MARK */
585 	  || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
586 	  || ch == 0x30FB /* KATAKANA MIDDLE DOT */
587 	  || ch == 0x30FD /* KATAKANA ITERATION MARK */
588 	  || ch == 0xFE54 /* SMALL SEMICOLON */
589 	  || ch == 0xFE55 /* SMALL COLON */
590 	  || ch == 0xFF1A /* FULLWIDTH COLON */
591 	  || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
592 	  || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
593 	  || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
594 	  || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
595 	  || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
596 	  || (unicode_attributes[ch].category[0] == 'L'
597 	      && unicode_attributes[ch].category[1] == 'm'
598 	      && (unicode_width[ch][0] == 'W'
599 		  || unicode_width[ch][0] == 'H'))
600 	  || (unicode_attributes[ch].category[0] == 'S'
601 	      && unicode_attributes[ch].category[1] == 'k'
602 	      && unicode_width[ch][0] == 'W')
603 	  || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
604 	  || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
605 	attr |= 1 << LBP_NS;
606 
607       /* numeric */
608       if (unicode_attributes[ch].category[0] == 'N'
609 	  && unicode_attributes[ch].category[1] == 'd'
610 	  && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
611 	attr |= 1 << LBP_NU;
612 
613       /* infix separator (numeric) */
614       if (ch == 0x002C /* COMMA */
615 	  || ch == 0x002E /* FULL STOP */
616 	  || ch == 0x003A /* COLON */
617 	  || ch == 0x003B /* SEMICOLON */
618 	  || ch == 0x0589 /* ARMENIAN FULL STOP */)
619 	attr |= 1 << LBP_IS;
620 
621       /* symbols allowing breaks */
622       if (ch == 0x002F /* SOLIDUS */)
623 	attr |= 1 << LBP_SY;
624 
625       /* postfix (numeric) */
626       if (ch == 0x0025 /* PERCENT SIGN */
627 	  || ch == 0x00A2 /* CENT SIGN */
628 	  || ch == 0x00B0 /* DEGREE SIGN */
629 	  || ch == 0x2030 /* PER MILLE SIGN */
630 	  || ch == 0x2031 /* PER TEN THOUSAND SIGN */
631 	  || ch == 0x2032 /* PRIME */
632 	  || ch == 0x2033 /* DOUBLE PRIME */
633 	  || ch == 0x2034 /* TRIPLE PRIME */
634 	  || ch == 0x2035 /* REVERSED PRIME */
635 	  || ch == 0x2036 /* REVERSED DOUBLE PRIME */
636 	  || ch == 0x2037 /* REVERSED TRIPLE PRIME */
637 	  || ch == 0x20A7 /* PESETA SIGN */
638 	  || ch == 0x2103 /* DEGREE CELSIUS */
639 	  || ch == 0x2109 /* DEGREE FAHRENHEIT */
640 	  || ch == 0x2126 /* OHM SIGN */
641 	  || ch == 0xFE6A /* SMALL PERCENT SIGN */
642 	  || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
643 	  || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
644 	attr |= 1 << LBP_PO;
645 
646       /* prefix (numeric) */
647       if (ch == 0x002B /* PLUS SIGN */
648 	  || ch == 0x005C /* REVERSE SOLIDUS */
649 	  || ch == 0x00B1 /* PLUS-MINUS SIGN */
650 	  || ch == 0x2116 /* NUMERO SIGN */
651 	  || ch == 0x2212 /* MINUS SIGN */
652 	  || ch == 0x2213 /* MINUS-OR-PLUS SIGN */
653 	  || (unicode_attributes[ch].category[0] == 'S'
654 	      && unicode_attributes[ch].category[1] == 'c'))
655 	if (!(attr & (1 << LBP_PO)))
656 	  attr |= 1 << LBP_PR;
657 
658       /* complex context (South East Asian) */
659       if (((ch >= 0x0E00 && ch <= 0x0EFF)
660 	   || (ch >= 0x1000 && ch <= 0x109F)
661 	   || (ch >= 0x1780 && ch <= 0x17FF))
662 	  && unicode_attributes[ch].category[0] == 'L'
663 	  && (unicode_attributes[ch].category[1] == 'm'
664 	      || unicode_attributes[ch].category[1] == 'o'))
665 	if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
666 	  attr |= 1 << LBP_SA;
667 
668       /* ideographic */
669       if ((ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
670 	  || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
671 	  || ch == 0x3000 /* IDEOGRAPHIC SPACE */
672 	  || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
673 	  || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
674 	  || (ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
675 	  || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
676 	  || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
677 	  || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
678 	  || (ch >= 0xA490 && ch <= 0xA4C6) /* YI RADICAL */
679 	  || ch == 0xFE62 /* SMALL PLUS SIGN */
680 	  || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
681 	  || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
682 	  || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
683 	  || ch == 0xFE66 /* SMALL EQUALS SIGN */
684 	  || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
685 	  || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
686 	  || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
687 	  || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
688 	  || (ch >= 0x3000 && ch <= 0x33FF
689 	      && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))
690 	  /* Extra characters for compatibility with Unicode LineBreak.txt.  */
691 	  || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
692 	  || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
693 	  || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
694 	  || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
695 	  || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
696 	  || ch == 0xFE49 /* DASHED OVERLINE */
697 	  || ch == 0xFE4A /* CENTRELINE OVERLINE */
698 	  || ch == 0xFE4B /* WAVY OVERLINE */
699 	  || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
700 	  || ch == 0xFE4D /* DASHED LOW LINE */
701 	  || ch == 0xFE4E /* CENTRELINE LOW LINE */
702 	  || ch == 0xFE4F /* WAVY LOW LINE */
703 	  || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
704 	  || ch == 0xFE58 /* SMALL EM DASH */
705 	  || ch == 0xFE5F /* SMALL NUMBER SIGN */
706 	  || ch == 0xFE60 /* SMALL AMPERSAND */
707 	  || ch == 0xFE61 /* SMALL ASTERISK */
708 	  || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
709 	  || ch == 0xFE6B /* SMALL COMMERCIAL AT */
710 	  || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
711 	  || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
712 	  || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
713 	  || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
714 	  || ch == 0xFF0A /* FULLWIDTH ASTERISK */
715 	  || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
716 	  || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
717 	  || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
718 	  || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
719 	  || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
720 	  || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
721 	  || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
722 	  || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
723 	  || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
724 	  || ch == 0xFF3F /* FULLWIDTH LOW LINE */
725 	  || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
726 	  || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
727 	  || ch == 0xFF5E /* FULLWIDTH TILDE */
728 	  || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
729 	  || ch == 0xFFE3 /* FULLWIDTH MACRON */
730 	  || ch == 0xFFE4) /* FULLWIDTH BROKEN BAR */
731 	{
732 	  /* ambiguous (ideograph) ? */
733 	  if (unicode_width[ch] != NULL
734 	      && unicode_width[ch][0] == 'A')
735 	    attr |= 1 << LBP_AI;
736 	  else
737 	    attr |= 1 << LBP_ID;
738 	}
739 
740       /* ordinary alphabetic and symbol characters */
741       if ((unicode_attributes[ch].category[0] == 'L'
742 	   && (unicode_attributes[ch].category[1] == 'u'
743 	       || unicode_attributes[ch].category[1] == 'l'
744 	       || unicode_attributes[ch].category[1] == 't'
745 	       || unicode_attributes[ch].category[1] == 'm'
746 	       || unicode_attributes[ch].category[1] == 'o'))
747 	  || (unicode_attributes[ch].category[0] == 'S'
748 	      && (unicode_attributes[ch].category[1] == 'm'
749 		  || unicode_attributes[ch].category[1] == 'c'
750 		  || unicode_attributes[ch].category[1] == 'k'
751 		  || unicode_attributes[ch].category[1] == 'o'))
752 	  /* Extra characters for compatibility with Unicode LineBreak.txt.  */
753 	  || ch == 0x0023 /* NUMBER SIGN */
754 	  || ch == 0x0026 /* AMPERSAND */
755 	  || ch == 0x002A /* ASTERISK */
756 	  || ch == 0x0040 /* COMMERCIAL AT */
757 	  || ch == 0x005F /* LOW LINE */
758 	  || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
759 	  || ch == 0x00B2 /* SUPERSCRIPT TWO */
760 	  || ch == 0x00B3 /* SUPERSCRIPT THREE */
761 	  || ch == 0x00B7 /* MIDDLE DOT */
762 	  || ch == 0x00B9 /* SUPERSCRIPT ONE */
763 	  || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
764 	  || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
765 	  || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
766 	  || ch == 0x00BF /* INVERTED QUESTION MARK */
767 	  || ch == 0x037E /* GREEK QUESTION MARK */
768 	  || ch == 0x0387 /* GREEK ANO TELEIA */
769 	  || ch == 0x055A /* ARMENIAN APOSTROPHE */
770 	  || ch == 0x055B /* ARMENIAN EMPHASIS MARK */
771 	  || ch == 0x055C /* ARMENIAN EXCLAMATION MARK */
772 	  || ch == 0x055D /* ARMENIAN COMMA */
773 	  || ch == 0x055E /* ARMENIAN QUESTION MARK */
774 	  || ch == 0x055F /* ARMENIAN ABBREVIATION MARK */
775 	  || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
776 	  || ch == 0x05C0 /* HEBREW PUNCTUATION PASEQ */
777 	  || ch == 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */
778 	  || ch == 0x05F3 /* HEBREW PUNCTUATION GERESH */
779 	  || ch == 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */
780 	  || ch == 0x060C /* ARABIC COMMA */
781 	  || ch == 0x061B /* ARABIC SEMICOLON */
782 	  || ch == 0x061F /* ARABIC QUESTION MARK */
783 	  || ch == 0x066A /* ARABIC PERCENT SIGN */
784 	  || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
785 	  || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
786 	  || ch == 0x066D /* ARABIC FIVE POINTED STAR */
787 	  || ch == 0x06D4 /* ARABIC FULL STOP */
788 	  || ch == 0x0700 /* SYRIAC END OF PARAGRAPH */
789 	  || ch == 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */
790 	  || ch == 0x0702 /* SYRIAC SUBLINEAR FULL STOP */
791 	  || ch == 0x0703 /* SYRIAC SUPRALINEAR COLON */
792 	  || ch == 0x0704 /* SYRIAC SUBLINEAR COLON */
793 	  || ch == 0x0705 /* SYRIAC HORIZONTAL COLON */
794 	  || ch == 0x0706 /* SYRIAC COLON SKEWED LEFT */
795 	  || ch == 0x0707 /* SYRIAC COLON SKEWED RIGHT */
796 	  || ch == 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */
797 	  || ch == 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */
798 	  || ch == 0x070A /* SYRIAC CONTRACTION */
799 	  || ch == 0x070B /* SYRIAC HARKLEAN OBELUS */
800 	  || ch == 0x070C /* SYRIAC HARKLEAN METOBELUS */
801 	  || ch == 0x070D /* SYRIAC HARKLEAN ASTERISCUS */
802 	  || ch == 0x0964 /* DEVANAGARI DANDA */
803 	  || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
804 	  || ch == 0x0970 /* DEVANAGARI ABBREVIATION SIGN */
805 	  || ch == 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */
806 	  || ch == 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */
807 	  || ch == 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */
808 	  || ch == 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */
809 	  || ch == 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
810 	  || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
811 	  || ch == 0x0BF0 /* TAMIL NUMBER TEN */
812 	  || ch == 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */
813 	  || ch == 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */
814 	  || ch == 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */
815 	  || ch == 0x0E4F /* THAI CHARACTER FONGMAN */
816 	  || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
817 	  || ch == 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */
818 	  || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
819 	  || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
820 	  || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
821 	  || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
822 	  || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
823 	  || ch == 0x0F0D /* TIBETAN MARK SHAD */
824 	  || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
825 	  || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
826 	  || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
827 	  || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
828 	  || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
829 	  || ch == 0x0F2A /* TIBETAN DIGIT HALF ONE */
830 	  || ch == 0x0F2B /* TIBETAN DIGIT HALF TWO */
831 	  || ch == 0x0F2C /* TIBETAN DIGIT HALF THREE */
832 	  || ch == 0x0F2D /* TIBETAN DIGIT HALF FOUR */
833 	  || ch == 0x0F2E /* TIBETAN DIGIT HALF FIVE */
834 	  || ch == 0x0F2F /* TIBETAN DIGIT HALF SIX */
835 	  || ch == 0x0F30 /* TIBETAN DIGIT HALF SEVEN */
836 	  || ch == 0x0F31 /* TIBETAN DIGIT HALF EIGHT */
837 	  || ch == 0x0F32 /* TIBETAN DIGIT HALF NINE */
838 	  || ch == 0x0F33 /* TIBETAN DIGIT HALF ZERO */
839 	  || ch == 0x0F85 /* TIBETAN MARK PALUTA */
840 	  || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
841 	  || ch == 0x104B /* MYANMAR SIGN SECTION */
842 	  || ch == 0x104C /* MYANMAR SYMBOL LOCATIVE */
843 	  || ch == 0x104D /* MYANMAR SYMBOL COMPLETED */
844 	  || ch == 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */
845 	  || ch == 0x104F /* MYANMAR SYMBOL GENITIVE */
846 	  || ch == 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */
847 	  || ch == 0x1362 /* ETHIOPIC FULL STOP */
848 	  || ch == 0x1363 /* ETHIOPIC COMMA */
849 	  || ch == 0x1364 /* ETHIOPIC SEMICOLON */
850 	  || ch == 0x1365 /* ETHIOPIC COLON */
851 	  || ch == 0x1366 /* ETHIOPIC PREFACE COLON */
852 	  || ch == 0x1367 /* ETHIOPIC QUESTION MARK */
853 	  || ch == 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */
854 	  || ch == 0x1372 /* ETHIOPIC NUMBER TEN */
855 	  || ch == 0x1373 /* ETHIOPIC NUMBER TWENTY */
856 	  || ch == 0x1374 /* ETHIOPIC NUMBER THIRTY */
857 	  || ch == 0x1375 /* ETHIOPIC NUMBER FORTY */
858 	  || ch == 0x1376 /* ETHIOPIC NUMBER FIFTY */
859 	  || ch == 0x1377 /* ETHIOPIC NUMBER SIXTY */
860 	  || ch == 0x1378 /* ETHIOPIC NUMBER SEVENTY */
861 	  || ch == 0x1379 /* ETHIOPIC NUMBER EIGHTY */
862 	  || ch == 0x137A /* ETHIOPIC NUMBER NINETY */
863 	  || ch == 0x137B /* ETHIOPIC NUMBER HUNDRED */
864 	  || ch == 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */
865 	  || ch == 0x166D /* CANADIAN SYLLABICS CHI SIGN */
866 	  || ch == 0x166E /* CANADIAN SYLLABICS FULL STOP */
867 	  || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
868 	  || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
869 	  || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
870 	  || ch == 0x16EE /* RUNIC ARLAUG SYMBOL */
871 	  || ch == 0x16EF /* RUNIC TVIMADUR SYMBOL */
872 	  || ch == 0x16F0 /* RUNIC BELGTHOR SYMBOL */
873 	  || ch == 0x17DC /* KHMER SIGN AVAKRAHASANYA */
874 	  || ch == 0x1800 /* MONGOLIAN BIRGA */
875 	  || ch == 0x1801 /* MONGOLIAN ELLIPSIS */
876 	  || ch == 0x1802 /* MONGOLIAN COMMA */
877 	  || ch == 0x1803 /* MONGOLIAN FULL STOP */
878 	  || ch == 0x1804 /* MONGOLIAN COLON */
879 	  || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
880 	  || ch == 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */
881 	  || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
882 	  || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
883 	  || ch == 0x180A /* MONGOLIAN NIRUGU */
884 	  || ch == 0x2015 /* HORIZONTAL BAR */
885 	  || ch == 0x2016 /* DOUBLE VERTICAL LINE */
886 	  || ch == 0x2017 /* DOUBLE LOW LINE */
887 	  || ch == 0x2020 /* DAGGER */
888 	  || ch == 0x2021 /* DOUBLE DAGGER */
889 	  || ch == 0x2022 /* BULLET */
890 	  || ch == 0x2023 /* TRIANGULAR BULLET */
891 	  || ch == 0x2038 /* CARET */
892 	  || ch == 0x203B /* REFERENCE MARK */
893 	  || ch == 0x203D /* INTERROBANG */
894 	  || ch == 0x203E /* OVERLINE */
895 	  || ch == 0x203F /* UNDERTIE */
896 	  || ch == 0x2040 /* CHARACTER TIE */
897 	  || ch == 0x2041 /* CARET INSERTION POINT */
898 	  || ch == 0x2042 /* ASTERISM */
899 	  || ch == 0x2043 /* HYPHEN BULLET */
900 	  || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
901 	  || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
902 	  || ch == 0x204A /* TIRONIAN SIGN ET */
903 	  || ch == 0x204B /* REVERSED PILCROW SIGN */
904 	  || ch == 0x204C /* BLACK LEFTWARDS BULLET */
905 	  || ch == 0x204D /* BLACK RIGHTWARDS BULLET */
906 	  || ch == 0x2070 /* SUPERSCRIPT ZERO */
907 	  || ch == 0x2074 /* SUPERSCRIPT FOUR */
908 	  || ch == 0x2075 /* SUPERSCRIPT FIVE */
909 	  || ch == 0x2076 /* SUPERSCRIPT SIX */
910 	  || ch == 0x2077 /* SUPERSCRIPT SEVEN */
911 	  || ch == 0x2078 /* SUPERSCRIPT EIGHT */
912 	  || ch == 0x2079 /* SUPERSCRIPT NINE */
913 	  || ch == 0x2080 /* SUBSCRIPT ZERO */
914 	  || ch == 0x2081 /* SUBSCRIPT ONE */
915 	  || ch == 0x2082 /* SUBSCRIPT TWO */
916 	  || ch == 0x2083 /* SUBSCRIPT THREE */
917 	  || ch == 0x2084 /* SUBSCRIPT FOUR */
918 	  || ch == 0x2085 /* SUBSCRIPT FIVE */
919 	  || ch == 0x2086 /* SUBSCRIPT SIX */
920 	  || ch == 0x2087 /* SUBSCRIPT SEVEN */
921 	  || ch == 0x2088 /* SUBSCRIPT EIGHT */
922 	  || ch == 0x2089 /* SUBSCRIPT NINE */
923 	  || (ch >= 0x2153 && ch <= 0x215E) /* VULGAR FRACTION */
924 	  || ch == 0x215F /* FRACTION NUMERATOR ONE */
925 	  || (ch >= 0x2160 && ch <= 0x2183) /* ROMAN NUMERAL */
926 	  || (ch >= 0x2460 && ch <= 0x2473) /* CIRCLED NUMBER */
927 	  || (ch >= 0x2474 && ch <= 0x2487) /* PARENTHESIZED NUMBER */
928 	  || (ch >= 0x2488 && ch <= 0x249B) /* NUMBER FULL STOP */
929 	  || ch == 0x24EA /* CIRCLED DIGIT ZERO */
930 	  || (ch >= 0x2776 && ch <= 0x2793) /* DINGBAT CIRCLED DIGIT */
931 	  || ch == 0x10320 /* OLD ITALIC NUMERAL ONE */
932 	  || ch == 0x10321 /* OLD ITALIC NUMERAL FIVE */
933 	  || ch == 0x10322 /* OLD ITALIC NUMERAL TEN */
934 	  || ch == 0x10323 /* OLD ITALIC NUMERAL FIFTY */
935 	  || ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
936 	if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
937 	  {
938 	    /* ambiguous (alphabetic) ? */
939 	    if (unicode_width[ch] != NULL
940 		&& unicode_width[ch][0] == 'A')
941 	      attr |= 1 << LBP_AI;
942 	    else
943 	      attr |= 1 << LBP_AL;
944 	  }
945     }
946 
947   if (attr == 0)
948     /* unknown */
949     attr |= 1 << LBP_XX;
950 
951   return attr;
952 }
953 
954 /* Output the line breaking properties in a human readable format.  */
955 static void
debug_output_lbp(FILE * stream)956 debug_output_lbp (FILE *stream)
957 {
958   unsigned int i;
959 
960   for (i = 0; i < 0x110000; i++)
961     {
962       int attr = get_lbp (i);
963       if (attr != 1 << LBP_XX)
964 	{
965 	  fprintf (stream, "0x%04X", i);
966 #define PRINT_BIT(attr,bit) \
967   if (attr & (1 << bit)) fprintf (stream, " " #bit);
968 	  PRINT_BIT(attr,LBP_BK);
969 	  PRINT_BIT(attr,LBP_CM);
970 	  PRINT_BIT(attr,LBP_ZW);
971 	  PRINT_BIT(attr,LBP_IN);
972 	  PRINT_BIT(attr,LBP_GL);
973 	  PRINT_BIT(attr,LBP_CB);
974 	  PRINT_BIT(attr,LBP_SP);
975 	  PRINT_BIT(attr,LBP_BA);
976 	  PRINT_BIT(attr,LBP_BB);
977 	  PRINT_BIT(attr,LBP_B2);
978 	  PRINT_BIT(attr,LBP_HY);
979 	  PRINT_BIT(attr,LBP_NS);
980 	  PRINT_BIT(attr,LBP_OP);
981 	  PRINT_BIT(attr,LBP_CL);
982 	  PRINT_BIT(attr,LBP_QU);
983 	  PRINT_BIT(attr,LBP_EX);
984 	  PRINT_BIT(attr,LBP_ID);
985 	  PRINT_BIT(attr,LBP_NU);
986 	  PRINT_BIT(attr,LBP_IS);
987 	  PRINT_BIT(attr,LBP_SY);
988 	  PRINT_BIT(attr,LBP_AL);
989 	  PRINT_BIT(attr,LBP_PR);
990 	  PRINT_BIT(attr,LBP_PO);
991 	  PRINT_BIT(attr,LBP_SA);
992 	  PRINT_BIT(attr,LBP_XX);
993 	  PRINT_BIT(attr,LBP_AI);
994 #undef PRINT_BIT
995 	  fprintf (stream, "\n");
996 	}
997     }
998 }
999 
/* Create (or truncate) FILENAME and write the computed line breaking
   properties into it through debug_output_lbp.  On failure to open the
   file, or on a write or close error, print a message to stderr and
   exit with status 1.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* Success requires both a clean stream and a successful close; the
     close is only attempted when no write error was recorded.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
1020 
1021 /* The line breaking property from the LineBreak.txt file.  */
1022 int unicode_org_lbp[0x110000];
1023 
1024 /* Stores in unicode_org_lbp[] the line breaking property from the
1025    LineBreak.txt file.  */
1026 static void
fill_org_lbp(const char * linebreak_filename)1027 fill_org_lbp (const char *linebreak_filename)
1028 {
1029   unsigned int i, j;
1030   FILE *stream;
1031   char field0[FIELDLEN];
1032   char field1[FIELDLEN];
1033   char field2[FIELDLEN];
1034   int lineno = 0;
1035 
1036   for (i = 0; i < 0x110000; i++)
1037     unicode_org_lbp[i] = LBP_XX;
1038 
1039   stream = fopen (linebreak_filename, "r");
1040   if (stream == NULL)
1041     {
1042       fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
1043       exit (1);
1044     }
1045 
1046   for (;;)
1047     {
1048       int n;
1049       int c;
1050       int value;
1051 
1052       lineno++;
1053       c = getc (stream);
1054       if (c == EOF)
1055 	break;
1056       if (c == '#')
1057 	{
1058 	  do c = getc (stream); while (c != EOF && c != '\n');
1059 	  continue;
1060 	}
1061       ungetc (c, stream);
1062       n = getfield (stream, field0, ';');
1063       n += getfield (stream, field1, ' ');
1064       n += getfield (stream, field2, '\n');
1065       if (n == 0)
1066 	break;
1067       if (n != 3)
1068 	{
1069 	  fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
1070 		   lineno);
1071 	  exit (1);
1072 	}
1073 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
1074       if (false) {}
1075       TRY(LBP_BK)
1076       TRY(LBP_CM)
1077       TRY(LBP_ZW)
1078       TRY(LBP_IN)
1079       TRY(LBP_GL)
1080       TRY(LBP_CB)
1081       TRY(LBP_SP)
1082       TRY(LBP_BA)
1083       TRY(LBP_BB)
1084       TRY(LBP_B2)
1085       TRY(LBP_HY)
1086       TRY(LBP_NS)
1087       TRY(LBP_OP)
1088       TRY(LBP_CL)
1089       TRY(LBP_QU)
1090       TRY(LBP_EX)
1091       TRY(LBP_ID)
1092       TRY(LBP_NU)
1093       TRY(LBP_IS)
1094       TRY(LBP_SY)
1095       TRY(LBP_AL)
1096       TRY(LBP_PR)
1097       TRY(LBP_PO)
1098       TRY(LBP_SA)
1099       TRY(LBP_XX)
1100       TRY(LBP_AI)
1101 #undef TRY
1102       else if (strcmp (field1, "LF") == 0) value = LBP_BK;
1103       else if (strcmp (field1, "CR") == 0) value = LBP_BK;
1104       else if (strcmp (field1, "SG") == 0) value = LBP_XX;
1105       else
1106 	{
1107 	  fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
1108 		   field1, linebreak_filename, lineno);
1109 	  exit (1);
1110 	}
1111       i = strtoul (field0, NULL, 16);
1112       if (strstr (field0, "..") != NULL)
1113 	{
1114 	  /* Deal with a range.  */
1115 	  j = strtoul (strstr (field0, "..") + 2, NULL, 16);
1116 	  for (; i <= j; i++)
1117 	    unicode_org_lbp[i] = value;
1118 	}
1119       else
1120 	{
1121 	  /* Single character line.  */
1122 	  unicode_org_lbp[i] = value;
1123 	}
1124     }
1125   if (ferror (stream) || fclose (stream))
1126     {
1127       fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
1128       exit (1);
1129     }
1130 }
1131 
1132 /* Output the line breaking properties in a human readable format.  */
1133 static void
debug_output_org_lbp(FILE * stream)1134 debug_output_org_lbp (FILE *stream)
1135 {
1136   unsigned int i;
1137 
1138   for (i = 0; i < 0x110000; i++)
1139     {
1140       int attr = unicode_org_lbp[i];
1141       if (attr != LBP_XX)
1142 	{
1143 	  fprintf (stream, "0x%04X", i);
1144 #define PRINT_BIT(attr,bit) \
1145   if (attr == bit) fprintf (stream, " " #bit);
1146 	  PRINT_BIT(attr,LBP_BK);
1147 	  PRINT_BIT(attr,LBP_CM);
1148 	  PRINT_BIT(attr,LBP_ZW);
1149 	  PRINT_BIT(attr,LBP_IN);
1150 	  PRINT_BIT(attr,LBP_GL);
1151 	  PRINT_BIT(attr,LBP_CB);
1152 	  PRINT_BIT(attr,LBP_SP);
1153 	  PRINT_BIT(attr,LBP_BA);
1154 	  PRINT_BIT(attr,LBP_BB);
1155 	  PRINT_BIT(attr,LBP_B2);
1156 	  PRINT_BIT(attr,LBP_HY);
1157 	  PRINT_BIT(attr,LBP_NS);
1158 	  PRINT_BIT(attr,LBP_OP);
1159 	  PRINT_BIT(attr,LBP_CL);
1160 	  PRINT_BIT(attr,LBP_QU);
1161 	  PRINT_BIT(attr,LBP_EX);
1162 	  PRINT_BIT(attr,LBP_ID);
1163 	  PRINT_BIT(attr,LBP_NU);
1164 	  PRINT_BIT(attr,LBP_IS);
1165 	  PRINT_BIT(attr,LBP_SY);
1166 	  PRINT_BIT(attr,LBP_AL);
1167 	  PRINT_BIT(attr,LBP_PR);
1168 	  PRINT_BIT(attr,LBP_PO);
1169 	  PRINT_BIT(attr,LBP_SA);
1170 	  PRINT_BIT(attr,LBP_XX);
1171 	  PRINT_BIT(attr,LBP_AI);
1172 #undef PRINT_BIT
1173 	  fprintf (stream, "\n");
1174 	}
1175     }
1176 }
1177 
/* Create (or truncate) FILENAME and write the properties read from
   LineBreak.txt into it through debug_output_org_lbp.  On failure to
   open the file, or on a write or close error, print a message to
   stderr and exit with status 1.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* Success requires both a clean stream and a successful close; the
     close is only attempted when no write error was recorded.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
1198 
1199 /* Construction of sparse 3-level tables.  */
1200 #define TABLE lbp_table
1201 #define ELEMENT unsigned char
1202 #define DEFAULT LBP_XX
1203 #define xmalloc malloc
1204 #define xrealloc realloc
1205 #include "3level.h"
1206 
1207 static void
output_lbp(FILE * stream)1208 output_lbp (FILE *stream)
1209 {
1210   unsigned int i;
1211   struct lbp_table t;
1212   unsigned int level1_offset, level2_offset, level3_offset;
1213 
1214   t.p = 7;
1215   t.q = 9;
1216   lbp_table_init (&t);
1217 
1218   for (i = 0; i < 0x110000; i++)
1219     {
1220       int attr = get_lbp (i);
1221 
1222       /* Now attr should contain exactly one bit.  */
1223       if (attr == 0 || ((attr & (attr - 1)) != 0))
1224 	abort ();
1225 
1226       if (attr != 1 << LBP_XX)
1227 	{
1228 	  unsigned int log2_attr;
1229 	  for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
1230 
1231 	  lbp_table_add (&t, i, log2_attr);
1232 	}
1233     }
1234 
1235   lbp_table_finalize (&t);
1236 
1237   level1_offset =
1238     5 * sizeof (uint32_t);
1239   level2_offset =
1240     5 * sizeof (uint32_t)
1241     + t.level1_size * sizeof (uint32_t);
1242   level3_offset =
1243     5 * sizeof (uint32_t)
1244     + t.level1_size * sizeof (uint32_t)
1245     + (t.level2_size << t.q) * sizeof (uint32_t);
1246 
1247   for (i = 0; i < 5; i++)
1248     fprintf (stream, "#define lbrkprop_header_%d %d\n", i,
1249 	     ((uint32_t *) t.result)[i]);
1250   fprintf (stream, "static const\n");
1251   fprintf (stream, "struct\n");
1252   fprintf (stream, "  {\n");
1253   fprintf (stream, "    int level1[%d];\n", t.level1_size);
1254   fprintf (stream, "    int level2[%d << %d];\n", t.level2_size, t.q);
1255   fprintf (stream, "    unsigned char level3[%d << %d];\n", t.level3_size, t.p);
1256   fprintf (stream, "  }\n");
1257   fprintf (stream, "lbrkprop =\n");
1258   fprintf (stream, "{\n");
1259   fprintf (stream, "  {");
1260   for (i = 0; i < t.level1_size; i++)
1261     {
1262       uint32_t offset;
1263       if (i > 0 && (i % 8) == 0)
1264 	fprintf (stream, "\n   ");
1265       offset = ((uint32_t *) (t.result + level1_offset))[i];
1266       fprintf (stream, " %5d%s",
1267 	       offset == 0 ? -1 : (offset - level2_offset) / sizeof (uint32_t),
1268 	       (i+1 < t.level1_size ? "," : ""));
1269     }
1270   fprintf (stream, " },\n");
1271   fprintf (stream, "  {");
1272   if (t.level2_size << t.q > 8)
1273     fprintf (stream, "\n   ");
1274   for (i = 0; i < t.level2_size << t.q; i++)
1275     {
1276       uint32_t offset;
1277       if (i > 0 && (i % 8) == 0)
1278 	fprintf (stream, "\n   ");
1279       offset = ((uint32_t *) (t.result + level2_offset))[i];
1280       fprintf (stream, " %5d%s",
1281 	       offset == 0 ? -1 : (offset - level3_offset) / sizeof (uint8_t),
1282 	       (i+1 < t.level2_size << t.q ? "," : ""));
1283     }
1284   if (t.level2_size << t.q > 8)
1285     fprintf (stream, "\n ");
1286   fprintf (stream, " },\n");
1287   fprintf (stream, "  {");
1288   if (t.level3_size << t.p > 8)
1289     fprintf (stream, "\n   ");
1290   for (i = 0; i < t.level3_size << t.p; i++)
1291     {
1292       unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
1293       const char *value_string;
1294       switch (value)
1295 	{
1296 #define CASE(x) case x: value_string = #x; break;
1297 	  CASE(LBP_BK);
1298 	  CASE(LBP_CM);
1299 	  CASE(LBP_ZW);
1300 	  CASE(LBP_IN);
1301 	  CASE(LBP_GL);
1302 	  CASE(LBP_CB);
1303 	  CASE(LBP_SP);
1304 	  CASE(LBP_BA);
1305 	  CASE(LBP_BB);
1306 	  CASE(LBP_B2);
1307 	  CASE(LBP_HY);
1308 	  CASE(LBP_NS);
1309 	  CASE(LBP_OP);
1310 	  CASE(LBP_CL);
1311 	  CASE(LBP_QU);
1312 	  CASE(LBP_EX);
1313 	  CASE(LBP_ID);
1314 	  CASE(LBP_NU);
1315 	  CASE(LBP_IS);
1316 	  CASE(LBP_SY);
1317 	  CASE(LBP_AL);
1318 	  CASE(LBP_PR);
1319 	  CASE(LBP_PO);
1320 	  CASE(LBP_SA);
1321 	  CASE(LBP_XX);
1322 	  CASE(LBP_AI);
1323 #undef CASE
1324 	  default:
1325 	    abort ();
1326 	}
1327       if (i > 0 && (i % 8) == 0)
1328 	fprintf (stream, "\n   ");
1329       fprintf (stream, " %s%s", value_string,
1330 	       (i+1 < t.level3_size << t.p ? "," : ""));
1331     }
1332   if (t.level3_size << t.p > 8)
1333     fprintf (stream, "\n ");
1334   fprintf (stream, " }\n");
1335   fprintf (stream, "};\n");
1336 }
1337 
/* Create (or truncate) FILENAME and write the generated table source
   into it: two header comments (the second one mentioning VERSION as
   the Unicode version), a GPL notice, and the table produced by
   output_lbp.  On failure to open the file, or on a write or close
   error, print a message to stderr and exit with status 1.  */
static void
output_tables (const char *filename, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2004 Free Software Foundation, Inc.\n",
      "\n",
      "This program is free software; you can redistribute it and/or modify\n",
      "it under the terms of the GNU General Public License as published by\n",
      "the Free Software Foundation; either version 2, or (at your option)\n",
      "any later version.\n",
      "\n",
      "This program is distributed in the hope that it will be useful,\n",
      "but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n",
      "GNU General Public License for more details.\n",
      "\n",
      "You should have received a copy of the GNU General Public License\n",
      "along with this program; if not, write to the Free Software\n",
      "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */\n",
      "\n"
    };
  const size_t n_license_lines =
    sizeof license_lines / sizeof license_lines[0];
  size_t k;
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* Line breaking properties of Unicode characters.  */\n", out);
  fprintf (out, "/* Generated automatically by gen-lbrkprop for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  for (k = 0; k < n_license_lines; k++)
    fputs (license_lines[k], out);

  output_lbp (out);

  /* Success requires both a clean stream and a successful close; the
     close is only attempted when no write error was recorded.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
1383 
/* Program entry point.  Expects exactly five arguments: the paths of
   UnicodeData.txt, Combining.txt, EastAsianWidth.txt and LineBreak.txt,
   followed by the Unicode version string.  Loads the four input files,
   then writes "lbrkprop.txt", "lbrkprop_org.txt" and "lbrkprop.h" with
   those fixed names.  Returns 0 on success; a wrong argument count
   prints a usage message to stderr and yields status 1.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *combining_filename;
  const char *width_filename;
  const char *linebreak_filename;
  const char *unicode_version;

  if (argc != 6)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      return 1;
    }

  unicodedata_filename = argv[1];
  combining_filename = argv[2];
  width_filename = argv[3];
  linebreak_filename = argv[4];
  unicode_version = argv[5];

  /* Load all inputs before producing any output.  */
  fill_attributes (unicodedata_filename);
  fill_combining (combining_filename);
  fill_width (width_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable dumps of the computed and of the reference data.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated C header.  */
  output_tables ("lbrkprop.h", unicode_version);

  return 0;
}
1406