xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/libuniname/gen-uninames (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1#!/usr/local/bin/clisp -C
2
3;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
4;;; Bruno Haible 2000-12-28
5
6(defparameter add-comments nil)
7
8(defstruct unicode-char
9  (code nil :type integer)
10  (name nil :type string)
11  word-indices
12  word-indices-index
13)
14
15(defstruct word-list
16  (hashed nil :type hash-table)
17  (sorted nil :type list)
18  size                          ; number of characters total
19  length                        ; number of words
20)
21
22(defun main (inputfile outputfile)
23  (declare (type string inputfile outputfile))
24  #+UNICODE (setq *default-file-encoding* charset:utf-8)
25  (let ((all-chars '()))
26    ;; Read all characters and names from the input file.
27    (with-open-file (istream inputfile :direction :input)
28      (loop
29        (let ((line (read-line istream nil nil)))
30          (unless line (return))
31          (let* ((i1 (position #\; line))
32                 (i2 (position #\; line :start (1+ i1)))
33                 (code-string (subseq line 0 i1))
34                 (code (parse-integer code-string :radix 16))
35                 (name-string (subseq line (1+ i1) i2)))
36            ; Ignore characters whose name starts with "<".
37            (unless (eql (char name-string 0) #\<)
38              ; Also ignore Hangul syllables; they are treated specially.
39              (unless (<= #xAC00 code #xD7A3)
40                ; Also ignore CJK compatibility ideographs; they are treated
41                ; specially as well.
42                (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
43                            (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
44                  ; Transform the code so that it fits in 16 bits. In
45                  ; Unicode 3.1 the following ranges are used.
46                  ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
47                  ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x5
48                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
49                  ;   0x10300..0x104FF  >>12=  0x10        -> 0x7
50                  ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x8
51                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x9
52                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xA
53                  (flet ((transform (x)
54                           (dpb
55                             (case (ash x -12)
56                               ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
57                               (#x0A 5)
58                               (#x0F 6)
59                               (#x10 7)
60                               (#x1D 8)
61                               (#x2F 9)
62                               (#xE0 #xA)
63                               (t (error "Update the transform function for 0x~5,'0X" x))
64                             )
65                             (byte 8 12)
66                             x
67                        )) )
68                    (push (make-unicode-char :code (transform code)
69                                             :name name-string)
70                          all-chars
71            ) ) ) ) )
72    ) ) ) )
73    (setq all-chars (nreverse all-chars))
74    ;; Split into words.
75    (let ((words-by-length (make-array 0 :adjustable t)))
76      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
77        (let ((i1 0))
78          (loop
79            (when (>= i1 (length name)) (return))
80            (let ((i2 (or (position #\Space name :start i1) (length name))))
81              (let* ((word (subseq name i1 i2))
82                     (len (length word)))
83                (when (>= len (length words-by-length))
84                  (adjust-array words-by-length (1+ len))
85                )
86                (unless (aref words-by-length len)
87                  (setf (aref words-by-length len)
88                        (make-word-list
89                          :hashed (make-hash-table :test #'equal)
90                          :sorted '()
91                ) )     )
92                (let ((word-list (aref words-by-length len)))
93                  (unless (gethash word (word-list-hashed word-list))
94                    (setf (gethash word (word-list-hashed word-list)) t)
95                    (push word (word-list-sorted word-list))
96                ) )
97              )
98              (setq i1 (1+ i2))
99      ) ) ) )
100      ;; Sort the word lists.
101      (dotimes (len (length words-by-length))
102        (unless (aref words-by-length len)
103          (setf (aref words-by-length len)
104                (make-word-list
105                  :hashed (make-hash-table :test #'equal)
106                  :sorted '()
107        ) )     )
108        (let ((word-list (aref words-by-length len)))
109          (setf (word-list-sorted word-list)
110                (sort (word-list-sorted word-list) #'string<)
111          )
112          (setf (word-list-size word-list)
113                (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
114          )
115          (setf (word-list-length word-list)
116                (length (word-list-sorted word-list))
117      ) ) )
118      ;; Output the tables.
119      (with-open-file (ostream outputfile :direction :output
120                       #+UNICODE :external-format #+UNICODE charset:ascii)
121        (format ostream "/*~%")
122        (format ostream " * ~A~%" (file-namestring outputfile))
123        (format ostream " *~%")
124        (format ostream " * Unicode character name table.~%")
125        (format ostream " * Generated automatically by the gen-uninames utility.~%")
126        (format ostream " */~%")
127        (format ostream "~%")
128        (format ostream "static const char unicode_name_words[~D] = {~%"
129                        (let ((sum 0))
130                          (dotimes (len (length words-by-length))
131                            (let ((word-list (aref words-by-length len)))
132                              (incf sum (word-list-size word-list))
133                          ) )
134                          sum
135        )               )
136        (dotimes (len (length words-by-length))
137          (let ((word-list (aref words-by-length len)))
138            (dolist (word (word-list-sorted word-list))
139              (format ostream " ~{ '~C',~}~%" (coerce word 'list))
140        ) ) )
141        (format ostream "};~%")
142        (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
143                        (let ((sum 0))
144                          (dotimes (len (length words-by-length))
145                            (let ((word-list (aref words-by-length len)))
146                              (incf sum (word-list-length word-list))
147                          ) )
148                          sum
149        )               )
150        #| ; Redundant data
151        (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
152                        (let ((sum 0))
153                          (dotimes (len (length words-by-length))
154                            (let ((word-list (aref words-by-length len)))
155                              (incf sum (word-list-length word-list))
156                          ) )
157                          sum
158        )               )
159        (dotimes (len (length words-by-length))
160          (let ((word-list (aref words-by-length len)))
161            (when (word-list-sorted word-list)
162              (format ostream " ")
163              (do ((l (word-list-sorted word-list) (cdr l))
164                   (offset 0 (+ offset (length (car l)))))
165                  ((endp l))
166                (format ostream "~<~% ~0,79:; ~D,~>" offset)
167              )
168              (format ostream "~%")
169        ) ) )
170        (format ostream "};~%")
171        |#
172        (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
173                        (1+ (length words-by-length))
174        )
175        (let ((extra-offset 0)
176              (ind-offset 0))
177          (dotimes (len (length words-by-length))
178            (let ((word-list (aref words-by-length len)))
179              (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
180              (incf extra-offset (word-list-size word-list))
181              (incf ind-offset (word-list-length word-list))
182          ) )
183          (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
184        )
185        (format ostream "};~%")
186        (let ((ind-offset 0))
187          (dotimes (len (length words-by-length))
188            (let ((word-list (aref words-by-length len)))
189              (dolist (word (word-list-sorted word-list))
190                (setf (gethash word (word-list-hashed word-list)) ind-offset)
191                (incf ind-offset)
192        ) ) ) )
193        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
194          (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
195                          (gethash word (word-list-hashed (aref words-by-length (length word))))
196        ) )
197        ;; Compute the word-indices for every unicode-char.
198        (dolist (uc all-chars)
199          (let ((name (unicode-char-name uc))
200                (indices '()))
201            (let ((i1 0))
202              (loop
203                (when (>= i1 (length name)) (return))
204                (let ((i2 (or (position #\Space name :start i1) (length name))))
205                  (let* ((word (subseq name i1 i2))
206                         (len (length word)))
207                    (push (gethash word (word-list-hashed (aref words-by-length len)))
208                          indices
209                    )
210                  )
211                  (setq i1 (1+ i2))
212            ) ) )
213            (setf (unicode-char-word-indices uc)
214                  (coerce (nreverse indices) 'vector)
215            )
216        ) )
217        ;; Sort the list of unicode-chars by word-indices.
218        (setq all-chars
219              (sort all-chars
220                    (lambda (vec1 vec2)
221                      (let ((len1 (length vec1))
222                            (len2 (length vec2)))
223                        (do ((i 0 (1+ i)))
224                            (nil)
225                          (if (< i len2)
226                            (if (< i len1)
227                              (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
228                                    ((> (aref vec1 i) (aref vec2 i)) (return nil))
229                              )
230                              (return t)
231                            )
232                            (return nil)
233                    ) ) ) )
234                    :key #'unicode-char-word-indices
235        )     )
236        ;; Output the word-indices.
237        (format ostream "static const uint16_t unicode_names[~D] = {~%"
238                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
239        )
240        (let ((i 0))
241          (dolist (uc all-chars)
242            (format ostream " ~{ ~D,~}"
243                            (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
244                                     (coerce (unicode-char-word-indices uc) 'list)
245                            )
246            )
247            (when add-comments
248              (format ostream "~40T/* ~A */" (unicode-char-name uc))
249            )
250            (format ostream "~%")
251            (setf (unicode-char-word-indices-index uc) i)
252            (incf i (length (unicode-char-word-indices uc)))
253        ) )
254        (format ostream "};~%")
255        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
256                        (length all-chars)
257        )
258        (dolist (uc all-chars)
259          (format ostream "  { 0x~4,'0X, ~D },"
260                          (unicode-char-code uc)
261                          (unicode-char-word-indices-index uc)
262          )
263          (when add-comments
264            (format ostream "~21T/* ~A */" (unicode-char-name uc))
265          )
266          (format ostream "~%")
267        )
268        (format ostream "};~%")
269        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
270                        (length all-chars)
271        )
272        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
273          (format ostream "  { 0x~4,'0X, ~D },"
274                          (unicode-char-code uc)
275                          (unicode-char-word-indices-index uc)
276          )
277          (when add-comments
278            (format ostream "~21T/* ~A */" (unicode-char-name uc))
279          )
280          (format ostream "~%")
281        )
282        (format ostream "};~%")
283        (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
284                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
285        )
286        (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
287                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
288        )
289      )
290) ) )
291
292(main (first *args*) (second *args*))
293