xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/libuniname/gen-uninames (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1*946379e7Schristos#!/usr/local/bin/clisp -C
2*946379e7Schristos
3*946379e7Schristos;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
4*946379e7Schristos;;; Bruno Haible 2000-12-28
5*946379e7Schristos
6*946379e7Schristos(defparameter add-comments nil)
7*946379e7Schristos
8*946379e7Schristos(defstruct unicode-char
9*946379e7Schristos  (code nil :type integer)
10*946379e7Schristos  (name nil :type string)
11*946379e7Schristos  word-indices
12*946379e7Schristos  word-indices-index
13*946379e7Schristos)
14*946379e7Schristos
15*946379e7Schristos(defstruct word-list
16*946379e7Schristos  (hashed nil :type hash-table)
17*946379e7Schristos  (sorted nil :type list)
18*946379e7Schristos  size                          ; number of characters total
19*946379e7Schristos  length                        ; number of words
20*946379e7Schristos)
21*946379e7Schristos
22*946379e7Schristos(defun main (inputfile outputfile)
23*946379e7Schristos  (declare (type string inputfile outputfile))
24*946379e7Schristos  #+UNICODE (setq *default-file-encoding* charset:utf-8)
25*946379e7Schristos  (let ((all-chars '()))
26*946379e7Schristos    ;; Read all characters and names from the input file.
27*946379e7Schristos    (with-open-file (istream inputfile :direction :input)
28*946379e7Schristos      (loop
29*946379e7Schristos        (let ((line (read-line istream nil nil)))
30*946379e7Schristos          (unless line (return))
31*946379e7Schristos          (let* ((i1 (position #\; line))
32*946379e7Schristos                 (i2 (position #\; line :start (1+ i1)))
33*946379e7Schristos                 (code-string (subseq line 0 i1))
34*946379e7Schristos                 (code (parse-integer code-string :radix 16))
35*946379e7Schristos                 (name-string (subseq line (1+ i1) i2)))
36*946379e7Schristos            ; Ignore characters whose name starts with "<".
37*946379e7Schristos            (unless (eql (char name-string 0) #\<)
38*946379e7Schristos              ; Also ignore Hangul syllables; they are treated specially.
39*946379e7Schristos              (unless (<= #xAC00 code #xD7A3)
40*946379e7Schristos                ; Also ignore CJK compatibility ideographs; they are treated
41*946379e7Schristos                ; specially as well.
42*946379e7Schristos                (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
43*946379e7Schristos                            (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
44*946379e7Schristos                  ; Transform the code so that it fits in 16 bits. In
45*946379e7Schristos                  ; Unicode 3.1 the following ranges are used.
46*946379e7Schristos                  ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
47*946379e7Schristos                  ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x5
48*946379e7Schristos                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
49*946379e7Schristos                  ;   0x10300..0x104FF  >>12=  0x10        -> 0x7
50*946379e7Schristos                  ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x8
51*946379e7Schristos                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x9
52*946379e7Schristos                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xA
53*946379e7Schristos                  (flet ((transform (x)
54*946379e7Schristos                           (dpb
55*946379e7Schristos                             (case (ash x -12)
56*946379e7Schristos                               ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
57*946379e7Schristos                               (#x0A 5)
58*946379e7Schristos                               (#x0F 6)
59*946379e7Schristos                               (#x10 7)
60*946379e7Schristos                               (#x1D 8)
61*946379e7Schristos                               (#x2F 9)
62*946379e7Schristos                               (#xE0 #xA)
63*946379e7Schristos                               (t (error "Update the transform function for 0x~5,'0X" x))
64*946379e7Schristos                             )
65*946379e7Schristos                             (byte 8 12)
66*946379e7Schristos                             x
67*946379e7Schristos                        )) )
68*946379e7Schristos                    (push (make-unicode-char :code (transform code)
69*946379e7Schristos                                             :name name-string)
70*946379e7Schristos                          all-chars
71*946379e7Schristos            ) ) ) ) )
72*946379e7Schristos    ) ) ) )
73*946379e7Schristos    (setq all-chars (nreverse all-chars))
74*946379e7Schristos    ;; Split into words.
75*946379e7Schristos    (let ((words-by-length (make-array 0 :adjustable t)))
76*946379e7Schristos      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
77*946379e7Schristos        (let ((i1 0))
78*946379e7Schristos          (loop
79*946379e7Schristos            (when (>= i1 (length name)) (return))
80*946379e7Schristos            (let ((i2 (or (position #\Space name :start i1) (length name))))
81*946379e7Schristos              (let* ((word (subseq name i1 i2))
82*946379e7Schristos                     (len (length word)))
83*946379e7Schristos                (when (>= len (length words-by-length))
84*946379e7Schristos                  (adjust-array words-by-length (1+ len))
85*946379e7Schristos                )
86*946379e7Schristos                (unless (aref words-by-length len)
87*946379e7Schristos                  (setf (aref words-by-length len)
88*946379e7Schristos                        (make-word-list
89*946379e7Schristos                          :hashed (make-hash-table :test #'equal)
90*946379e7Schristos                          :sorted '()
91*946379e7Schristos                ) )     )
92*946379e7Schristos                (let ((word-list (aref words-by-length len)))
93*946379e7Schristos                  (unless (gethash word (word-list-hashed word-list))
94*946379e7Schristos                    (setf (gethash word (word-list-hashed word-list)) t)
95*946379e7Schristos                    (push word (word-list-sorted word-list))
96*946379e7Schristos                ) )
97*946379e7Schristos              )
98*946379e7Schristos              (setq i1 (1+ i2))
99*946379e7Schristos      ) ) ) )
100*946379e7Schristos      ;; Sort the word lists.
101*946379e7Schristos      (dotimes (len (length words-by-length))
102*946379e7Schristos        (unless (aref words-by-length len)
103*946379e7Schristos          (setf (aref words-by-length len)
104*946379e7Schristos                (make-word-list
105*946379e7Schristos                  :hashed (make-hash-table :test #'equal)
106*946379e7Schristos                  :sorted '()
107*946379e7Schristos        ) )     )
108*946379e7Schristos        (let ((word-list (aref words-by-length len)))
109*946379e7Schristos          (setf (word-list-sorted word-list)
110*946379e7Schristos                (sort (word-list-sorted word-list) #'string<)
111*946379e7Schristos          )
112*946379e7Schristos          (setf (word-list-size word-list)
113*946379e7Schristos                (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
114*946379e7Schristos          )
115*946379e7Schristos          (setf (word-list-length word-list)
116*946379e7Schristos                (length (word-list-sorted word-list))
117*946379e7Schristos      ) ) )
118*946379e7Schristos      ;; Output the tables.
119*946379e7Schristos      (with-open-file (ostream outputfile :direction :output
120*946379e7Schristos                       #+UNICODE :external-format #+UNICODE charset:ascii)
121*946379e7Schristos        (format ostream "/*~%")
122*946379e7Schristos        (format ostream " * ~A~%" (file-namestring outputfile))
123*946379e7Schristos        (format ostream " *~%")
124*946379e7Schristos        (format ostream " * Unicode character name table.~%")
125*946379e7Schristos        (format ostream " * Generated automatically by the gen-uninames utility.~%")
126*946379e7Schristos        (format ostream " */~%")
127*946379e7Schristos        (format ostream "~%")
128*946379e7Schristos        (format ostream "static const char unicode_name_words[~D] = {~%"
129*946379e7Schristos                        (let ((sum 0))
130*946379e7Schristos                          (dotimes (len (length words-by-length))
131*946379e7Schristos                            (let ((word-list (aref words-by-length len)))
132*946379e7Schristos                              (incf sum (word-list-size word-list))
133*946379e7Schristos                          ) )
134*946379e7Schristos                          sum
135*946379e7Schristos        )               )
136*946379e7Schristos        (dotimes (len (length words-by-length))
137*946379e7Schristos          (let ((word-list (aref words-by-length len)))
138*946379e7Schristos            (dolist (word (word-list-sorted word-list))
139*946379e7Schristos              (format ostream " ~{ '~C',~}~%" (coerce word 'list))
140*946379e7Schristos        ) ) )
141*946379e7Schristos        (format ostream "};~%")
142*946379e7Schristos        (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
143*946379e7Schristos                        (let ((sum 0))
144*946379e7Schristos                          (dotimes (len (length words-by-length))
145*946379e7Schristos                            (let ((word-list (aref words-by-length len)))
146*946379e7Schristos                              (incf sum (word-list-length word-list))
147*946379e7Schristos                          ) )
148*946379e7Schristos                          sum
149*946379e7Schristos        )               )
150*946379e7Schristos        #| ; Redundant data
151*946379e7Schristos        (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
152*946379e7Schristos                        (let ((sum 0))
153*946379e7Schristos                          (dotimes (len (length words-by-length))
154*946379e7Schristos                            (let ((word-list (aref words-by-length len)))
155*946379e7Schristos                              (incf sum (word-list-length word-list))
156*946379e7Schristos                          ) )
157*946379e7Schristos                          sum
158*946379e7Schristos        )               )
159*946379e7Schristos        (dotimes (len (length words-by-length))
160*946379e7Schristos          (let ((word-list (aref words-by-length len)))
161*946379e7Schristos            (when (word-list-sorted word-list)
162*946379e7Schristos              (format ostream " ")
163*946379e7Schristos              (do ((l (word-list-sorted word-list) (cdr l))
164*946379e7Schristos                   (offset 0 (+ offset (length (car l)))))
165*946379e7Schristos                  ((endp l))
166*946379e7Schristos                (format ostream "~<~% ~0,79:; ~D,~>" offset)
167*946379e7Schristos              )
168*946379e7Schristos              (format ostream "~%")
169*946379e7Schristos        ) ) )
170*946379e7Schristos        (format ostream "};~%")
171*946379e7Schristos        |#
172*946379e7Schristos        (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
173*946379e7Schristos                        (1+ (length words-by-length))
174*946379e7Schristos        )
175*946379e7Schristos        (let ((extra-offset 0)
176*946379e7Schristos              (ind-offset 0))
177*946379e7Schristos          (dotimes (len (length words-by-length))
178*946379e7Schristos            (let ((word-list (aref words-by-length len)))
179*946379e7Schristos              (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
180*946379e7Schristos              (incf extra-offset (word-list-size word-list))
181*946379e7Schristos              (incf ind-offset (word-list-length word-list))
182*946379e7Schristos          ) )
183*946379e7Schristos          (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
184*946379e7Schristos        )
185*946379e7Schristos        (format ostream "};~%")
186*946379e7Schristos        (let ((ind-offset 0))
187*946379e7Schristos          (dotimes (len (length words-by-length))
188*946379e7Schristos            (let ((word-list (aref words-by-length len)))
189*946379e7Schristos              (dolist (word (word-list-sorted word-list))
190*946379e7Schristos                (setf (gethash word (word-list-hashed word-list)) ind-offset)
191*946379e7Schristos                (incf ind-offset)
192*946379e7Schristos        ) ) ) )
193*946379e7Schristos        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
194*946379e7Schristos          (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
195*946379e7Schristos                          (gethash word (word-list-hashed (aref words-by-length (length word))))
196*946379e7Schristos        ) )
197*946379e7Schristos        ;; Compute the word-indices for every unicode-char.
198*946379e7Schristos        (dolist (uc all-chars)
199*946379e7Schristos          (let ((name (unicode-char-name uc))
200*946379e7Schristos                (indices '()))
201*946379e7Schristos            (let ((i1 0))
202*946379e7Schristos              (loop
203*946379e7Schristos                (when (>= i1 (length name)) (return))
204*946379e7Schristos                (let ((i2 (or (position #\Space name :start i1) (length name))))
205*946379e7Schristos                  (let* ((word (subseq name i1 i2))
206*946379e7Schristos                         (len (length word)))
207*946379e7Schristos                    (push (gethash word (word-list-hashed (aref words-by-length len)))
208*946379e7Schristos                          indices
209*946379e7Schristos                    )
210*946379e7Schristos                  )
211*946379e7Schristos                  (setq i1 (1+ i2))
212*946379e7Schristos            ) ) )
213*946379e7Schristos            (setf (unicode-char-word-indices uc)
214*946379e7Schristos                  (coerce (nreverse indices) 'vector)
215*946379e7Schristos            )
216*946379e7Schristos        ) )
217*946379e7Schristos        ;; Sort the list of unicode-chars by word-indices.
218*946379e7Schristos        (setq all-chars
219*946379e7Schristos              (sort all-chars
220*946379e7Schristos                    (lambda (vec1 vec2)
221*946379e7Schristos                      (let ((len1 (length vec1))
222*946379e7Schristos                            (len2 (length vec2)))
223*946379e7Schristos                        (do ((i 0 (1+ i)))
224*946379e7Schristos                            (nil)
225*946379e7Schristos                          (if (< i len2)
226*946379e7Schristos                            (if (< i len1)
227*946379e7Schristos                              (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
228*946379e7Schristos                                    ((> (aref vec1 i) (aref vec2 i)) (return nil))
229*946379e7Schristos                              )
230*946379e7Schristos                              (return t)
231*946379e7Schristos                            )
232*946379e7Schristos                            (return nil)
233*946379e7Schristos                    ) ) ) )
234*946379e7Schristos                    :key #'unicode-char-word-indices
235*946379e7Schristos        )     )
236*946379e7Schristos        ;; Output the word-indices.
237*946379e7Schristos        (format ostream "static const uint16_t unicode_names[~D] = {~%"
238*946379e7Schristos                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
239*946379e7Schristos        )
240*946379e7Schristos        (let ((i 0))
241*946379e7Schristos          (dolist (uc all-chars)
242*946379e7Schristos            (format ostream " ~{ ~D,~}"
243*946379e7Schristos                            (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
244*946379e7Schristos                                     (coerce (unicode-char-word-indices uc) 'list)
245*946379e7Schristos                            )
246*946379e7Schristos            )
247*946379e7Schristos            (when add-comments
248*946379e7Schristos              (format ostream "~40T/* ~A */" (unicode-char-name uc))
249*946379e7Schristos            )
250*946379e7Schristos            (format ostream "~%")
251*946379e7Schristos            (setf (unicode-char-word-indices-index uc) i)
252*946379e7Schristos            (incf i (length (unicode-char-word-indices uc)))
253*946379e7Schristos        ) )
254*946379e7Schristos        (format ostream "};~%")
255*946379e7Schristos        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
256*946379e7Schristos                        (length all-chars)
257*946379e7Schristos        )
258*946379e7Schristos        (dolist (uc all-chars)
259*946379e7Schristos          (format ostream "  { 0x~4,'0X, ~D },"
260*946379e7Schristos                          (unicode-char-code uc)
261*946379e7Schristos                          (unicode-char-word-indices-index uc)
262*946379e7Schristos          )
263*946379e7Schristos          (when add-comments
264*946379e7Schristos            (format ostream "~21T/* ~A */" (unicode-char-name uc))
265*946379e7Schristos          )
266*946379e7Schristos          (format ostream "~%")
267*946379e7Schristos        )
268*946379e7Schristos        (format ostream "};~%")
269*946379e7Schristos        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
270*946379e7Schristos                        (length all-chars)
271*946379e7Schristos        )
272*946379e7Schristos        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
273*946379e7Schristos          (format ostream "  { 0x~4,'0X, ~D },"
274*946379e7Schristos                          (unicode-char-code uc)
275*946379e7Schristos                          (unicode-char-word-indices-index uc)
276*946379e7Schristos          )
277*946379e7Schristos          (when add-comments
278*946379e7Schristos            (format ostream "~21T/* ~A */" (unicode-char-name uc))
279*946379e7Schristos          )
280*946379e7Schristos          (format ostream "~%")
281*946379e7Schristos        )
282*946379e7Schristos        (format ostream "};~%")
283*946379e7Schristos        (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
284*946379e7Schristos                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
285*946379e7Schristos        )
286*946379e7Schristos        (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
287*946379e7Schristos                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
288*946379e7Schristos        )
289*946379e7Schristos      )
290*946379e7Schristos) ) )
291*946379e7Schristos
292*946379e7Schristos(main (first *args*) (second *args*))
293