1*946379e7Schristos#!/usr/local/bin/clisp -C 2*946379e7Schristos 3*946379e7Schristos;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table. 4*946379e7Schristos;;; Bruno Haible 2000-12-28 5*946379e7Schristos 6*946379e7Schristos(defparameter add-comments nil) 7*946379e7Schristos 8*946379e7Schristos(defstruct unicode-char 9*946379e7Schristos (code nil :type integer) 10*946379e7Schristos (name nil :type string) 11*946379e7Schristos word-indices 12*946379e7Schristos word-indices-index 13*946379e7Schristos) 14*946379e7Schristos 15*946379e7Schristos(defstruct word-list 16*946379e7Schristos (hashed nil :type hash-table) 17*946379e7Schristos (sorted nil :type list) 18*946379e7Schristos size ; number of characters total 19*946379e7Schristos length ; number of words 20*946379e7Schristos) 21*946379e7Schristos 22*946379e7Schristos(defun main (inputfile outputfile) 23*946379e7Schristos (declare (type string inputfile outputfile)) 24*946379e7Schristos #+UNICODE (setq *default-file-encoding* charset:utf-8) 25*946379e7Schristos (let ((all-chars '())) 26*946379e7Schristos ;; Read all characters and names from the input file. 27*946379e7Schristos (with-open-file (istream inputfile :direction :input) 28*946379e7Schristos (loop 29*946379e7Schristos (let ((line (read-line istream nil nil))) 30*946379e7Schristos (unless line (return)) 31*946379e7Schristos (let* ((i1 (position #\; line)) 32*946379e7Schristos (i2 (position #\; line :start (1+ i1))) 33*946379e7Schristos (code-string (subseq line 0 i1)) 34*946379e7Schristos (code (parse-integer code-string :radix 16)) 35*946379e7Schristos (name-string (subseq line (1+ i1) i2))) 36*946379e7Schristos ; Ignore characters whose name starts with "<". 37*946379e7Schristos (unless (eql (char name-string 0) #\<) 38*946379e7Schristos ; Also ignore Hangul syllables; they are treated specially. 39*946379e7Schristos (unless (<= #xAC00 code #xD7A3) 40*946379e7Schristos ; Also ignore CJK compatibility ideographs; they are treated 41*946379e7Schristos ; specially as well. 42*946379e7Schristos (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) 43*946379e7Schristos (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) 44*946379e7Schristos ; Transform the code so that it fits in 16 bits. In 45*946379e7Schristos ; Unicode 3.1 the following ranges are used. 46*946379e7Schristos ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4 47*946379e7Schristos ; 0x0A000..0x0A4FF >>12= 0x0A -> 0x5 48*946379e7Schristos ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6 49*946379e7Schristos ; 0x10300..0x104FF >>12= 0x10 -> 0x7 50*946379e7Schristos ; 0x1D000..0x1D7DD >>12= 0x1D -> 0x8 51*946379e7Schristos ; 0x2F800..0x2FAFF >>12= 0x2F -> 0x9 52*946379e7Schristos ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xA 53*946379e7Schristos (flet ((transform (x) 54*946379e7Schristos (dpb 55*946379e7Schristos (case (ash x -12) 56*946379e7Schristos ((#x00 #x01 #x02 #x03 #x04) (ash x -12)) 57*946379e7Schristos (#x0A 5) 58*946379e7Schristos (#x0F 6) 59*946379e7Schristos (#x10 7) 60*946379e7Schristos (#x1D 8) 61*946379e7Schristos (#x2F 9) 62*946379e7Schristos (#xE0 #xA) 63*946379e7Schristos (t (error "Update the transform function for 0x~5,'0X" x)) 64*946379e7Schristos ) 65*946379e7Schristos (byte 8 12) 66*946379e7Schristos x 67*946379e7Schristos )) ) 68*946379e7Schristos (push (make-unicode-char :code (transform code) 69*946379e7Schristos :name name-string) 70*946379e7Schristos all-chars 71*946379e7Schristos ) ) ) ) ) 72*946379e7Schristos ) ) ) ) 73*946379e7Schristos (setq all-chars (nreverse all-chars)) 74*946379e7Schristos ;; Split into words. 75*946379e7Schristos (let ((words-by-length (make-array 0 :adjustable t))) 76*946379e7Schristos (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) 77*946379e7Schristos (let ((i1 0)) 78*946379e7Schristos (loop 79*946379e7Schristos (when (>= i1 (length name)) (return)) 80*946379e7Schristos (let ((i2 (or (position #\Space name :start i1) (length name)))) 81*946379e7Schristos (let* ((word (subseq name i1 i2)) 82*946379e7Schristos (len (length word))) 83*946379e7Schristos (when (>= len (length words-by-length)) 84*946379e7Schristos (adjust-array words-by-length (1+ len)) 85*946379e7Schristos ) 86*946379e7Schristos (unless (aref words-by-length len) 87*946379e7Schristos (setf (aref words-by-length len) 88*946379e7Schristos (make-word-list 89*946379e7Schristos :hashed (make-hash-table :test #'equal) 90*946379e7Schristos :sorted '() 91*946379e7Schristos ) ) ) 92*946379e7Schristos (let ((word-list (aref words-by-length len))) 93*946379e7Schristos (unless (gethash word (word-list-hashed word-list)) 94*946379e7Schristos (setf (gethash word (word-list-hashed word-list)) t) 95*946379e7Schristos (push word (word-list-sorted word-list)) 96*946379e7Schristos ) ) 97*946379e7Schristos ) 98*946379e7Schristos (setq i1 (1+ i2)) 99*946379e7Schristos ) ) ) ) 100*946379e7Schristos ;; Sort the word lists. 101*946379e7Schristos (dotimes (len (length words-by-length)) 102*946379e7Schristos (unless (aref words-by-length len) 103*946379e7Schristos (setf (aref words-by-length len) 104*946379e7Schristos (make-word-list 105*946379e7Schristos :hashed (make-hash-table :test #'equal) 106*946379e7Schristos :sorted '() 107*946379e7Schristos ) ) ) 108*946379e7Schristos (let ((word-list (aref words-by-length len))) 109*946379e7Schristos (setf (word-list-sorted word-list) 110*946379e7Schristos (sort (word-list-sorted word-list) #'string<) 111*946379e7Schristos ) 112*946379e7Schristos (setf (word-list-size word-list) 113*946379e7Schristos (reduce #'+ (mapcar #'length (word-list-sorted word-list))) 114*946379e7Schristos ) 115*946379e7Schristos (setf (word-list-length word-list) 116*946379e7Schristos (length (word-list-sorted word-list)) 117*946379e7Schristos ) ) ) 118*946379e7Schristos ;; Output the tables. 119*946379e7Schristos (with-open-file (ostream outputfile :direction :output 120*946379e7Schristos #+UNICODE :external-format #+UNICODE charset:ascii) 121*946379e7Schristos (format ostream "/*~%") 122*946379e7Schristos (format ostream " * ~A~%" (file-namestring outputfile)) 123*946379e7Schristos (format ostream " *~%") 124*946379e7Schristos (format ostream " * Unicode character name table.~%") 125*946379e7Schristos (format ostream " * Generated automatically by the gen-uninames utility.~%") 126*946379e7Schristos (format ostream " */~%") 127*946379e7Schristos (format ostream "~%") 128*946379e7Schristos (format ostream "static const char unicode_name_words[~D] = {~%" 129*946379e7Schristos (let ((sum 0)) 130*946379e7Schristos (dotimes (len (length words-by-length)) 131*946379e7Schristos (let ((word-list (aref words-by-length len))) 132*946379e7Schristos (incf sum (word-list-size word-list)) 133*946379e7Schristos ) ) 134*946379e7Schristos sum 135*946379e7Schristos ) ) 136*946379e7Schristos (dotimes (len (length words-by-length)) 137*946379e7Schristos (let ((word-list (aref words-by-length len))) 138*946379e7Schristos (dolist (word (word-list-sorted word-list)) 139*946379e7Schristos (format ostream " ~{ '~C',~}~%" (coerce word 'list)) 140*946379e7Schristos ) ) ) 141*946379e7Schristos (format ostream "};~%") 142*946379e7Schristos (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%" 143*946379e7Schristos (let ((sum 0)) 144*946379e7Schristos (dotimes (len (length words-by-length)) 145*946379e7Schristos (let ((word-list (aref words-by-length len))) 146*946379e7Schristos (incf sum (word-list-length word-list)) 147*946379e7Schristos ) ) 148*946379e7Schristos sum 149*946379e7Schristos ) ) 150*946379e7Schristos #| ; Redundant data 151*946379e7Schristos (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%" 152*946379e7Schristos (let ((sum 0)) 153*946379e7Schristos (dotimes (len (length words-by-length)) 154*946379e7Schristos (let ((word-list (aref words-by-length len))) 155*946379e7Schristos (incf sum (word-list-length word-list)) 156*946379e7Schristos ) ) 157*946379e7Schristos sum 158*946379e7Schristos ) ) 159*946379e7Schristos (dotimes (len (length words-by-length)) 160*946379e7Schristos (let ((word-list (aref words-by-length len))) 161*946379e7Schristos (when (word-list-sorted word-list) 162*946379e7Schristos (format ostream " ") 163*946379e7Schristos (do ((l (word-list-sorted word-list) (cdr l)) 164*946379e7Schristos (offset 0 (+ offset (length (car l))))) 165*946379e7Schristos ((endp l)) 166*946379e7Schristos (format ostream "~<~% ~0,79:; ~D,~>" offset) 167*946379e7Schristos ) 168*946379e7Schristos (format ostream "~%") 169*946379e7Schristos ) ) ) 170*946379e7Schristos (format ostream "};~%") 171*946379e7Schristos |# 172*946379e7Schristos (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%" 173*946379e7Schristos (1+ (length words-by-length)) 174*946379e7Schristos ) 175*946379e7Schristos (let ((extra-offset 0) 176*946379e7Schristos (ind-offset 0)) 177*946379e7Schristos (dotimes (len (length words-by-length)) 178*946379e7Schristos (let ((word-list (aref words-by-length len))) 179*946379e7Schristos (format ostream " { ~D, ~D },~%" extra-offset ind-offset) 180*946379e7Schristos (incf extra-offset (word-list-size word-list)) 181*946379e7Schristos (incf ind-offset (word-list-length word-list)) 182*946379e7Schristos ) ) 183*946379e7Schristos (format ostream " { ~D, ~D }~%" extra-offset ind-offset) 184*946379e7Schristos ) 185*946379e7Schristos (format ostream "};~%") 186*946379e7Schristos (let ((ind-offset 0)) 187*946379e7Schristos (dotimes (len (length words-by-length)) 188*946379e7Schristos (let ((word-list (aref words-by-length len))) 189*946379e7Schristos (dolist (word (word-list-sorted word-list)) 190*946379e7Schristos (setf (gethash word (word-list-hashed word-list)) ind-offset) 191*946379e7Schristos (incf ind-offset) 192*946379e7Schristos ) ) ) ) 193*946379e7Schristos (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY")) 194*946379e7Schristos (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word 195*946379e7Schristos (gethash word (word-list-hashed (aref words-by-length (length word)))) 196*946379e7Schristos ) ) 197*946379e7Schristos ;; Compute the word-indices for every unicode-char. 198*946379e7Schristos (dolist (uc all-chars) 199*946379e7Schristos (let ((name (unicode-char-name uc)) 200*946379e7Schristos (indices '())) 201*946379e7Schristos (let ((i1 0)) 202*946379e7Schristos (loop 203*946379e7Schristos (when (>= i1 (length name)) (return)) 204*946379e7Schristos (let ((i2 (or (position #\Space name :start i1) (length name)))) 205*946379e7Schristos (let* ((word (subseq name i1 i2)) 206*946379e7Schristos (len (length word))) 207*946379e7Schristos (push (gethash word (word-list-hashed (aref words-by-length len))) 208*946379e7Schristos indices 209*946379e7Schristos ) 210*946379e7Schristos ) 211*946379e7Schristos (setq i1 (1+ i2)) 212*946379e7Schristos ) ) ) 213*946379e7Schristos (setf (unicode-char-word-indices uc) 214*946379e7Schristos (coerce (nreverse indices) 'vector) 215*946379e7Schristos ) 216*946379e7Schristos ) ) 217*946379e7Schristos ;; Sort the list of unicode-chars by word-indices. 218*946379e7Schristos (setq all-chars 219*946379e7Schristos (sort all-chars 220*946379e7Schristos (lambda (vec1 vec2) 221*946379e7Schristos (let ((len1 (length vec1)) 222*946379e7Schristos (len2 (length vec2))) 223*946379e7Schristos (do ((i 0 (1+ i))) 224*946379e7Schristos (nil) 225*946379e7Schristos (if (< i len2) 226*946379e7Schristos (if (< i len1) 227*946379e7Schristos (cond ((< (aref vec1 i) (aref vec2 i)) (return t)) 228*946379e7Schristos ((> (aref vec1 i) (aref vec2 i)) (return nil)) 229*946379e7Schristos ) 230*946379e7Schristos (return t) 231*946379e7Schristos ) 232*946379e7Schristos (return nil) 233*946379e7Schristos ) ) ) ) 234*946379e7Schristos :key #'unicode-char-word-indices 235*946379e7Schristos ) ) 236*946379e7Schristos ;; Output the word-indices. 237*946379e7Schristos (format ostream "static const uint16_t unicode_names[~D] = {~%" 238*946379e7Schristos (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 239*946379e7Schristos ) 240*946379e7Schristos (let ((i 0)) 241*946379e7Schristos (dolist (uc all-chars) 242*946379e7Schristos (format ostream " ~{ ~D,~}" 243*946379e7Schristos (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) 244*946379e7Schristos (coerce (unicode-char-word-indices uc) 'list) 245*946379e7Schristos ) 246*946379e7Schristos ) 247*946379e7Schristos (when add-comments 248*946379e7Schristos (format ostream "~40T/* ~A */" (unicode-char-name uc)) 249*946379e7Schristos ) 250*946379e7Schristos (format ostream "~%") 251*946379e7Schristos (setf (unicode-char-word-indices-index uc) i) 252*946379e7Schristos (incf i (length (unicode-char-word-indices uc))) 253*946379e7Schristos ) ) 254*946379e7Schristos (format ostream "};~%") 255*946379e7Schristos (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%" 256*946379e7Schristos (length all-chars) 257*946379e7Schristos ) 258*946379e7Schristos (dolist (uc all-chars) 259*946379e7Schristos (format ostream " { 0x~4,'0X, ~D }," 260*946379e7Schristos (unicode-char-code uc) 261*946379e7Schristos (unicode-char-word-indices-index uc) 262*946379e7Schristos ) 263*946379e7Schristos (when add-comments 264*946379e7Schristos (format ostream "~21T/* ~A */" (unicode-char-name uc)) 265*946379e7Schristos ) 266*946379e7Schristos (format ostream "~%") 267*946379e7Schristos ) 268*946379e7Schristos (format ostream "};~%") 269*946379e7Schristos (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%" 270*946379e7Schristos (length all-chars) 271*946379e7Schristos ) 272*946379e7Schristos (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code)) 273*946379e7Schristos (format ostream " { 0x~4,'0X, ~D }," 274*946379e7Schristos (unicode-char-code uc) 275*946379e7Schristos (unicode-char-word-indices-index uc) 276*946379e7Schristos ) 277*946379e7Schristos (when add-comments 278*946379e7Schristos (format ostream "~21T/* ~A */" (unicode-char-name uc)) 279*946379e7Schristos ) 280*946379e7Schristos (format ostream "~%") 281*946379e7Schristos ) 282*946379e7Schristos (format ostream "};~%") 283*946379e7Schristos (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" 284*946379e7Schristos (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars)) 285*946379e7Schristos ) 286*946379e7Schristos (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" 287*946379e7Schristos (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 288*946379e7Schristos ) 289*946379e7Schristos ) 290*946379e7Schristos) ) ) 291*946379e7Schristos 292*946379e7Schristos(main (first *args*) (second *args*)) 293