1#!/usr/local/bin/clisp -C 2 3;;; Creation of gnulib's uninames.h from the UnicodeData.txt table. 4;;; Bruno Haible 2000-12-28 5 6(defparameter add-comments nil) 7 8(defstruct unicode-char 9 (code nil :type integer) 10 (name nil :type string) 11 word-indices 12 word-indices-index 13) 14 15(defstruct word-list 16 (hashed nil :type hash-table) 17 (sorted nil :type list) 18 size ; number of characters total 19 length ; number of words 20) 21 22(defun main (inputfile outputfile) 23 (declare (type string inputfile outputfile)) 24 #+UNICODE (setq *default-file-encoding* charset:utf-8) 25 (let ((all-chars '())) 26 ;; Read all characters and names from the input file. 27 (with-open-file (istream inputfile :direction :input) 28 (loop 29 (let ((line (read-line istream nil nil))) 30 (unless line (return)) 31 (let* ((i1 (position #\; line)) 32 (i2 (position #\; line :start (1+ i1))) 33 (code-string (subseq line 0 i1)) 34 (code (parse-integer code-string :radix 16)) 35 (name-string (subseq line (1+ i1) i2))) 36 ; Ignore characters whose name starts with "<". 37 (unless (eql (char name-string 0) #\<) 38 ; Also ignore Hangul syllables; they are treated specially. 39 (unless (<= #xAC00 code #xD7A3) 40 ; Also ignore CJK compatibility ideographs; they are treated 41 ; specially as well. 42 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) 43 (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) 44 ; Transform the code so that it fits in 16 bits. In 45 ; Unicode 3.1 the following ranges are used. 46 ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4 47 ; 0x0A000..0x0A4FF >>12= 0x0A -> 0x5 48 ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6 49 ; 0x10300..0x104FF >>12= 0x10 -> 0x7 50 ; 0x12000..0x12473 >>12= 0x12 -> 0x8 51 ; 0x1D000..0x1D7DD >>12= 0x1D -> 0x9 52 ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xA 53 ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xB 54 (flet ((transform (x) 55 (dpb 56 (case (ash x -12) 57 ((#x00 #x01 #x02 #x03 #x04) (ash x -12)) 58 (#x0A 5) 59 (#x0F 6) 60 (#x10 7) 61 (#x12 8) 62 (#x1D 9) 63 (#x2F #xA) 64 (#xE0 #xB) 65 (t (error "Update the transform function for 0x~5,'0X" x)) 66 ) 67 (byte 8 12) 68 x 69 )) ) 70 (push (make-unicode-char :code (transform code) 71 :name name-string) 72 all-chars 73 ) ) ) ) ) 74 ) ) ) ) 75 (setq all-chars (nreverse all-chars)) 76 ;; Split into words. 77 (let ((words-by-length (make-array 0 :adjustable t))) 78 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) 79 (let ((i1 0)) 80 (loop 81 (when (>= i1 (length name)) (return)) 82 (let ((i2 (or (position #\Space name :start i1) (length name)))) 83 (let* ((word (subseq name i1 i2)) 84 (len (length word))) 85 (when (>= len (length words-by-length)) 86 (adjust-array words-by-length (1+ len)) 87 ) 88 (unless (aref words-by-length len) 89 (setf (aref words-by-length len) 90 (make-word-list 91 :hashed (make-hash-table :test #'equal) 92 :sorted '() 93 ) ) ) 94 (let ((word-list (aref words-by-length len))) 95 (unless (gethash word (word-list-hashed word-list)) 96 (setf (gethash word (word-list-hashed word-list)) t) 97 (push word (word-list-sorted word-list)) 98 ) ) 99 ) 100 (setq i1 (1+ i2)) 101 ) ) ) ) 102 ;; Sort the word lists. 103 (dotimes (len (length words-by-length)) 104 (unless (aref words-by-length len) 105 (setf (aref words-by-length len) 106 (make-word-list 107 :hashed (make-hash-table :test #'equal) 108 :sorted '() 109 ) ) ) 110 (let ((word-list (aref words-by-length len))) 111 (setf (word-list-sorted word-list) 112 (sort (word-list-sorted word-list) #'string<) 113 ) 114 (setf (word-list-size word-list) 115 (reduce #'+ (mapcar #'length (word-list-sorted word-list))) 116 ) 117 (setf (word-list-length word-list) 118 (length (word-list-sorted word-list)) 119 ) ) ) 120 ;; Output the tables. 121 (with-open-file (ostream outputfile :direction :output 122 #+UNICODE :external-format #+UNICODE charset:ascii) 123 (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%") 124 (format ostream "/*~%") 125 (format ostream " * ~A~%" (file-namestring outputfile)) 126 (format ostream " *~%") 127 (format ostream " * Unicode character name table.~%") 128 (format ostream " * Generated automatically by the gen-uninames utility.~%") 129 (format ostream " */~%") 130 (format ostream "~%") 131 (format ostream "static const char unicode_name_words[~D] = {~%" 132 (let ((sum 0)) 133 (dotimes (len (length words-by-length)) 134 (let ((word-list (aref words-by-length len))) 135 (incf sum (word-list-size word-list)) 136 ) ) 137 sum 138 ) ) 139 (dotimes (len (length words-by-length)) 140 (let ((word-list (aref words-by-length len))) 141 (dolist (word (word-list-sorted word-list)) 142 (format ostream " ~{ '~C',~}~%" (coerce word 'list)) 143 ) ) ) 144 (format ostream "};~%") 145 (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%" 146 (let ((sum 0)) 147 (dotimes (len (length words-by-length)) 148 (let ((word-list (aref words-by-length len))) 149 (incf sum (word-list-length word-list)) 150 ) ) 151 sum 152 ) ) 153 #| ; Redundant data 154 (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%" 155 (let ((sum 0)) 156 (dotimes (len (length words-by-length)) 157 (let ((word-list (aref words-by-length len))) 158 (incf sum (word-list-length word-list)) 159 ) ) 160 sum 161 ) ) 162 (dotimes (len (length words-by-length)) 163 (let ((word-list (aref words-by-length len))) 164 (when (word-list-sorted word-list) 165 (format ostream " ") 166 (do ((l (word-list-sorted word-list) (cdr l)) 167 (offset 0 (+ offset (length (car l))))) 168 ((endp l)) 169 (format ostream "~<~% ~0,79:; ~D,~>" offset) 170 ) 171 (format ostream "~%") 172 ) ) ) 173 (format ostream "};~%") 174 |# 175 (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%" 176 (1+ (length words-by-length)) 177 ) 178 (let ((extra-offset 0) 179 (ind-offset 0)) 180 (dotimes (len (length words-by-length)) 181 (let ((word-list (aref words-by-length len))) 182 (format ostream " { ~D, ~D },~%" extra-offset ind-offset) 183 (incf extra-offset (word-list-size word-list)) 184 (incf ind-offset (word-list-length word-list)) 185 ) ) 186 (format ostream " { ~D, ~D }~%" extra-offset ind-offset) 187 ) 188 (format ostream "};~%") 189 (let ((ind-offset 0)) 190 (dotimes (len (length words-by-length)) 191 (let ((word-list (aref words-by-length len))) 192 (dolist (word (word-list-sorted word-list)) 193 (setf (gethash word (word-list-hashed word-list)) ind-offset) 194 (incf ind-offset) 195 ) ) ) ) 196 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY")) 197 (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word 198 (gethash word (word-list-hashed (aref words-by-length (length word)))) 199 ) ) 200 ;; Compute the word-indices for every unicode-char. 201 (dolist (uc all-chars) 202 (let ((name (unicode-char-name uc)) 203 (indices '())) 204 (let ((i1 0)) 205 (loop 206 (when (>= i1 (length name)) (return)) 207 (let ((i2 (or (position #\Space name :start i1) (length name)))) 208 (let* ((word (subseq name i1 i2)) 209 (len (length word))) 210 (push (gethash word (word-list-hashed (aref words-by-length len))) 211 indices 212 ) 213 ) 214 (setq i1 (1+ i2)) 215 ) ) ) 216 (setf (unicode-char-word-indices uc) 217 (coerce (nreverse indices) 'vector) 218 ) 219 ) ) 220 ;; Sort the list of unicode-chars by word-indices. 221 (setq all-chars 222 (sort all-chars 223 (lambda (vec1 vec2) 224 (let ((len1 (length vec1)) 225 (len2 (length vec2))) 226 (do ((i 0 (1+ i))) 227 (nil) 228 (if (< i len2) 229 (if (< i len1) 230 (cond ((< (aref vec1 i) (aref vec2 i)) (return t)) 231 ((> (aref vec1 i) (aref vec2 i)) (return nil)) 232 ) 233 (return t) 234 ) 235 (return nil) 236 ) ) ) ) 237 :key #'unicode-char-word-indices 238 ) ) 239 ;; Output the word-indices. 240 (format ostream "static const uint16_t unicode_names[~D] = {~%" 241 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 242 ) 243 (let ((i 0)) 244 (dolist (uc all-chars) 245 (format ostream " ~{ ~D,~}" 246 (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) 247 (coerce (unicode-char-word-indices uc) 'list) 248 ) 249 ) 250 (when add-comments 251 (format ostream "~40T/* ~A */" (unicode-char-name uc)) 252 ) 253 (format ostream "~%") 254 (setf (unicode-char-word-indices-index uc) i) 255 (incf i (length (unicode-char-word-indices uc))) 256 ) ) 257 (format ostream "};~%") 258 (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") 259 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") 260 (format ostream "unicode_name_to_code[~D] = {~%" 261 (length all-chars) 262 ) 263 (dolist (uc all-chars) 264 (format ostream " { 0x~4,'0X, ~D }," 265 (unicode-char-code uc) 266 (unicode-char-word-indices-index uc) 267 ) 268 (when add-comments 269 (format ostream "~21T/* ~A */" (unicode-char-name uc)) 270 ) 271 (format ostream "~%") 272 ) 273 (format ostream "};~%") 274 (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") 275 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") 276 (format ostream "unicode_code_to_name[~D] = {~%" 277 (length all-chars) 278 ) 279 (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code)) 280 (format ostream " { 0x~4,'0X, ~D }," 281 (unicode-char-code uc) 282 (unicode-char-word-indices-index uc) 283 ) 284 (when add-comments 285 (format ostream "~21T/* ~A */" (unicode-char-name uc)) 286 ) 287 (format ostream "~%") 288 ) 289 (format ostream "};~%") 290 (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" 291 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars)) 292 ) 293 (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" 294 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 295 ) 296 ) 297) ) ) 298 299(main (first *args*) (second *args*)) 300