• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/gettext-0.17/gettext-tools/gnulib-lib/uniname/
1#!/usr/local/bin/clisp -C
2
3;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
4;;; Bruno Haible 2000-12-28
5
6(defparameter add-comments nil)
7
8(defstruct unicode-char
9  (code nil :type integer)
10  (name nil :type string)
11  word-indices
12  word-indices-index
13)
14
15(defstruct word-list
16  (hashed nil :type hash-table)
17  (sorted nil :type list)
18  size                          ; number of characters total
19  length                        ; number of words
20)
21
22(defun main (inputfile outputfile)
23  (declare (type string inputfile outputfile))
24  #+UNICODE (setq *default-file-encoding* charset:utf-8)
25  (let ((all-chars '()))
26    ;; Read all characters and names from the input file.
27    (with-open-file (istream inputfile :direction :input)
28      (loop
29        (let ((line (read-line istream nil nil)))
30          (unless line (return))
31          (let* ((i1 (position #\; line))
32                 (i2 (position #\; line :start (1+ i1)))
33                 (code-string (subseq line 0 i1))
34                 (code (parse-integer code-string :radix 16))
35                 (name-string (subseq line (1+ i1) i2)))
36            ; Ignore characters whose name starts with "<".
37            (unless (eql (char name-string 0) #\<)
38              ; Also ignore Hangul syllables; they are treated specially.
39              (unless (<= #xAC00 code #xD7A3)
40                ; Also ignore CJK compatibility ideographs; they are treated
41                ; specially as well.
42                (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
43                            (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
44                  ; Transform the code so that it fits in 16 bits. In
45                  ; Unicode 3.1 the following ranges are used.
46                  ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
47                  ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x5
48                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
49                  ;   0x10300..0x104FF  >>12=  0x10        -> 0x7
50                  ;   0x12000..0x12473  >>12=  0x12        -> 0x8
51                  ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x9
52                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0xA
53                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xB
54                  (flet ((transform (x)
55                           (dpb
56                             (case (ash x -12)
57                               ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
58                               (#x0A 5)
59                               (#x0F 6)
60                               (#x10 7)
61                               (#x12 8)
62                               (#x1D 9)
63                               (#x2F #xA)
64                               (#xE0 #xB)
65                               (t (error "Update the transform function for 0x~5,'0X" x))
66                             )
67                             (byte 8 12)
68                             x
69                        )) )
70                    (push (make-unicode-char :code (transform code)
71                                             :name name-string)
72                          all-chars
73            ) ) ) ) )
74    ) ) ) )
75    (setq all-chars (nreverse all-chars))
76    ;; Split into words.
77    (let ((words-by-length (make-array 0 :adjustable t)))
78      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
79        (let ((i1 0))
80          (loop
81            (when (>= i1 (length name)) (return))
82            (let ((i2 (or (position #\Space name :start i1) (length name))))
83              (let* ((word (subseq name i1 i2))
84                     (len (length word)))
85                (when (>= len (length words-by-length))
86                  (adjust-array words-by-length (1+ len))
87                )
88                (unless (aref words-by-length len)
89                  (setf (aref words-by-length len)
90                        (make-word-list
91                          :hashed (make-hash-table :test #'equal)
92                          :sorted '()
93                ) )     )
94                (let ((word-list (aref words-by-length len)))
95                  (unless (gethash word (word-list-hashed word-list))
96                    (setf (gethash word (word-list-hashed word-list)) t)
97                    (push word (word-list-sorted word-list))
98                ) )
99              )
100              (setq i1 (1+ i2))
101      ) ) ) )
102      ;; Sort the word lists.
103      (dotimes (len (length words-by-length))
104        (unless (aref words-by-length len)
105          (setf (aref words-by-length len)
106                (make-word-list
107                  :hashed (make-hash-table :test #'equal)
108                  :sorted '()
109        ) )     )
110        (let ((word-list (aref words-by-length len)))
111          (setf (word-list-sorted word-list)
112                (sort (word-list-sorted word-list) #'string<)
113          )
114          (setf (word-list-size word-list)
115                (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
116          )
117          (setf (word-list-length word-list)
118                (length (word-list-sorted word-list))
119      ) ) )
120      ;; Output the tables.
121      (with-open-file (ostream outputfile :direction :output
122                       #+UNICODE :external-format #+UNICODE charset:ascii)
123        (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
124        (format ostream "/*~%")
125        (format ostream " * ~A~%" (file-namestring outputfile))
126        (format ostream " *~%")
127        (format ostream " * Unicode character name table.~%")
128        (format ostream " * Generated automatically by the gen-uninames utility.~%")
129        (format ostream " */~%")
130        (format ostream "~%")
131        (format ostream "static const char unicode_name_words[~D] = {~%"
132                        (let ((sum 0))
133                          (dotimes (len (length words-by-length))
134                            (let ((word-list (aref words-by-length len)))
135                              (incf sum (word-list-size word-list))
136                          ) )
137                          sum
138        )               )
139        (dotimes (len (length words-by-length))
140          (let ((word-list (aref words-by-length len)))
141            (dolist (word (word-list-sorted word-list))
142              (format ostream " ~{ '~C',~}~%" (coerce word 'list))
143        ) ) )
144        (format ostream "};~%")
145        (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
146                        (let ((sum 0))
147                          (dotimes (len (length words-by-length))
148                            (let ((word-list (aref words-by-length len)))
149                              (incf sum (word-list-length word-list))
150                          ) )
151                          sum
152        )               )
153        #| ; Redundant data
154        (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
155                        (let ((sum 0))
156                          (dotimes (len (length words-by-length))
157                            (let ((word-list (aref words-by-length len)))
158                              (incf sum (word-list-length word-list))
159                          ) )
160                          sum
161        )               )
162        (dotimes (len (length words-by-length))
163          (let ((word-list (aref words-by-length len)))
164            (when (word-list-sorted word-list)
165              (format ostream " ")
166              (do ((l (word-list-sorted word-list) (cdr l))
167                   (offset 0 (+ offset (length (car l)))))
168                  ((endp l))
169                (format ostream "~<~% ~0,79:; ~D,~>" offset)
170              )
171              (format ostream "~%")
172        ) ) )
173        (format ostream "};~%")
174        |#
175        (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
176                        (1+ (length words-by-length))
177        )
178        (let ((extra-offset 0)
179              (ind-offset 0))
180          (dotimes (len (length words-by-length))
181            (let ((word-list (aref words-by-length len)))
182              (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
183              (incf extra-offset (word-list-size word-list))
184              (incf ind-offset (word-list-length word-list))
185          ) )
186          (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
187        )
188        (format ostream "};~%")
189        (let ((ind-offset 0))
190          (dotimes (len (length words-by-length))
191            (let ((word-list (aref words-by-length len)))
192              (dolist (word (word-list-sorted word-list))
193                (setf (gethash word (word-list-hashed word-list)) ind-offset)
194                (incf ind-offset)
195        ) ) ) )
196        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
197          (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
198                          (gethash word (word-list-hashed (aref words-by-length (length word))))
199        ) )
200        ;; Compute the word-indices for every unicode-char.
201        (dolist (uc all-chars)
202          (let ((name (unicode-char-name uc))
203                (indices '()))
204            (let ((i1 0))
205              (loop
206                (when (>= i1 (length name)) (return))
207                (let ((i2 (or (position #\Space name :start i1) (length name))))
208                  (let* ((word (subseq name i1 i2))
209                         (len (length word)))
210                    (push (gethash word (word-list-hashed (aref words-by-length len)))
211                          indices
212                    )
213                  )
214                  (setq i1 (1+ i2))
215            ) ) )
216            (setf (unicode-char-word-indices uc)
217                  (coerce (nreverse indices) 'vector)
218            )
219        ) )
220        ;; Sort the list of unicode-chars by word-indices.
221        (setq all-chars
222              (sort all-chars
223                    (lambda (vec1 vec2)
224                      (let ((len1 (length vec1))
225                            (len2 (length vec2)))
226                        (do ((i 0 (1+ i)))
227                            (nil)
228                          (if (< i len2)
229                            (if (< i len1)
230                              (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
231                                    ((> (aref vec1 i) (aref vec2 i)) (return nil))
232                              )
233                              (return t)
234                            )
235                            (return nil)
236                    ) ) ) )
237                    :key #'unicode-char-word-indices
238        )     )
239        ;; Output the word-indices.
240        (format ostream "static const uint16_t unicode_names[~D] = {~%"
241                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
242        )
243        (let ((i 0))
244          (dolist (uc all-chars)
245            (format ostream " ~{ ~D,~}"
246                            (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
247                                     (coerce (unicode-char-word-indices uc) 'list)
248                            )
249            )
250            (when add-comments
251              (format ostream "~40T/* ~A */" (unicode-char-name uc))
252            )
253            (format ostream "~%")
254            (setf (unicode-char-word-indices-index uc) i)
255            (incf i (length (unicode-char-word-indices uc)))
256        ) )
257        (format ostream "};~%")
258        (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
259        (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
260        (format ostream "unicode_name_to_code[~D] = {~%"
261                        (length all-chars)
262        )
263        (dolist (uc all-chars)
264          (format ostream "  { 0x~4,'0X, ~D },"
265                          (unicode-char-code uc)
266                          (unicode-char-word-indices-index uc)
267          )
268          (when add-comments
269            (format ostream "~21T/* ~A */" (unicode-char-name uc))
270          )
271          (format ostream "~%")
272        )
273        (format ostream "};~%")
274        (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
275        (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
276        (format ostream "unicode_code_to_name[~D] = {~%"
277                        (length all-chars)
278        )
279        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
280          (format ostream "  { 0x~4,'0X, ~D },"
281                          (unicode-char-code uc)
282                          (unicode-char-word-indices-index uc)
283          )
284          (when add-comments
285            (format ostream "~21T/* ~A */" (unicode-char-name uc))
286          )
287          (format ostream "~%")
288        )
289        (format ostream "};~%")
290        (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
291                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
292        )
293        (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
294                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
295        )
296      )
297) ) )
298
299(main (first *args*) (second *args*))
300