1;;; china-util.el --- utilities for Chinese  -*- coding: iso-2022-7bit -*-
2
3;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007
4;;   Free Software Foundation, Inc.
5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6;;   2005, 2006, 2007
7;;   National Institute of Advanced Industrial Science and Technology (AIST)
8;;   Registration Number H14PRO021
9
10;; Keywords: mule, multilingual, Chinese
11
12;; This file is part of GNU Emacs.
13
14;; GNU Emacs is free software; you can redistribute it and/or modify
15;; it under the terms of the GNU General Public License as published by
16;; the Free Software Foundation; either version 2, or (at your option)
17;; any later version.
18
19;; GNU Emacs is distributed in the hope that it will be useful,
20;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22;; GNU General Public License for more details.
23
24;; You should have received a copy of the GNU General Public License
25;; along with GNU Emacs; see the file COPYING.  If not, write to the
26;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27;; Boston, MA 02110-1301, USA.
28
29;;; Commentary:
30
31;;; Code:
32
33;; Hz/ZW/EUC-TW encoding stuff
34
35;; HZ is an encoding method for Chinese character set GB2312 used
36;; widely in Internet.  It is very similar to 7-bit environment of
37;; ISO-2022.  The difference is that HZ uses the sequence "~{" and
38;; "~}" for designating GB2312 and ASCII respectively, hence, it
39;; doesn't uses ESC (0x1B) code.
40
41;; ZW is another encoding method for Chinese character set GB2312.  It
42;; encodes Chinese characters line by line by starting each line with
43;; the sequence "zW".  It also uses only 7-bit as HZ.
44
45;; EUC-TW is similar to EUC-KS or EUC-JP.  Its main character set is
46;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
47;; a single shift escape followed by three bytes: the first gives the
48;; plane, the second and third the character code.  Note that characters
49;; of plane 1 are (redundantly) accessible with a single shift escape
50;; also.
51
52;; ISO-2022 escape sequence to designate GB2312.
53(defvar iso2022-gb-designation "\e$A")
54;; HZ escape sequence to designate GB2312.
55(defvar hz-gb-designnation "~{")
56;; ISO-2022 escape sequence to designate ASCII.
57(defvar iso2022-ascii-designation "\e(B")
58;; HZ escape sequence to designate ASCII.
59(defvar hz-ascii-designnation "~}")
60;; Regexp of ZW sequence to start GB2312.
61(defvar zw-start-gb "^zW")
62;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
63(defvar hz/zw-start-gb
64  (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
65
66(defvar decode-hz-line-continuation nil
67  "Flag to tell if we should care line continuation convention of Hz.")
68
69(defconst hz-set-msb-table
70  (eval-when-compile
71    (let ((chars nil)
72	  (i 0))
73      (while (< i 33)
74	(push i chars)
75	(setq i (1+ i)))
76      (while (< i 127)
77	(push (+ i 128) chars)
78	(setq i (1+ i)))
79      (apply 'string (nreverse chars)))))
80
81;;;###autoload
82(defun decode-hz-region (beg end)
83  "Decode HZ/ZW encoded text in the current region.
84Return the length of resulting text."
85  (interactive "r")
86  (save-excursion
87    (save-restriction
88      (let (pos ch)
89	(narrow-to-region beg end)
90
91	;; We, at first, convert HZ/ZW to `euc-china',
92	;; then decode it.
93
94	;; "~\n" -> "\n", "~~" -> "~"
95	(goto-char (point-min))
96	(while (search-forward "~" nil t)
97	  (setq ch (following-char))
98	  (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
99
100	;; "^zW...\n" -> Chinese GB2312
101	;; "~{...~}"  -> Chinese GB2312
102	(goto-char (point-min))
103	(setq beg nil)
104	(while (re-search-forward hz/zw-start-gb nil t)
105	  (setq pos (match-beginning 0)
106		ch (char-after pos))
107	  ;; Record the first position to start conversion.
108	  (or beg (setq beg pos))
109	  (end-of-line)
110	  (setq end (point))
111	  (if (>= ch 128)		; 8bit GB2312
112	      nil
113	    (goto-char pos)
114	    (delete-char 2)
115	    (setq end (- end 2))
116	    (if (= ch ?z)			; ZW -> euc-china
117		(progn
118		  (translate-region (point) end hz-set-msb-table)
119		  (goto-char end))
120	      (if (search-forward hz-ascii-designnation
121				  (if decode-hz-line-continuation nil end)
122				  t)
123		  (delete-char -2))
124	      (setq end (point))
125	      (translate-region pos (point) hz-set-msb-table))))
126	(if beg
127	    (decode-coding-region beg end 'euc-china)))
128      (- (point-max) (point-min)))))
129
130;;;###autoload
131(defun decode-hz-buffer ()
132  "Decode HZ/ZW encoded text in the current buffer."
133  (interactive)
134  (decode-hz-region (point-min) (point-max)))
135
136;;;###autoload
137(defun encode-hz-region (beg end)
138  "Encode the text in the current region to HZ.
139Return the length of resulting text."
140  (interactive "r")
141  (save-excursion
142    (save-restriction
143      (narrow-to-region beg end)
144
145      ;; "~" -> "~~"
146      (goto-char (point-min))
147      (while (search-forward "~" nil t)	(insert ?~))
148
149      ;; Chinese GB2312 -> "~{...~}"
150      (goto-char (point-min))
151      (if (re-search-forward "\\cc" nil t)
152	  (let (pos)
153	    (goto-char (setq pos (match-beginning 0)))
154	    (encode-coding-region pos (point-max) 'iso-2022-7bit)
155	    (goto-char pos)
156	    (while (search-forward iso2022-gb-designation nil t)
157	      (delete-char -3)
158	      (insert hz-gb-designnation))
159	    (goto-char pos)
160	    (while (search-forward iso2022-ascii-designation nil t)
161	      (delete-char -3)
162	      (insert hz-ascii-designnation))))
163      (- (point-max) (point-min)))))
164
165;;;###autoload
166(defun encode-hz-buffer ()
167  "Encode the text in the current buffer to HZ."
168  (interactive)
169  (encode-hz-region (point-min) (point-max)))
170
171;; The following sets up a translation table (big5-to-cns) from Big 5
172;; to CNS encoding, using some auxiliary functions to make the code
173;; more readable.
174
175;; Many kudos to Himi!  The used code has been adapted from his
176;; mule-ucs package.
177
178(eval-when-compile
179(defun big5-to-flat-code (num)
180  "Convert NUM in Big 5 encoding to a `flat code'.
1810xA140 will be mapped to position 0, 0xA141 to position 1, etc.
182There are no gaps in the flat code."
183
184  (let ((hi (/ num 256))
185        (lo (% num 256)))
186    (+ (* 157 (- hi #xa1))
187       (- lo (if (>= lo #xa1) 98 64)))))
188
189(defun flat-code-to-big5 (num)
190  "Convert NUM from a `flat code' to Big 5 encoding.
191This is the inverse function of `big5-to-flat-code'."
192
193  (let ((hi (/ num 157))
194        (lo (% num 157)))
195    (+ (* 256 (+ hi #xa1))
196       (+ lo (if (< lo 63) 64 98)))))
197
198(defun euc-to-flat-code (num)
199  "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
2000x2121 will be mapped to position 0, 0x2122 to position 1, etc.
201There are no gaps in the flat code."
202
203  (let ((hi (/ num 256))
204        (lo (% num 256)))
205    (+ (* 94 (- hi #x21))
206       (- lo #x21))))
207
208(defun flat-code-to-euc (num)
209  "Convert NUM from a `flat code' to EUC encoding (in GL representation).
210The inverse function of `euc-to-flat-code'.  The high and low bytes are
211returned in a list."
212
213  (let ((hi (/ num 94))
214        (lo (% num 94)))
215    (list (+ hi #x21) (+ lo #x21))))
216
217(defun expand-euc-big5-alist (alist)
218  "Create a translation table and fills it with data given in ALIST.
219Elements of ALIST can be either given as
220
221  ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
222
223or as
224
225  (euc-character . big5-charcode)
226
227The former maps a range of glyphs in an EUC charset (where STARTCHAR
228is in GL representation) to a certain range of Big 5 encoded
229characters, the latter maps a single glyph.  Glyphs which can't be
230mapped will be represented with the byte 0xFF.
231
232The return value is the filled translation table."
233
234  (let ((chartable (make-char-table 'translation-table #xFF))
235        char
236        big5
237        i
238        end
239        codepoint
240        charset)
241    (dolist (elem alist)
242      (setq char (car elem)
243            big5 (cdr elem))
244      (cond ((and (consp char)
245                  (consp big5))
246	     (setq i (big5-to-flat-code (car big5))
247		   end (big5-to-flat-code (cdr big5))
248		   codepoint (euc-to-flat-code (cdr char))
249		   charset (car char))
250	     (while (>= end i)
251	       (aset chartable
252		     (decode-big5-char (flat-code-to-big5 i))
253		     (apply (function make-char)
254			    charset
255			    (flat-code-to-euc codepoint)))
256	       (setq i (1+ i)
257		     codepoint (1+ codepoint))))
258            ((and (char-valid-p char)
259                  (numberp big5))
260	     (setq i (decode-big5-char big5))
261	     (aset chartable i char))
262            (t
263             (error "Unknown slot type: %S" elem))))
264    ;; the return value
265    chartable)))
266
267;; All non-CNS encodings are commented out.
268
269(define-translation-table 'big5-to-cns
270  (eval-when-compile
271  (expand-euc-big5-alist
272   '(
273     ;; Symbols
274     ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
275     (?$(G"X(B . #xA1F6)
276     (?$(G"W(B . #xA1F7)
277     ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
278     ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
279     ;; Control codes (vendor dependent)
280     ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
281     ;; Level 1 Ideographs
282     ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
283     (?$(GWS(B . #xACFE)
284     ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
285     ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
286     ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
287     (?$(GkP(B . #xBE52)
288     ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
289     ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
290     (?$(Gu5(B . #xC2CB)
291     ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
292     ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
293     (?$(Gxe(B . #xC3B9)
294     (?$(Gxd(B . #xC3BA)
295     ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
296     (?$(Gx-(B . #xC456)
297     ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
298     ;; Symbols
299     ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
300     ;; Radicals
301     (?$(G'#(B . #xC6BF)
302     (?$(G'$(B . #xC6C0)
303     (?$(G'&(B . #xC6C1)
304     (?$(G'((B . #xC6C2)
305     (?$(G'-(B . #xC6C3)
306     (?$(G'.(B . #xC6C4)
307     (?$(G'/(B . #xC6C5)
308     (?$(G'4(B . #xC6C6)
309     (?$(G'7(B . #xC6C7)
310     (?$(G':(B . #xC6C8)
311     (?$(G'<(B . #xC6C9)
312     (?$(G'B(B . #xC6CA)
313     (?$(G'G(B . #xC6CB)
314     (?$(G'N(B . #xC6CC)
315     (?$(G'S(B . #xC6CD)
316     (?$(G'T(B . #xC6CE)
317     (?$(G'U(B . #xC6CF)
318     (?$(G'Y(B . #xC6D0)
319     (?$(G'Z(B . #xC6D1)
320     (?$(G'a(B . #xC6D2)
321     (?$(G'f(B . #xC6D3)
322     (?$(G()(B . #xC6D4)
323     (?$(G(*(B . #xC6D5)
324     (?$(G(c(B . #xC6D6)
325     (?$(G(l(B . #xC6D7)
326     ;; Diacritical Marks
327     ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
328     ;; Japanese Kana Supplement
329     ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
330     ;; Japanese Hiragana
331     ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
332     ;; Japanese Katakana
333     ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
334     ;; Cyrillic Characters
335     ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
336     ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
337     ;; Special Chinese Characters
338     (?$(J!#(B . #xC879)
339     (?$(J!$(B . #xC87B)
340     (?$(J!*(B . #xC87D)
341     (?$(J!R(B . #xC8A2)
342
343     ;; JIS X 0208 NOT SIGN (cf. U+00AC)
344     ; (?$B"L(B . #xC8CD)
345     ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
346     ; (?$(D"C(B . #xC8CE)
347
348     ;; GB 2312 characters
349     ; (?$A!d(B . #xC8CF)
350     ; (?$A!e(B . #xC8D0)
351        ;;;;; C8D1 - Japanese `($B3t(B)'
352     ; (?$A!m(B . #xC8D2)
353        ;;;;; C8D2 - Tel.
354
355     ;; Level 2 Ideographs
356     ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
357     (?$(GDB(B . #xC94A);; a duplicate of #xA461
358     ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
359     ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
360     (?$(H!L(B . #xC9BE)
361     ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
362     ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
363     (?$(H"M(B . #xCAF7)
364     ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
365     (?$(H>c(B . #xD6CC)
366     ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
367     (?$(H?j(B . #xD77A)
368     ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
369     (?$(H7o(B . #xDADF)
370     ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
371     ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
372     (?$(HAv(B . #xDDFC);; a duplicate of #xDCD1
373     ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
374     ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
375     ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
376     ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
377     (?$(HUK(B . #xEBF1)
378     ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
379     (?$(HW"(B . #xECDE)
380     ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
381     ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
382     (?$(Hd/(B . #xEEEB)
383     ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
384     (?$(H]t(B . #xF056)
385     ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
386     (?$(HZ((B . #xF0CB)
387     ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
388     ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
389     (?$(Hga(B . #xF16B)
390     ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
391     (?$(Hi4(B . #xF268)
392     ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
393     ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
394     ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
395     ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
396     (?$(HfM(B . #xF4B5)
397     ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
398     ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
399     (?$(HjK(B . #xF663)
400     ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
401     ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
402     (?$(Hqf(B . #xF9C4)
403     (?$(Hr4(B . #xF9C5)
404     (?$(Hr@(B . #xF9C6)
405     ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
406     ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
407
408     ;; Additional Ideographs
409     (?$(IC7(B . #xF9D6)
410     (?$(IOP(B . #xF9D7)
411     (?$(IDN(B . #xF9D8)
412     (?$(IPJ(B . #xF9D9)
413     (?$(I,](B . #xF9DA)
414     (?$(I=~(B . #xF9DB)
415     (?$(IK\(B . #xF9DC)
416    )
417  ))
418)
419
420;;
421(provide 'china-util)
422
423;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
424;;; china-util.el ends here
425