1;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*- 2 3;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007 4;; Free Software Foundation, Inc. 5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 6;; 2005, 2006, 2007 7;; National Institute of Advanced Industrial Science and Technology (AIST) 8;; Registration Number H14PRO021 9 10;; Keywords: mule, multilingual, Chinese 11 12;; This file is part of GNU Emacs. 13 14;; GNU Emacs is free software; you can redistribute it and/or modify 15;; it under the terms of the GNU General Public License as published by 16;; the Free Software Foundation; either version 2, or (at your option) 17;; any later version. 18 19;; GNU Emacs is distributed in the hope that it will be useful, 20;; but WITHOUT ANY WARRANTY; without even the implied warranty of 21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22;; GNU General Public License for more details. 23 24;; You should have received a copy of the GNU General Public License 25;; along with GNU Emacs; see the file COPYING. If not, write to the 26;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 27;; Boston, MA 02110-1301, USA. 28 29;;; Commentary: 30 31;;; Code: 32 33;; Hz/ZW/EUC-TW encoding stuff 34 35;; HZ is an encoding method for Chinese character set GB2312 used 36;; widely in Internet. It is very similar to 7-bit environment of 37;; ISO-2022. The difference is that HZ uses the sequence "~{" and 38;; "~}" for designating GB2312 and ASCII respectively, hence, it 39;; doesn't uses ESC (0x1B) code. 40 41;; ZW is another encoding method for Chinese character set GB2312. It 42;; encodes Chinese characters line by line by starting each line with 43;; the sequence "zW". It also uses only 7-bit as HZ. 44 45;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is 46;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with 47;; a single shift escape followed by three bytes: the first gives the 48;; plane, the second and third the character code. Note that characters 49;; of plane 1 are (redundantly) accessible with a single shift escape 50;; also. 51 52;; ISO-2022 escape sequence to designate GB2312. 53(defvar iso2022-gb-designation "\e$A") 54;; HZ escape sequence to designate GB2312. 55(defvar hz-gb-designnation "~{") 56;; ISO-2022 escape sequence to designate ASCII. 57(defvar iso2022-ascii-designation "\e(B") 58;; HZ escape sequence to designate ASCII. 59(defvar hz-ascii-designnation "~}") 60;; Regexp of ZW sequence to start GB2312. 61(defvar zw-start-gb "^zW") 62;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW. 63(defvar hz/zw-start-gb 64 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]")) 65 66(defvar decode-hz-line-continuation nil 67 "Flag to tell if we should care line continuation convention of Hz.") 68 69(defconst hz-set-msb-table 70 (eval-when-compile 71 (let ((chars nil) 72 (i 0)) 73 (while (< i 33) 74 (push i chars) 75 (setq i (1+ i))) 76 (while (< i 127) 77 (push (+ i 128) chars) 78 (setq i (1+ i))) 79 (apply 'string (nreverse chars))))) 80 81;;;###autoload 82(defun decode-hz-region (beg end) 83 "Decode HZ/ZW encoded text in the current region. 84Return the length of resulting text." 85 (interactive "r") 86 (save-excursion 87 (save-restriction 88 (let (pos ch) 89 (narrow-to-region beg end) 90 91 ;; We, at first, convert HZ/ZW to `euc-china', 92 ;; then decode it. 93 94 ;; "~\n" -> "\n", "~~" -> "~" 95 (goto-char (point-min)) 96 (while (search-forward "~" nil t) 97 (setq ch (following-char)) 98 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1))) 99 100 ;; "^zW...\n" -> Chinese GB2312 101 ;; "~{...~}" -> Chinese GB2312 102 (goto-char (point-min)) 103 (setq beg nil) 104 (while (re-search-forward hz/zw-start-gb nil t) 105 (setq pos (match-beginning 0) 106 ch (char-after pos)) 107 ;; Record the first position to start conversion. 108 (or beg (setq beg pos)) 109 (end-of-line) 110 (setq end (point)) 111 (if (>= ch 128) ; 8bit GB2312 112 nil 113 (goto-char pos) 114 (delete-char 2) 115 (setq end (- end 2)) 116 (if (= ch ?z) ; ZW -> euc-china 117 (progn 118 (translate-region (point) end hz-set-msb-table) 119 (goto-char end)) 120 (if (search-forward hz-ascii-designnation 121 (if decode-hz-line-continuation nil end) 122 t) 123 (delete-char -2)) 124 (setq end (point)) 125 (translate-region pos (point) hz-set-msb-table)))) 126 (if beg 127 (decode-coding-region beg end 'euc-china))) 128 (- (point-max) (point-min))))) 129 130;;;###autoload 131(defun decode-hz-buffer () 132 "Decode HZ/ZW encoded text in the current buffer." 133 (interactive) 134 (decode-hz-region (point-min) (point-max))) 135 136;;;###autoload 137(defun encode-hz-region (beg end) 138 "Encode the text in the current region to HZ. 139Return the length of resulting text." 140 (interactive "r") 141 (save-excursion 142 (save-restriction 143 (narrow-to-region beg end) 144 145 ;; "~" -> "~~" 146 (goto-char (point-min)) 147 (while (search-forward "~" nil t) (insert ?~)) 148 149 ;; Chinese GB2312 -> "~{...~}" 150 (goto-char (point-min)) 151 (if (re-search-forward "\\cc" nil t) 152 (let (pos) 153 (goto-char (setq pos (match-beginning 0))) 154 (encode-coding-region pos (point-max) 'iso-2022-7bit) 155 (goto-char pos) 156 (while (search-forward iso2022-gb-designation nil t) 157 (delete-char -3) 158 (insert hz-gb-designnation)) 159 (goto-char pos) 160 (while (search-forward iso2022-ascii-designation nil t) 161 (delete-char -3) 162 (insert hz-ascii-designnation)))) 163 (- (point-max) (point-min))))) 164 165;;;###autoload 166(defun encode-hz-buffer () 167 "Encode the text in the current buffer to HZ." 168 (interactive) 169 (encode-hz-region (point-min) (point-max))) 170 171;; The following sets up a translation table (big5-to-cns) from Big 5 172;; to CNS encoding, using some auxiliary functions to make the code 173;; more readable. 174 175;; Many kudos to Himi! The used code has been adapted from his 176;; mule-ucs package. 177 178(eval-when-compile 179(defun big5-to-flat-code (num) 180 "Convert NUM in Big 5 encoding to a `flat code'. 1810xA140 will be mapped to position 0, 0xA141 to position 1, etc. 182There are no gaps in the flat code." 183 184 (let ((hi (/ num 256)) 185 (lo (% num 256))) 186 (+ (* 157 (- hi #xa1)) 187 (- lo (if (>= lo #xa1) 98 64))))) 188 189(defun flat-code-to-big5 (num) 190 "Convert NUM from a `flat code' to Big 5 encoding. 191This is the inverse function of `big5-to-flat-code'." 192 193 (let ((hi (/ num 157)) 194 (lo (% num 157))) 195 (+ (* 256 (+ hi #xa1)) 196 (+ lo (if (< lo 63) 64 98))))) 197 198(defun euc-to-flat-code (num) 199 "Convert NUM in EUC encoding (in GL representation) to a `flat code'. 2000x2121 will be mapped to position 0, 0x2122 to position 1, etc. 201There are no gaps in the flat code." 202 203 (let ((hi (/ num 256)) 204 (lo (% num 256))) 205 (+ (* 94 (- hi #x21)) 206 (- lo #x21)))) 207 208(defun flat-code-to-euc (num) 209 "Convert NUM from a `flat code' to EUC encoding (in GL representation). 210The inverse function of `euc-to-flat-code'. The high and low bytes are 211returned in a list." 212 213 (let ((hi (/ num 94)) 214 (lo (% num 94))) 215 (list (+ hi #x21) (+ lo #x21)))) 216 217(defun expand-euc-big5-alist (alist) 218 "Create a translation table and fills it with data given in ALIST. 219Elements of ALIST can be either given as 220 221 ((euc-charset . startchar) . (big5-range-begin . big5-range-end)) 222 223or as 224 225 (euc-character . big5-charcode) 226 227The former maps a range of glyphs in an EUC charset (where STARTCHAR 228is in GL representation) to a certain range of Big 5 encoded 229characters, the latter maps a single glyph. Glyphs which can't be 230mapped will be represented with the byte 0xFF. 231 232The return value is the filled translation table." 233 234 (let ((chartable (make-char-table 'translation-table #xFF)) 235 char 236 big5 237 i 238 end 239 codepoint 240 charset) 241 (dolist (elem alist) 242 (setq char (car elem) 243 big5 (cdr elem)) 244 (cond ((and (consp char) 245 (consp big5)) 246 (setq i (big5-to-flat-code (car big5)) 247 end (big5-to-flat-code (cdr big5)) 248 codepoint (euc-to-flat-code (cdr char)) 249 charset (car char)) 250 (while (>= end i) 251 (aset chartable 252 (decode-big5-char (flat-code-to-big5 i)) 253 (apply (function make-char) 254 charset 255 (flat-code-to-euc codepoint))) 256 (setq i (1+ i) 257 codepoint (1+ codepoint)))) 258 ((and (char-valid-p char) 259 (numberp big5)) 260 (setq i (decode-big5-char big5)) 261 (aset chartable i char)) 262 (t 263 (error "Unknown slot type: %S" elem)))) 264 ;; the return value 265 chartable))) 266 267;; All non-CNS encodings are commented out. 268 269(define-translation-table 'big5-to-cns 270 (eval-when-compile 271 (expand-euc-big5-alist 272 '( 273 ;; Symbols 274 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5)) 275 (?$(G"X(B . #xA1F6) 276 (?$(G"W(B . #xA1F7) 277 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE)) 278 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF)) 279 ;; Control codes (vendor dependent) 280 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0)) 281 ;; Level 1 Ideographs 282 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD)) 283 (?$(GWS(B . #xACFE) 284 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF)) 285 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7)) 286 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51)) 287 (?$(GkP(B . #xBE52) 288 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA)) 289 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA)) 290 (?$(Gu5(B . #xC2CB) 291 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360)) 292 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8)) 293 (?$(Gxe(B . #xC3B9) 294 (?$(Gxd(B . #xC3BA) 295 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455)) 296 (?$(Gx-(B . #xC456) 297 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E)) 298 ;; Symbols 299 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE)) 300 ;; Radicals 301 (?$(G'#(B . #xC6BF) 302 (?$(G'$(B . #xC6C0) 303 (?$(G'&(B . #xC6C1) 304 (?$(G'((B . #xC6C2) 305 (?$(G'-(B . #xC6C3) 306 (?$(G'.(B . #xC6C4) 307 (?$(G'/(B . #xC6C5) 308 (?$(G'4(B . #xC6C6) 309 (?$(G'7(B . #xC6C7) 310 (?$(G':(B . #xC6C8) 311 (?$(G'<(B . #xC6C9) 312 (?$(G'B(B . #xC6CA) 313 (?$(G'G(B . #xC6CB) 314 (?$(G'N(B . #xC6CC) 315 (?$(G'S(B . #xC6CD) 316 (?$(G'T(B . #xC6CE) 317 (?$(G'U(B . #xC6CF) 318 (?$(G'Y(B . #xC6D0) 319 (?$(G'Z(B . #xC6D1) 320 (?$(G'a(B . #xC6D2) 321 (?$(G'f(B . #xC6D3) 322 (?$(G()(B . #xC6D4) 323 (?$(G(*(B . #xC6D5) 324 (?$(G(c(B . #xC6D6) 325 (?$(G(l(B . #xC6D7) 326 ;; Diacritical Marks 327 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9)) 328 ;; Japanese Kana Supplement 329 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3)) 330 ;; Japanese Hiragana 331 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A)) 332 ;; Japanese Katakana 333 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2)) 334 ;; Cyrillic Characters 335 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854)) 336 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875)) 337 ;; Special Chinese Characters 338 (?$(J!#(B . #xC879) 339 (?$(J!$(B . #xC87B) 340 (?$(J!*(B . #xC87D) 341 (?$(J!R(B . #xC8A2) 342 343 ;; JIS X 0208 NOT SIGN (cf. U+00AC) 344 ; (?$B"L(B . #xC8CD) 345 ;; JIS X 0212 BROKEN BAR (cf. U+00A6) 346 ; (?$(D"C(B . #xC8CE) 347 348 ;; GB 2312 characters 349 ; (?$A!d(B . #xC8CF) 350 ; (?$A!e(B . #xC8D0) 351 ;;;;; C8D1 - Japanese `($B3t(B)' 352 ; (?$A!m(B . #xC8D2) 353 ;;;;; C8D2 - Tel. 354 355 ;; Level 2 Ideographs 356 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949)) 357 (?$(GDB(B . #xC94A);; a duplicate of #xA461 358 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B)) 359 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD)) 360 (?$(H!L(B . #xC9BE) 361 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC)) 362 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6)) 363 (?$(H"M(B . #xCAF7) 364 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB)) 365 (?$(H>c(B . #xD6CC) 366 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779)) 367 (?$(H?j(B . #xD77A) 368 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE)) 369 (?$(H7o(B . #xDADF) 370 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6)) 371 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB)) 372 (?$(HAv(B . #xDDFC);; a duplicate of #xDCD1 373 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2)) 374 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975)) 375 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A)) 376 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0)) 377 (?$(HUK(B . #xEBF1) 378 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD)) 379 (?$(HW"(B . #xECDE) 380 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9)) 381 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA)) 382 (?$(Hd/(B . #xEEEB) 383 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055)) 384 (?$(H]t(B . #xF056) 385 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA)) 386 (?$(HZ((B . #xF0CB) 387 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162)) 388 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A)) 389 (?$(Hga(B . #xF16B) 390 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267)) 391 (?$(Hi4(B . #xF268) 392 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2)) 393 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374)) 394 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465)) 395 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4)) 396 (?$(HfM(B . #xF4B5) 397 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC)) 398 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662)) 399 (?$(HjK(B . #xF663) 400 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976)) 401 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3)) 402 (?$(Hqf(B . #xF9C4) 403 (?$(Hr4(B . #xF9C5) 404 (?$(Hr@(B . #xF9C6) 405 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1)) 406 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5)) 407 408 ;; Additional Ideographs 409 (?$(IC7(B . #xF9D6) 410 (?$(IOP(B . #xF9D7) 411 (?$(IDN(B . #xF9D8) 412 (?$(IPJ(B . #xF9D9) 413 (?$(I,](B . #xF9DA) 414 (?$(I=~(B . #xF9DB) 415 (?$(IK\(B . #xF9DC) 416 ) 417 )) 418) 419 420;; 421(provide 'china-util) 422 423;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836 424;;; china-util.el ends here 425