1;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- 2 3;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 4;; Free Software Foundation, Inc. 5;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007 6;; National Institute of Advanced Industrial Science and Technology (AIST) 7;; Registration Number H14PRO021 8 9;; Author: TAKAHASHI Naoto <ntakahas@m17n.org> 10;; Maintainer: FSF 11;; Keywords: multilingual, Unicode, UTF-8, i18n 12 13;; This file is part of GNU Emacs. 14 15;; GNU Emacs is free software; you can redistribute it and/or modify 16;; it under the terms of the GNU General Public License as published by 17;; the Free Software Foundation; either version 2, or (at your option) 18;; any later version. 19 20;; GNU Emacs is distributed in the hope that it will be useful, 21;; but WITHOUT ANY WARRANTY; without even the implied warranty of 22;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23;; GNU General Public License for more details. 24 25;; You should have received a copy of the GNU General Public License 26;; along with GNU Emacs; see the file COPYING. If not, write to the 27;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 28;; Boston, MA 02110-1301, USA. 29 30;;; Commentary: 31 32;; The coding-system `mule-utf-8' basically supports encoding/decoding 33;; of the following character sets to and from UTF-8: 34;; 35;; ascii 36;; eight-bit-control 37;; latin-iso8859-1 38;; mule-unicode-0100-24ff 39;; mule-unicode-2500-33ff 40;; mule-unicode-e000-ffff 41;; 42;; On decoding, Unicode characters that do not fit into the above 43;; character sets are handled as `eight-bit-control' or 44;; `eight-bit-graphic' characters to retain the information about the 45;; original byte sequence and text properties record the corresponding 46;; unicode. 47;; 48;; Fixme: note that reading and writing invalid utf-8 may not be 49;; idempotent -- to represent the bytes to fix that needs a new charset. 50;; 51;; Characters from other character sets can be encoded with mule-utf-8 52;; by populating the translation table 53;; `utf-translation-table-for-encode'. Hash tables 54;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are 55;; used to support encoding and decoding of about a quarter of the CJK 56;; space between U+3400 and U+DFFF. 57 58;; UTF-8 is defined in RFC 3629. A sketch of the encoding is: 59 60;; scalar | utf-8 61;; value | 1st byte | 2nd byte | 3rd byte 62;; --------------------+-----------+-----------+---------- 63;; 0000 0000 0xxx xxxx | 0xxx xxxx | | 64;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | 65;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx 66 67;;; Code: 68 69(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil) 70 "Char table mapping characters to latin-iso8859-1 or mule-unicode-*. 71 72If `unify-8859-on-encoding-mode' is non-nil, this table populates the 73translation-table named `utf-translation-table-for-encode'.") 74 75(define-translation-table 'utf-translation-table-for-encode) 76 77 78;; Map Cyrillic and Greek to iso-8859 charsets, which take half the 79;; space of mule-unicode. For Latin scripts this isn't very 80;; important. Hebrew and Arabic might go here too when there's proper 81;; support for them. 82 83(defvar utf-fragmentation-table (make-char-table 'translation-table nil) 84 "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*. 85 86If `utf-fragment-on-decoding' is non-nil, this table populates the 87translation-table named `utf-translation-table-for-decode'") 88 89(defvar utf-defragmentation-table (make-char-table 'translation-table nil) 90 "Char-table for reverse mapping of `utf-fragmentation-table'. 91 92If `utf-fragment-on-decoding' is non-nil and 93`unify-8859-on-encoding-mode' is nil, this table populates the 94translation-table named `utf-translation-table-for-encode'") 95 96(define-translation-table 'utf-translation-table-for-decode) 97 98 99(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) 100 "Hash table mapping Emacs CJK character sets to Unicode code points. 101 102If `utf-translate-cjk-mode' is non-nil, this table populates the 103translation-hash-table named `utf-subst-table-for-encode'.") 104 105(define-translation-hash-table 'utf-subst-table-for-encode 106 ucs-mule-cjk-to-unicode) 107 108(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq) 109 "Hash table mapping Unicode code points to Emacs CJK character sets. 110 111If `utf-translate-cjk-mode' is non-nil, this table populates the 112translation-hash-table named `utf-subst-table-for-decode'.") 113 114(define-translation-hash-table 'utf-subst-table-for-decode 115 ucs-unicode-to-mule-cjk) 116 117(mapc 118 (lambda (pair) 119 (aset utf-fragmentation-table (car pair) (cdr pair)) 120 (aset utf-defragmentation-table (cdr pair) (car pair))) 121 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) 122 (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) 123 (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) 124 (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) 125 (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) 126 (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) 127 (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) 128 (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) 129 (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) 130 (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) 131 (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) 132 (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) 133 (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) 134 (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) 135 (?$,1'N(B . ?,F~(B) 136 137 (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) 138 (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) 139 (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) 140 (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) 141 (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) 142 (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) 143 (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) 144 (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) 145 (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) 146 (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) 147 (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) 148 (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) 149 (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) 150 (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) 151 (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) 152 (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) 153 (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) 154 (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) 155 (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) 156 157 158(defcustom utf-fragment-on-decoding nil 159 "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets. 160Setting this means that the relevant Cyrillic and Greek characters are 161decoded into the iso8859 charsets rather than into 162mule-unicode-0100-24ff. The iso8859 charsets take half as much space 163in the buffer, but using them may affect how the buffer can be re-encoded 164and may require a different input method to search for them, for instance. 165See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' 166for mechanisms to make this largely transparent. 167 168Setting this variable outside customize has no effect." 169 :set (lambda (s v) 170 (if v 171 (progn 172 (define-translation-table 'utf-translation-table-for-decode 173 utf-fragmentation-table) 174 ;; Even if unify-8859-on-encoding-mode is off, make 175 ;; mule-utf-* encode characters in 176 ;; utf-fragmentation-table. 177 (unless (eq (get 'utf-translation-table-for-encode 178 'translation-table) 179 ucs-mule-to-mule-unicode) 180 (define-translation-table 'utf-translation-table-for-encode 181 utf-defragmentation-table))) 182 (define-translation-table 'utf-translation-table-for-decode) 183 ;; When unify-8859-on-encoding-mode is off, be sure to make 184 ;; mule-utf-* disabled for characters in 185 ;; utf-fragmentation-table. 186 (unless (eq (get 'utf-translation-table-for-encode 187 'translation-table) 188 ucs-mule-to-mule-unicode) 189 (define-translation-table 'utf-translation-table-for-encode))) 190 (set-default s v)) 191 :version "22.1" 192 :type 'boolean 193 :group 'mule) 194 195 196(defconst utf-translate-cjk-charsets '(chinese-gb2312 197 chinese-big5-1 chinese-big5-2 198 japanese-jisx0208 japanese-jisx0212 199 katakana-jisx0201 200 korean-ksc5601) 201 "List of charsets supported by `utf-translate-cjk-mode'.") 202 203(defvar utf-translate-cjk-lang-env nil 204 "Language environment in which tables for `utf-translate-cjk-mode' is loaded. 205The value nil means that the tables are not yet loaded.") 206 207(defvar utf-translate-cjk-unicode-range) 208 209;; String generated from utf-translate-cjk-unicode-range. It is 210;; suitable for an argument to skip-chars-forward. 211(defvar utf-translate-cjk-unicode-range-string nil) 212 213(defun utf-translate-cjk-set-unicode-range (range) 214 (setq utf-translate-cjk-unicode-range range) 215 (setq utf-translate-cjk-unicode-range-string 216 (let ((decode-char-no-trans 217 #'(lambda (x) 218 (cond ((< x #x100) (make-char 'latin-iso8859-1 x)) 219 ((< x #x2500) 220 (setq x (- x #x100)) 221 (make-char 'mule-unicode-0100-24ff 222 (+ (/ x 96) 32) (+ (% x 96) 32))) 223 ((< x #x3400) 224 (setq x (- x #x2500)) 225 (make-char 'mule-unicode-2500-33ff 226 (+ (/ x 96) 32) (+ (% x 96) 32))) 227 (t 228 (setq x (- x #xe000)) 229 (make-char 'mule-unicode-e000-ffff 230 (+ (/ x 96) 32) (+ (% x 96) 32)))))) 231 ranges from to) 232 (dolist (elt range) 233 (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt))) 234 (if (and (>= to #x3400) (< to #xE000)) 235 (setq to #x33FF)) 236 (cond ((< from #x100) 237 (if (>= to #xE000) 238 (setq ranges (cons (cons #xE000 to) ranges) 239 to #x33FF)) 240 (if (>= to #x2500) 241 (setq ranges (cons (cons #x2500 to) ranges) 242 to #x24FF)) 243 (if (>= to #x100) 244 (setq ranges (cons (cons #x100 to) ranges) 245 to #xFF))) 246 ((< from #x2500) 247 (if (>= to #xE000) 248 (setq ranges (cons (cons #xE000 to) ranges) 249 to #x33FF)) 250 (if (>= to #x2500) 251 (setq ranges (cons (cons #x2500 to) ranges) 252 to #x24FF))) 253 ((< from #x3400) 254 (if (>= to #xE000) 255 (setq ranges (cons (cons #xE000 to) ranges) 256 to #x33FF)))) 257 (if (<= from to) 258 (setq ranges (cons (cons from to) ranges)))) 259 (mapconcat #'(lambda (x) 260 (format "%c-%c" 261 (funcall decode-char-no-trans (car x)) 262 (funcall decode-char-no-trans (cdr x)))) 263 ranges ""))) 264 ;; These forces loading and settting tables for 265 ;; utf-translate-cjk-mode. 266 (setq utf-translate-cjk-lang-env nil 267 ucs-mule-cjk-to-unicode (make-hash-table :test 'eq) 268 ucs-unicode-to-mule-cjk (make-hash-table :test 'eq))) 269 270(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3) 271 (#xff00 . #xffef)) 272 "List of Unicode code ranges supported by `utf-translate-cjk-mode'. 273Setting this variable directly does not take effect; 274use either \\[customize] or the function 275`utf-translate-cjk-set-unicode-range'." 276 :version "22.1" 277 :type '(repeat (cons integer integer)) 278 :set (lambda (symbol value) 279 (utf-translate-cjk-set-unicode-range value)) 280 :group 'mule) 281 282;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'. 283(defsubst utf-translate-cjk-substitutable-p (code-point) 284 (let ((tail utf-translate-cjk-unicode-range) 285 elt) 286 (while tail 287 (setq elt (car tail) tail (cdr tail)) 288 (if (and (>= code-point (car elt)) (<= code-point (cdr elt))) 289 (setq tail nil) 290 (setq elt nil))) 291 elt)) 292 293(defun utf-translate-cjk-load-tables () 294 "Load tables for `utf-translate-cjk-mode'." 295 ;; Fixme: Allow the use of the CJK charsets to be 296 ;; customized by reordering and possible omission. 297 (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000))) 298 (if redefined 299 ;; Redefine them with realistic initial sizes and a 300 ;; smallish rehash size to avoid wasting significant 301 ;; space after they're built. 302 (setq ucs-mule-cjk-to-unicode 303 (make-hash-table :test 'eq :size 43000 :rehash-size 1000) 304 ucs-unicode-to-mule-cjk 305 (make-hash-table :test 'eq :size 21500 :rehash-size 1000))) 306 307 ;; Load the files explicitly, to avoid having to keep 308 ;; around the large tables they contain (as well as the 309 ;; ones which get built). 310 ;; Here we bind coding-system-for-read to nil so that coding tags 311 ;; in the files are respected even if the files are not yet 312 ;; byte-compiled 313 (let ((coding-system-for-read nil) 314 ;; We must avoid clobbering this variable, in case the load 315 ;; files below use different coding systems. 316 (last-coding-system-used last-coding-system-used)) 317 (cond ((string= "Korean" current-language-environment) 318 (load "subst-jis") 319 (load "subst-big5") 320 (load "subst-gb2312") 321 (load "subst-ksc")) 322 ((string= "Chinese-BIG5" current-language-environment) 323 (load "subst-jis") 324 (load "subst-ksc") 325 (load "subst-gb2312") 326 (load "subst-big5")) 327 ((string= "Chinese-GB" current-language-environment) 328 (load "subst-jis") 329 (load "subst-ksc") 330 (load "subst-big5") 331 (load "subst-gb2312")) 332 (t 333 (load "subst-ksc") 334 (load "subst-gb2312") 335 (load "subst-big5") 336 (load "subst-jis")))) ; jis covers as much as big5, gb2312 337 338 (when redefined 339 (define-translation-hash-table 'utf-subst-table-for-decode 340 ucs-unicode-to-mule-cjk) 341 (define-translation-hash-table 'utf-subst-table-for-encode 342 ucs-mule-cjk-to-unicode) 343 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 344 'translation-table) 345 1 ucs-mule-cjk-to-unicode)) 346 347 (setq utf-translate-cjk-lang-env current-language-environment))) 348 349(defun utf-lookup-subst-table-for-decode (code-point) 350 (if (and utf-translate-cjk-mode 351 (not utf-translate-cjk-lang-env) 352 (utf-translate-cjk-substitutable-p code-point)) 353 (utf-translate-cjk-load-tables)) 354 (gethash code-point 355 (get 'utf-subst-table-for-decode 'translation-hash-table))) 356 357 358(defun utf-lookup-subst-table-for-encode (char) 359 (if (and utf-translate-cjk-mode 360 (not utf-translate-cjk-lang-env) 361 (memq (char-charset char) utf-translate-cjk-charsets)) 362 (utf-translate-cjk-load-tables)) 363 (gethash char 364 (get 'utf-subst-table-for-encode 'translation-hash-table))) 365 366(define-minor-mode utf-translate-cjk-mode 367 "Toggle whether UTF based coding systems de/encode CJK characters. 368If ARG is an integer, enable if ARG is positive and disable if 369zero or negative. This is a minor mode. 370Enabling this allows the coding systems mule-utf-8, 371mule-utf-16le and mule-utf-16be to encode characters in the charsets 372`korean-ksc5601', `chinese-gb2312', `chinese-big5-1', 373`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to 374decode the corresponding unicodes into such characters. 375 376Where the charsets overlap, the one preferred for decoding is chosen 377according to the language environment in effect when this option is 378turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for 379Chinese-Big5 and jisx for other environments. 380 381This mode is on by default. If you are not interested in CJK 382characters and want to avoid some overhead on encoding/decoding 383by the above coding systems, you can customize the user option 384`utf-translate-cjk-mode' to nil." 385 :init-value t 386 :version "22.1" 387 :type 'boolean 388 :group 'mule 389 :global t 390 (if utf-translate-cjk-mode 391 (progn 392 (define-translation-hash-table 'utf-subst-table-for-decode 393 ucs-unicode-to-mule-cjk) 394 (define-translation-hash-table 'utf-subst-table-for-encode 395 ucs-mule-cjk-to-unicode) 396 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 397 'translation-table) 398 1 ucs-mule-cjk-to-unicode)) 399 (define-translation-hash-table 'utf-subst-table-for-decode 400 (make-hash-table :test 'eq)) 401 (define-translation-hash-table 'utf-subst-table-for-encode 402 (make-hash-table :test 'eq)) 403 (set-char-table-extra-slot (get 'utf-translation-table-for-encode 404 'translation-table) 405 1 nil)) 406 407 ;; Update safe-chars of mule-utf-* coding systems. 408 (dolist (elt (coding-system-list t)) 409 (if (string-match "^mule-utf" (symbol-name elt)) 410 (let ((safe-charsets (coding-system-get elt 'safe-charsets)) 411 (safe-chars (coding-system-get elt 'safe-chars)) 412 (need-update nil)) 413 (dolist (charset utf-translate-cjk-charsets) 414 (unless (eq utf-translate-cjk-mode (memq charset safe-charsets)) 415 (setq safe-charsets 416 (if utf-translate-cjk-mode 417 (cons charset safe-charsets) 418 (delq charset safe-charsets)) 419 need-update t) 420 (aset safe-chars (make-char charset) utf-translate-cjk-mode))) 421 (when need-update 422 (coding-system-put elt 'safe-charsets safe-charsets) 423 (define-coding-system-internal elt)))))) 424 425(define-ccl-program ccl-mule-utf-untrans 426 ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or 427 ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF). Write 428 ;; eight-bit-control/graphic sequence (2 to 4 chars) representing 429 ;; UTF-8 sequence of r0. Registers r4, r5, r6 are modified. 430 ;; 431 ;; This is a subrountine because we assume that this is called very 432 ;; rarely (so we don't have to worry about the overhead of the 433 ;; call). 434 `(0 435 ((r5 = ,(charset-id 'eight-bit-control)) 436 (r6 = ,(charset-id 'eight-bit-graphic)) 437 (if (r0 < #x100) 438 ((r4 = ((r0 >> 6) | #xC0)) 439 (write-multibyte-character r6 r4)) 440 ((if (r0 < #x10000) 441 ((r4 = ((r0 >> 12) | #xE0)) 442 (write-multibyte-character r6 r4)) 443 ((r4 = ((r0 >> 18) | #xF0)) 444 (write-multibyte-character r6 r4) 445 (r4 = (((r0 >> 12) & #x3F) | #x80)) 446 (if (r4 < #xA0) 447 (write-multibyte-character r5 r4) 448 (write-multibyte-character r6 r4)))) 449 (r4 = (((r0 >> 6) & #x3F) | #x80)) 450 (if (r4 < #xA0) 451 (write-multibyte-character r5 r4) 452 (write-multibyte-character r6 r4)))) 453 (r4 = ((r0 & #x3F) | #x80)) 454 (if (r4 < #xA0) 455 (write-multibyte-character r5 r4) 456 (write-multibyte-character r6 r4))))) 457 458(define-ccl-program ccl-decode-mule-utf-8 459 ;; 460 ;; charset | bytes in utf-8 | bytes in emacs 461 ;; -----------------------+----------------+--------------- 462 ;; ascii | 1 | 1 463 ;; -----------------------+----------------+--------------- 464 ;; eight-bit-control | 2 | 2 465 ;; eight-bit-graphic | 2 | 1 466 ;; latin-iso8859-1 | 2 | 2 467 ;; -----------------------+----------------+--------------- 468 ;; mule-unicode-0100-24ff | 2 | 4 469 ;; (< 0800) | | 470 ;; -----------------------+----------------+--------------- 471 ;; mule-unicode-0100-24ff | 3 | 4 472 ;; (>= 8000) | | 473 ;; mule-unicode-2500-33ff | 3 | 4 474 ;; mule-unicode-e000-ffff | 3 | 4 475 ;; -----------------------+----------------+--------------- 476 ;; invalid byte | 1 | 2 477 ;; 478 ;; Thus magnification factor is two. 479 ;; 480 `(2 481 ((r6 = ,(charset-id 'latin-iso8859-1)) 482 (read r0) 483 (loop 484 (if (r0 < #x80) 485 ;; 1-byte encoding, i.e., ascii 486 (write-read-repeat r0)) 487 (if (r0 < #xc2) 488 ;; continuation byte (invalid here) or 1st byte of overlong 489 ;; 2-byte sequence. 490 ((call ccl-mule-utf-untrans) 491 (r6 = ,(charset-id 'latin-iso8859-1)) 492 (read r0) 493 (repeat))) 494 495 ;; Read the 2nd byte. 496 (read r1) 497 (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte 498 ((call ccl-mule-utf-untrans) 499 (r6 = ,(charset-id 'latin-iso8859-1)) 500 ;; Handle it in the next loop. 501 (r0 = r1) 502 (repeat))) 503 504 (if (r0 < #xe0) 505 ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 506 ((r1 &= #x3F) 507 (r1 |= ((r0 & #x1F) << 6)) 508 ;; Now r1 holds scalar value. We don't have to check 509 ;; `overlong sequence' because r0 >= 0xC2. 510 511 (if (r1 >= 256) 512 ;; mule-unicode-0100-24ff (< 0800) 513 ((r0 = r1) 514 (lookup-integer utf-subst-table-for-decode r0 r1) 515 (if (r7 == 0) 516 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 517 (r1 -= #x0100) 518 (r2 = (((r1 / 96) + 32) << 7)) 519 (r1 %= 96) 520 (r1 += (r2 + 32)) 521 (translate-character 522 utf-translation-table-for-decode r0 r1))) 523 (write-multibyte-character r0 r1) 524 (read r0) 525 (repeat)) 526 (if (r1 >= 160) 527 ;; latin-iso8859-1 528 ((r0 = r1) 529 (lookup-integer utf-subst-table-for-decode r0 r1) 530 (if (r7 == 0) 531 ((r1 -= 128) 532 (write-multibyte-character r6 r1)) 533 ((write-multibyte-character r0 r1))) 534 (read r0) 535 (repeat)) 536 ;; eight-bit-control 537 ((r0 = ,(charset-id 'eight-bit-control)) 538 (write-multibyte-character r0 r1) 539 (read r0) 540 (repeat)))))) 541 542 ;; Read the 3rd bytes. 543 (read r2) 544 (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte 545 ((call ccl-mule-utf-untrans) 546 (r0 = r1) 547 (call ccl-mule-utf-untrans) 548 (r6 = ,(charset-id 'latin-iso8859-1)) 549 ;; Handle it in the next loop. 550 (r0 = r2) 551 (repeat))) 552 553 (if (r0 < #xF0) 554 ;; 3byte encoding 555 ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 556 ((r3 = ((r0 & #xF) << 12)) 557 (r3 |= ((r1 & #x3F) << 6)) 558 (r3 |= (r2 & #x3F)) 559 560 (if (r3 < #x800) ; `overlong sequence' 561 ((call ccl-mule-utf-untrans) 562 (r0 = r1) 563 (call ccl-mule-utf-untrans) 564 (r0 = r2) 565 (call ccl-mule-utf-untrans) 566 (r6 = ,(charset-id 'latin-iso8859-1)) 567 (read r0) 568 (repeat))) 569 570 (if (r3 < #x2500) 571 ;; mule-unicode-0100-24ff (>= 0800) 572 ((r0 = r3) 573 (lookup-integer utf-subst-table-for-decode r0 r1) 574 (if (r7 == 0) 575 ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) 576 (r3 -= #x0100) 577 (r3 //= 96) 578 (r1 = (r7 + 32)) 579 (r1 += ((r3 + 32) << 7)) 580 (translate-character 581 utf-translation-table-for-decode r0 r1))) 582 (write-multibyte-character r0 r1) 583 (read r0) 584 (repeat))) 585 586 (if (r3 < #x3400) 587 ;; mule-unicode-2500-33ff 588 ((r0 = r3) ; don't zap r3 589 (lookup-integer utf-subst-table-for-decode r0 r1) 590 (if (r7 == 0) 591 ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) 592 (r3 -= #x2500) 593 (r3 //= 96) 594 (r1 = (r7 + 32)) 595 (r1 += ((r3 + 32) << 7)))) 596 (write-multibyte-character r0 r1) 597 (read r0) 598 (repeat))) 599 600 (if (r3 < #xE000) 601 ;; Try to convert to CJK chars, else 602 ;; keep them as eight-bit-{control|graphic}. 603 ((r0 = r3) 604 (lookup-integer utf-subst-table-for-decode r3 r1) 605 (if r7 606 ;; got a translation 607 ((write-multibyte-character r3 r1) 608 (read r0) 609 (repeat)) 610 ((call ccl-mule-utf-untrans) 611 (r6 = ,(charset-id 'latin-iso8859-1)) 612 (read r0) 613 (repeat))))) 614 615 ;; mule-unicode-e000-ffff 616 ;; Fixme: fffe and ffff are invalid. 617 (r0 = r3) ; don't zap r3 618 (lookup-integer utf-subst-table-for-decode r0 r1) 619 (if (r7 == 0) 620 ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) 621 (r3 -= #xe000) 622 (r3 //= 96) 623 (r1 = (r7 + 32)) 624 (r1 += ((r3 + 32) << 7)))) 625 (write-multibyte-character r0 r1) 626 (read r0) 627 (repeat))) 628 629 ;; Read the 4th bytes. 630 (read r3) 631 (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte 632 ((call ccl-mule-utf-untrans) 633 (r0 = r1) 634 (call ccl-mule-utf-untrans) 635 (r0 = r2) 636 (call ccl-mule-utf-untrans) 637 (r6 = ,(charset-id 'latin-iso8859-1)) 638 ;; Handle it in the next loop. 639 (r0 = r3) 640 (repeat))) 641 642 (if (r0 < #xF8) 643 ;; 4-byte encoding: 644 ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx 645 ;; keep those bytes as eight-bit-{control|graphic} 646 ;; Fixme: allow lookup in utf-subst-table-for-decode. 647 ((r4 = ((r0 & #x7) << 18)) 648 (r4 |= ((r1 & #x3F) << 12)) 649 (r4 |= ((r2 & #x3F) << 6)) 650 (r4 |= (r3 & #x3F)) 651 652 (if (r4 < #x10000) ; `overlong sequence' 653 ((call ccl-mule-utf-untrans) 654 (r0 = r1) 655 (call ccl-mule-utf-untrans) 656 (r0 = r2) 657 (call ccl-mule-utf-untrans) 658 (r0 = r3) 659 (call ccl-mule-utf-untrans)) 660 ((r0 = r4) 661 (call ccl-mule-utf-untrans)))) 662 663 ;; Unsupported sequence. 664 ((call ccl-mule-utf-untrans) 665 (r0 = r1) 666 (call ccl-mule-utf-untrans) 667 (r0 = r2) 668 (call ccl-mule-utf-untrans) 669 (r0 = r3) 670 (call ccl-mule-utf-untrans))) 671 (r6 = ,(charset-id 'latin-iso8859-1)) 672 (read r0) 673 (repeat))) 674 675 676 ;; At EOF... 677 (if (r0 >= 0) 678 ;; r0 >= #x80 679 ((call ccl-mule-utf-untrans) 680 (if (r1 >= 0) 681 ((r0 = r1) 682 (call ccl-mule-utf-untrans) 683 (if (r2 >= 0) 684 ((r0 = r2) 685 (call ccl-mule-utf-untrans) 686 (if (r3 >= 0) 687 ((r0 = r3) 688 (call ccl-mule-utf-untrans)))))))))) 689 690 "CCL program to decode UTF-8. 691Basic decoding is done into the charsets ascii, latin-iso8859-1 and 692mule-unicode-*, but see also `utf-fragmentation-table' and 693`ucs-mule-cjk-to-unicode'. 694Encodings of un-representable Unicode characters are decoded asis into 695eight-bit-control and eight-bit-graphic characters.") 696 697(define-ccl-program ccl-mule-utf-8-encode-untrans 698 ;; UTF-8 decoder generates an UTF-8 sequence represented by a 699 ;; sequence eight-bit-control/graphic chars for an untranslatable 700 ;; character and an invalid byte. 701 ;; 702 ;; This CCL parses that sequence (the first byte is already in r1), 703 ;; writes out the original bytes of that sequence, and sets r5 to 704 ;; -1. 705 ;; 706 ;; If the eight-bit-control/graphic sequence is shorter than what r1 707 ;; suggests, it sets r5 and r6 to the last character read that 708 ;; should be handled by the next loop of a caller. 709 ;; 710 ;; Note: For UTF-8 validation, we only check if a character is 711 ;; eight-bit-control/graphic or not. It may result in incorrect 712 ;; handling of random binary data, but such a data can't be encoded 713 ;; by UTF-8 anyway. At least, UTF-8 decoders doesn't generate such 714 ;; a sequence even if a source contains invalid byte-sequence. 715 `(0 716 (;; Read the 2nd byte. 717 (read-multibyte-character r5 r6) 718 (r0 = (r5 != ,(charset-id 'eight-bit-control))) 719 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) 720 ((write r1) ; invalid UTF-8 721 (r1 = -1) 722 (end))) 723 724 (if (r1 <= #xC3) 725 ;; 2-byte sequence for an originally invalid byte. 726 ((r6 &= #x3F) 727 (r6 |= ((r1 & #x1F) << 6)) 728 (write r6) 729 (r5 = -1) 730 (end))) 731 732 (write r1 r6) 733 (r2 = r1) 734 (r1 = -1) 735 ;; Read the 3rd byte. 736 (read-multibyte-character r5 r6) 737 (r0 = (r5 != ,(charset-id 'eight-bit-control))) 738 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) 739 (end)) ; invalid UTF-8 740 (write r6) 741 (if (r2 < #xF0) 742 ;; 3-byte sequence for an untranslated character. 743 ((r5 = -1) 744 (end))) 745 ;; Read the 4th byte. 746 (read-multibyte-character r5 r6) 747 (r0 = (r5 != ,(charset-id 'eight-bit-control))) 748 (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0) 749 (end)) ; invalid UTF-8 750 ;; 4-byte sequence for an untranslated character. 751 (write r6) 752 (r5 = -1) 753 (end)) 754 755 ;; At EOF... 756 ((r5 = -1) 757 (if (r1 >= 0) 758 (write r1))))) 759 760(define-ccl-program ccl-encode-mule-utf-8 761 `(1 762 ((r5 = -1) 763 (loop 764 (if (r5 < 0) 765 (read-multibyte-character r0 r1) 766 ;; Pre-read character is in r5 (charset-ID) and r6 (code-point). 767 ((r0 = r5) 768 (r1 = r6) 769 (r5 = -1))) 770 (translate-character utf-translation-table-for-encode r0 r1) 771 772 (if (r0 == ,(charset-id 'ascii)) 773 (write-repeat r1)) 774 775 (if (r0 == ,(charset-id 'latin-iso8859-1)) 776 ;; r1 scalar utf-8 777 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 778 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 779 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 780 ((write ((r1 >> 6) | #xc2)) 781 (r1 &= #x3f) 782 (r1 |= #x80) 783 (write-repeat r1))) 784 785 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) 786 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 787 ;; #x3f80 == (0011 1111 1000 0000)b 788 (r1 &= #x7f) 789 (r1 += (r0 + 224)) ; 240 == -32 + #x0100 790 ;; now r1 holds scalar value 791 (if (r1 < #x0800) 792 ;; 2byte encoding 793 ((write ((r1 >> 6) | #xC0)) 794 (r1 &= #x3F) 795 (r1 |= #x80) 796 (write-repeat r1)) 797 ;; 3byte encoding 798 ((write ((r1 >> 12) | #xE0)) 799 (write (((r1 & #x0FC0) >> 6) | #x80)) 800 (r1 &= #x3F) 801 (r1 |= #x80) 802 (write-repeat r1))))) 803 804 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) 805 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 806 (r1 &= #x7f) 807 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 808 ;; now r1 holds scalar value 809 (write ((r1 >> 12) | #xE0)) 810 (write (((r1 & #x0FC0) >> 6) | #x80)) 811 (r1 &= #x3F) 812 (r1 |= #x80) 813 (write-repeat r1))) 814 815 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) 816 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) 817 (r1 &= #x7f) 818 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 819 ;; now r1 holds scalar value 820 (write ((r1 >> 12) | #xE0)) 821 (write (((r1 & #x0FC0) >> 6) | #x80)) 822 (r1 &= #x3F) 823 (r1 |= #x80) 824 (write-repeat r1))) 825 826 (if (r0 == ,(charset-id 'eight-bit-control)) 827 ;; r1 scalar utf-8 828 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 829 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 830 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 831 ((write #xC2) 832 (write-repeat r1))) 833 834 (if (r0 == ,(charset-id 'eight-bit-graphic)) 835 ;; r1 scalar utf-8 836 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx 837 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 838 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 839 ((r0 = (r1 >= #xC0)) 840 (r0 &= (r1 <= #xC3)) 841 (r4 = (r1 >= #xE1)) 842 (r4 &= (r1 <= #xF7)) 843 (r0 |= r4) 844 (if r0 845 ((call ccl-mule-utf-8-encode-untrans) 846 (repeat)) 847 (write-repeat r1)))) 848 849 (lookup-character utf-subst-table-for-encode r0 r1) 850 (if r7 ; lookup succeeded 851 (if (r0 < #x800) 852 ;; 2byte encoding 853 ((write ((r0 >> 6) | #xC0)) 854 (r0 = ((r0 & #x3F) | #x80)) 855 (write-repeat r0)) 856 ;; 3byte encoding 857 ((write ((r0 >> 12) | #xE0)) 858 (write (((r0 & #x0FC0) >> 6) | #x80)) 859 (r0 = ((r0 & #x3F) | #x80)) 860 (write-repeat r0)))) 861 862 ;; Unsupported character. 863 ;; Output U+FFFD, which is `ef bf bd' in UTF-8. 864 (write #xef) 865 (write #xbf) 866 (write-repeat #xbd)))) 867 "CCL program to encode into UTF-8.") 868 869 870(define-ccl-program ccl-untranslated-to-ucs 871 `(0 872 (if (r1 == 0) 873 nil 874 (if (r0 <= #xC3) ; 2-byte encoding 875 ((r0 = ((r0 & #x3) << 6)) 876 (r0 |= (r1 & #x3F)) 877 (r1 = 2)) 878 (if (r2 == 0) 879 (r1 = 0) 880 (if (r0 < #xF0) ; 3-byte encoding, as above 881 ((r0 = ((r0 & #xF) << 12)) 882 (r0 |= ((r1 & #x3F) << 6)) 883 (r0 |= (r2 & #x3F)) 884 (r1 = 3)) 885 (if (r3 == 0) 886 (r1 = 0) 887 ((r0 = ((r0 & #x7) << 18)) 888 (r0 |= ((r1 & #x3F) << 12)) 889 (r0 |= ((r2 & #x3F) << 6)) 890 (r0 |= (r3 & #x3F)) 891 (r1 = 4)))))))) 892 "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. 893Set r1 to the byte length. r0 == 0 for invalid sequence.") 894 895(defvar utf-8-ccl-regs (make-vector 8 0)) 896 897(defsubst utf-8-untranslated-to-ucs () 898 "Return the UCS code for an untranslated sequence of raw bytes t point. 899Only for 3- or 4-byte sequences." 900 (aset utf-8-ccl-regs 0 (or (char-after) 0)) 901 (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) 902 (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) 903 (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) 904 (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)) 905 906(defun utf-8-help-echo (window object position) 907 (format "Untranslated Unicode U+%04X" 908 (get-char-property position 'untranslated-utf-8 object))) 909 910;; We compose the untranslatable sequences into a single character, 911;; and move point to the next character. 912;; This is infelicitous for editing, because there's currently no 913;; mechanism for treating compositions as atomic, but is OK for 914;; display. They are composed to U+FFFD with help-echo which 915;; indicates the unicodes they represent. This function GCs too much. 916 917;; If utf-translate-cjk-mode is non-nil, this function is called with 918;; HASH-TABLE which translates CJK characters into some of CJK 919;; charsets. 920 921(defsubst utf-8-compose (hash-table) 922 "Put a suitable composition on an untranslatable sequence at point. 923If HASH-TABLE is non-nil, try to translate CJK characters by it at first. 924Move point to the end of the sequence." 925 (utf-8-untranslated-to-ucs) 926 (let ((l (aref utf-8-ccl-regs 1)) 927 ch) 928 (if (> l 0) 929 (if (and hash-table 930 (setq ch (gethash (aref utf-8-ccl-regs 0) hash-table))) 931 (progn 932 (insert ch) 933 (delete-region (point) (min (point-max) (+ l (point))))) 934 (setq ch (aref utf-8-ccl-regs 0)) 935 (put-text-property (point) (min (point-max) (+ l (point))) 936 'untranslated-utf-8 ch) 937 (put-text-property (point) (min (point-max) (+ l (point))) 938 'help-echo 'utf-8-help-echo) 939 (if (= l 2) 940 (put-text-property (point) (min (point-max) (+ l (point))) 941 'display (propertize (format "\\%03o" ch) 942 'face 'escape-glyph)) 943 (compose-region (point) (+ l (point)) ?$,3u=(B)) 944 (forward-char l)) 945 (forward-char 1)))) 946 947(defcustom utf-8-compose-scripts nil 948 "*Non-nil means compose various scripts on decoding utf-8 text." 949 :group 'mule 950 :version "22.1" 951 :type 'boolean) 952 953(defun utf-8-post-read-conversion (length) 954 "Compose untranslated utf-8 sequences into single characters. 955If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters. 956Also compose particular scripts if `utf-8-compose-scripts' is non-nil." 957 (save-excursion 958 (save-restriction 959 (narrow-to-region (point) (+ (point) length)) 960 ;; Can't do eval-when-compile to insert a multibyte constant 961 ;; version of the string in the loop, since it's always loaded as 962 ;; unibyte from a byte-compiled file. 963 (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7")) 964 (buffer-multibyte enable-multibyte-characters) 965 hash-table ch) 966 (set-buffer-multibyte t) 967 (when utf-translate-cjk-mode 968 (unless utf-translate-cjk-lang-env 969 ;; Check these characters in utf-translate-cjk-range. 970 ;; We may have to translate them to CJK charsets. 971 (skip-chars-forward 972 (concat range utf-translate-cjk-unicode-range-string)) 973 (unless (eobp) 974 (utf-translate-cjk-load-tables) 975 (setq range 976 (concat range utf-translate-cjk-unicode-range-string))) 977 (setq hash-table (get 'utf-subst-table-for-decode 978 'translation-hash-table)))) 979 (while (and (skip-chars-forward range) 980 (not (eobp))) 981 (setq ch (following-char)) 982 (if (< ch 256) 983 (utf-8-compose hash-table) 984 (if (and hash-table 985 (setq ch (gethash (encode-char ch 'ucs) hash-table))) 986 (progn 987 (insert ch) 988 (delete-char 1)) 989 (forward-char 1)))) 990 (or buffer-multibyte 991 (set-buffer-multibyte nil))) 992 993 (when (and utf-8-compose-scripts (> length 1)) 994 ;; These currently have definitions which cover the relevant 995 ;; unicodes. We could avoid loading thai-util &c by checking 996 ;; whether the region contains any characters with the appropriate 997 ;; categories. There aren't yet Unicode-based rules for Tibetan. 998 (diacritic-compose-region (point-max) (point-min)) 999 (thai-compose-region (point-max) (point-min)) 1000 (lao-compose-region (point-max) (point-min)) 1001 (devanagari-compose-region (point-max) (point-min)) 1002 (malayalam-compose-region (point-max) (point-min)) 1003 (tamil-compose-region (point-max) (point-min))) 1004 (- (point-max) (point-min))))) 1005 1006(defun utf-8-pre-write-conversion (beg end) 1007 "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END. 1008This is used as a post-read-conversion of utf-8 coding system." 1009 (if (and utf-translate-cjk-mode 1010 (not utf-translate-cjk-lang-env) 1011 (if (stringp beg) 1012 (string-match "\\cc\\|\\cj\\|\\ch" beg) 1013 (save-excursion 1014 (goto-char beg) 1015 (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))) 1016 (utf-translate-cjk-load-tables)) 1017 nil) 1018 1019(make-coding-system 1020 'mule-utf-8 4 ?u 1021 "UTF-8 encoding for Emacs-supported Unicode characters. 1022It supports Unicode characters of these ranges: 1023 U+0000..U+33FF, U+E000..U+FFFF. 1024They correspond to these Emacs character sets: 1025 ascii, latin-iso8859-1, mule-unicode-0100-24ff, 1026 mule-unicode-2500-33ff, mule-unicode-e000-ffff 1027 1028On decoding (e.g. reading a file), Unicode characters not in the above 1029ranges are decoded into sequences of eight-bit-control and 1030eight-bit-graphic characters to preserve their byte sequences. The 1031byte sequence is preserved on i/o for valid utf-8, but not necessarily 1032for invalid utf-8. 1033 1034On encoding (e.g. writing a file), Emacs characters not belonging to 1035any of the character sets listed above are encoded into the UTF-8 byte 1036sequence representing U+FFFD (REPLACEMENT CHARACTER)." 1037 1038 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) 1039 `((safe-charsets 1040 ascii 1041 eight-bit-control 1042 eight-bit-graphic 1043 latin-iso8859-1 1044 mule-unicode-0100-24ff 1045 mule-unicode-2500-33ff 1046 mule-unicode-e000-ffff 1047 ,@(if utf-translate-cjk-mode 1048 utf-translate-cjk-charsets)) 1049 (mime-charset . utf-8) 1050 (coding-category . coding-category-utf-8) 1051 (valid-codes (0 . 255)) 1052 (pre-write-conversion . utf-8-pre-write-conversion) 1053 (post-read-conversion . utf-8-post-read-conversion) 1054 (translation-table-for-encode . utf-translation-table-for-encode) 1055 (dependency unify-8859-on-encoding-mode 1056 unify-8859-on-decoding-mode 1057 utf-fragment-on-decoding 1058 utf-translate-cjk-mode))) 1059 1060(define-coding-system-alias 'utf-8 'mule-utf-8) 1061 1062;; I think this needs special private charsets defined for the 1063;; untranslated sequences, if it's going to work well. 1064 1065;;; (defun utf-8-compose-function (pos to pattern &optional string) 1066;;; (let* ((prop (get-char-property pos 'composition string)) 1067;;; (l (and prop (- (cadr prop) (car prop))))) 1068;;; (cond ((and l (> l (- to pos))) 1069;;; (delete-region pos to)) 1070;;; ((and (> (char-after pos) 224) 1071;;; (< (char-after pos) 256) 1072;;; (save-restriction 1073;;; (narrow-to-region pos to) 1074;;; (utf-8-compose))) 1075;;; t)))) 1076 1077;;; (dotimes (i 96) 1078;;; (aset composition-function-table 1079;;; (+ 128 i) 1080;;; `((,(string-as-multibyte "[\200-\237\240-\377]") 1081;;; . utf-8-compose-function)))) 1082 1083;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5 1084;;; utf-8.el ends here 1085