1;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
2
3;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
4;;   Free Software Foundation, Inc.
5;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
6;;   National Institute of Advanced Industrial Science and Technology (AIST)
7;;   Registration Number H14PRO021
8
9;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
10;; Maintainer: FSF
11;; Keywords: multilingual, Unicode, UTF-8, i18n
12
13;; This file is part of GNU Emacs.
14
15;; GNU Emacs is free software; you can redistribute it and/or modify
16;; it under the terms of the GNU General Public License as published by
17;; the Free Software Foundation; either version 2, or (at your option)
18;; any later version.
19
20;; GNU Emacs is distributed in the hope that it will be useful,
21;; but WITHOUT ANY WARRANTY; without even the implied warranty of
22;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23;; GNU General Public License for more details.
24
25;; You should have received a copy of the GNU General Public License
26;; along with GNU Emacs; see the file COPYING.  If not, write to the
27;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
28;; Boston, MA 02110-1301, USA.
29
30;;; Commentary:
31
32;; The coding-system `mule-utf-8' basically supports encoding/decoding
33;; of the following character sets to and from UTF-8:
34;;
35;;   ascii
36;;   eight-bit-control
37;;   latin-iso8859-1
38;;   mule-unicode-0100-24ff
39;;   mule-unicode-2500-33ff
40;;   mule-unicode-e000-ffff
41;;
42;; On decoding, Unicode characters that do not fit into the above
43;; character sets are handled as `eight-bit-control' or
44;; `eight-bit-graphic' characters to retain the information about the
45;; original byte sequence and text properties record the corresponding
46;; unicode.
47;;
48;; Fixme: note that reading and writing invalid utf-8 may not be
49;; idempotent -- to represent the bytes to fix that needs a new charset.
50;;
51;; Characters from other character sets can be encoded with mule-utf-8
52;; by populating the translation table
53;; `utf-translation-table-for-encode'.  Hash tables
54;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
55;; used to support encoding and decoding of about a quarter of the CJK
56;; space between U+3400 and U+DFFF.
57
58;; UTF-8 is defined in RFC 3629.  A sketch of the encoding is:
59
60;;        scalar       |               utf-8
61;;        value        | 1st byte  | 2nd byte  | 3rd byte
62;; --------------------+-----------+-----------+----------
63;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
64;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
65;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
66
67;;; Code:
68
69(defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
70  "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
71
72If `unify-8859-on-encoding-mode' is non-nil, this table populates the
73translation-table named `utf-translation-table-for-encode'.")
74
75(define-translation-table 'utf-translation-table-for-encode)
76
77
78;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
79;; space of mule-unicode.  For Latin scripts this isn't very
80;; important.  Hebrew and Arabic might go here too when there's proper
81;; support for them.
82
83(defvar utf-fragmentation-table (make-char-table 'translation-table nil)
84  "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
85
86If `utf-fragment-on-decoding' is non-nil, this table populates the
87translation-table named `utf-translation-table-for-decode'")
88
89(defvar utf-defragmentation-table (make-char-table 'translation-table nil)
90  "Char-table for reverse mapping of `utf-fragmentation-table'.
91
92If `utf-fragment-on-decoding' is non-nil and
93`unify-8859-on-encoding-mode' is nil, this table populates the
94translation-table named `utf-translation-table-for-encode'")
95
96(define-translation-table 'utf-translation-table-for-decode)
97
98
99(defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
100  "Hash table mapping Emacs CJK character sets to Unicode code points.
101
102If `utf-translate-cjk-mode' is non-nil, this table populates the
103translation-hash-table named `utf-subst-table-for-encode'.")
104
105(define-translation-hash-table 'utf-subst-table-for-encode
106  ucs-mule-cjk-to-unicode)
107
108(defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
109  "Hash table mapping Unicode code points to Emacs CJK character sets.
110
111If `utf-translate-cjk-mode' is non-nil, this table populates the
112translation-hash-table named `utf-subst-table-for-decode'.")
113
114(define-translation-hash-table 'utf-subst-table-for-decode
115  ucs-unicode-to-mule-cjk)
116
117(mapc
118 (lambda (pair)
119   (aset utf-fragmentation-table (car pair) (cdr pair))
120   (aset utf-defragmentation-table (cdr pair) (car pair)))
121 '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
122   (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
123   (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
124   (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
125   (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
126   (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
127   (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
128   (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
129   (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
130   (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
131   (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
132   (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
133   (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
134   (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
135   (?$,1'N(B . ?,F~(B)
136
137   (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
138   (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
139   (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
140   (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
141   (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
142   (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
143   (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
144   (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
145   (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
146   (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
147   (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
148   (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
149   (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
150   (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
151   (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
152   (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
153   (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
154   (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
155   (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
156
157
158(defcustom utf-fragment-on-decoding nil
159  "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
160Setting this means that the relevant Cyrillic and Greek characters are
161decoded into the iso8859 charsets rather than into
162mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
163in the buffer, but using them may affect how the buffer can be re-encoded
164and may require a different input method to search for them, for instance.
165See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
166for mechanisms to make this largely transparent.
167
168Setting this variable outside customize has no effect."
169  :set (lambda (s v)
170	 (if v
171	     (progn
172	       (define-translation-table 'utf-translation-table-for-decode
173		 utf-fragmentation-table)
174	       ;; Even if unify-8859-on-encoding-mode is off, make
175	       ;; mule-utf-* encode characters in
176	       ;; utf-fragmentation-table.
177	       (unless (eq (get 'utf-translation-table-for-encode
178				'translation-table)
179			   ucs-mule-to-mule-unicode)
180		 (define-translation-table 'utf-translation-table-for-encode
181		   utf-defragmentation-table)))
182	   (define-translation-table 'utf-translation-table-for-decode)
183	   ;; When unify-8859-on-encoding-mode is off, be sure to make
184	   ;; mule-utf-* disabled for characters in
185	   ;; utf-fragmentation-table.
186	   (unless (eq (get 'utf-translation-table-for-encode
187			    'translation-table)
188		       ucs-mule-to-mule-unicode)
189	     (define-translation-table 'utf-translation-table-for-encode)))
190	 (set-default s v))
191  :version "22.1"
192  :type 'boolean
193  :group 'mule)
194
195
196(defconst utf-translate-cjk-charsets '(chinese-gb2312
197				       chinese-big5-1 chinese-big5-2
198				       japanese-jisx0208 japanese-jisx0212
199				       katakana-jisx0201
200				       korean-ksc5601)
201  "List of charsets supported by `utf-translate-cjk-mode'.")
202
203(defvar utf-translate-cjk-lang-env nil
204  "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
205The value nil means that the tables are not yet loaded.")
206
207(defvar utf-translate-cjk-unicode-range)
208
209;; String generated from utf-translate-cjk-unicode-range.  It is
210;; suitable for an argument to skip-chars-forward.
211(defvar utf-translate-cjk-unicode-range-string nil)
212
213(defun utf-translate-cjk-set-unicode-range (range)
214  (setq utf-translate-cjk-unicode-range range)
215  (setq utf-translate-cjk-unicode-range-string
216	(let ((decode-char-no-trans
217	       #'(lambda (x)
218		   (cond ((< x #x100) (make-char 'latin-iso8859-1 x))
219			 ((< x #x2500)
220			  (setq x (- x #x100))
221			  (make-char 'mule-unicode-0100-24ff
222				     (+ (/ x 96) 32) (+ (% x 96) 32)))
223			 ((< x #x3400)
224			  (setq x (- x #x2500))
225			  (make-char 'mule-unicode-2500-33ff
226				     (+ (/ x 96) 32) (+ (% x 96) 32)))
227			 (t
228			  (setq x (- x #xe000))
229			  (make-char 'mule-unicode-e000-ffff
230				     (+ (/ x 96) 32) (+ (% x 96) 32))))))
231	      ranges from to)
232	  (dolist (elt range)
233	    (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
234	    (if (and (>= to #x3400) (< to #xE000))
235		(setq to #x33FF))
236	    (cond ((< from #x100)
237		   (if (>= to #xE000)
238		       (setq ranges (cons (cons #xE000 to) ranges)
239			     to #x33FF))
240		   (if (>= to #x2500)
241		       (setq ranges (cons (cons #x2500 to) ranges)
242			     to #x24FF))
243		   (if (>= to #x100)
244		       (setq ranges (cons (cons #x100 to) ranges)
245			     to #xFF)))
246		  ((< from #x2500)
247		   (if (>= to #xE000)
248		       (setq ranges (cons (cons #xE000 to) ranges)
249			     to #x33FF))
250		   (if (>= to #x2500)
251		       (setq ranges (cons (cons #x2500 to) ranges)
252			     to #x24FF)))
253		  ((< from #x3400)
254		   (if (>= to #xE000)
255		       (setq ranges (cons (cons #xE000 to) ranges)
256			     to #x33FF))))
257	    (if (<= from to)
258		(setq ranges (cons (cons from to) ranges))))
259	  (mapconcat #'(lambda (x)
260			 (format "%c-%c"
261				 (funcall decode-char-no-trans (car x))
262				 (funcall decode-char-no-trans (cdr x))))
263		     ranges "")))
264  ;; These forces loading and settting tables for
265  ;; utf-translate-cjk-mode.
266  (setq utf-translate-cjk-lang-env nil
267	ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
268	ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
269
270(defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
271					     (#xff00 . #xffef))
272  "List of Unicode code ranges supported by `utf-translate-cjk-mode'.
273Setting this variable directly does not take effect;
274use either \\[customize] or the function
275`utf-translate-cjk-set-unicode-range'."
276  :version "22.1"
277  :type '(repeat (cons integer integer))
278  :set (lambda (symbol value)
279	 (utf-translate-cjk-set-unicode-range value))
280  :group 'mule)
281
282;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
283(defsubst utf-translate-cjk-substitutable-p (code-point)
284  (let ((tail utf-translate-cjk-unicode-range)
285	elt)
286    (while tail
287      (setq elt (car tail) tail (cdr tail))
288      (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
289	  (setq tail nil)
290	(setq elt nil)))
291    elt))
292
293(defun utf-translate-cjk-load-tables ()
294  "Load tables for `utf-translate-cjk-mode'."
295  ;; Fixme: Allow the use of the CJK charsets to be
296  ;; customized by reordering and possible omission.
297  (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
298    (if redefined
299	;; Redefine them with realistic initial sizes and a
300	;; smallish rehash size to avoid wasting significant
301	;; space after they're built.
302	(setq ucs-mule-cjk-to-unicode
303	      (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
304	      ucs-unicode-to-mule-cjk
305	      (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
306
307    ;; Load the files explicitly, to avoid having to keep
308    ;; around the large tables they contain (as well as the
309    ;; ones which get built).
310    ;; Here we bind coding-system-for-read to nil so that coding tags
311    ;; in the files are respected even if the files are not yet
312    ;; byte-compiled
313    (let ((coding-system-for-read nil)
314	  ;; We must avoid clobbering this variable, in case the load
315	  ;; files below use different coding systems.
316	  (last-coding-system-used last-coding-system-used))
317      (cond ((string= "Korean" current-language-environment)
318	     (load "subst-jis")
319	     (load "subst-big5")
320	     (load "subst-gb2312")
321	     (load "subst-ksc"))
322	    ((string= "Chinese-BIG5" current-language-environment)
323	     (load "subst-jis")
324	     (load "subst-ksc")
325	     (load "subst-gb2312")
326	     (load "subst-big5"))
327	    ((string= "Chinese-GB" current-language-environment)
328	     (load "subst-jis")
329	     (load "subst-ksc")
330	     (load "subst-big5")
331	     (load "subst-gb2312"))
332	    (t
333	     (load "subst-ksc")
334	     (load "subst-gb2312")
335	     (load "subst-big5")
336	     (load "subst-jis")))) ; jis covers as much as big5, gb2312
337
338    (when redefined
339      (define-translation-hash-table 'utf-subst-table-for-decode
340	ucs-unicode-to-mule-cjk)
341      (define-translation-hash-table 'utf-subst-table-for-encode
342	ucs-mule-cjk-to-unicode)
343      (set-char-table-extra-slot (get 'utf-translation-table-for-encode
344				      'translation-table)
345				 1 ucs-mule-cjk-to-unicode))
346
347    (setq utf-translate-cjk-lang-env current-language-environment)))
348
349(defun utf-lookup-subst-table-for-decode (code-point)
350  (if (and utf-translate-cjk-mode
351	   (not utf-translate-cjk-lang-env)
352	   (utf-translate-cjk-substitutable-p code-point))
353      (utf-translate-cjk-load-tables))
354  (gethash code-point
355	   (get 'utf-subst-table-for-decode 'translation-hash-table)))
356
357
358(defun utf-lookup-subst-table-for-encode (char)
359  (if (and utf-translate-cjk-mode
360	   (not utf-translate-cjk-lang-env)
361	   (memq (char-charset char) utf-translate-cjk-charsets))
362      (utf-translate-cjk-load-tables))
363  (gethash char
364	   (get 'utf-subst-table-for-encode 'translation-hash-table)))
365
366(define-minor-mode utf-translate-cjk-mode
367  "Toggle whether UTF based coding systems de/encode CJK characters.
368If ARG is an integer, enable if ARG is positive and disable if
369zero or negative.  This is a minor mode.
370Enabling this allows the coding systems mule-utf-8,
371mule-utf-16le and mule-utf-16be to encode characters in the charsets
372`korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
373`chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
374decode the corresponding unicodes into such characters.
375
376Where the charsets overlap, the one preferred for decoding is chosen
377according to the language environment in effect when this option is
378turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
379Chinese-Big5 and jisx for other environments.
380
381This mode is on by default.  If you are not interested in CJK
382characters and want to avoid some overhead on encoding/decoding
383by the above coding systems, you can customize the user option
384`utf-translate-cjk-mode' to nil."
385  :init-value t
386  :version "22.1"
387  :type 'boolean
388  :group 'mule
389  :global t
390  (if utf-translate-cjk-mode
391      (progn
392	(define-translation-hash-table 'utf-subst-table-for-decode
393	  ucs-unicode-to-mule-cjk)
394	(define-translation-hash-table 'utf-subst-table-for-encode
395	  ucs-mule-cjk-to-unicode)
396	(set-char-table-extra-slot (get 'utf-translation-table-for-encode
397					'translation-table)
398				   1 ucs-mule-cjk-to-unicode))
399    (define-translation-hash-table 'utf-subst-table-for-decode
400      (make-hash-table :test 'eq))
401    (define-translation-hash-table 'utf-subst-table-for-encode
402      (make-hash-table :test 'eq))
403    (set-char-table-extra-slot (get 'utf-translation-table-for-encode
404				    'translation-table)
405			       1 nil))
406
407  ;; Update safe-chars of mule-utf-* coding systems.
408  (dolist (elt (coding-system-list t))
409    (if (string-match "^mule-utf" (symbol-name elt))
410	(let ((safe-charsets (coding-system-get elt 'safe-charsets))
411	      (safe-chars (coding-system-get elt 'safe-chars))
412	      (need-update nil))
413	  (dolist (charset utf-translate-cjk-charsets)
414	    (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
415	      (setq safe-charsets
416		    (if utf-translate-cjk-mode
417			(cons charset safe-charsets)
418		      (delq charset safe-charsets))
419		    need-update t)
420	      (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
421	  (when need-update
422	    (coding-system-put elt 'safe-charsets safe-charsets)
423	    (define-coding-system-internal elt))))))
424
425(define-ccl-program ccl-mule-utf-untrans
426  ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
427  ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF).  Write
428  ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
429  ;; UTF-8 sequence of r0.  Registers r4, r5, r6 are modified.
430  ;;
431  ;; This is a subrountine because we assume that this is called very
432  ;; rarely (so we don't have to worry about the overhead of the
433  ;; call).
434  `(0
435    ((r5 = ,(charset-id 'eight-bit-control))
436     (r6 = ,(charset-id 'eight-bit-graphic))
437     (if (r0 < #x100)
438	 ((r4 = ((r0 >> 6) | #xC0))
439	  (write-multibyte-character r6 r4))
440       ((if (r0 < #x10000)
441	    ((r4 = ((r0 >> 12) | #xE0))
442	     (write-multibyte-character r6 r4))
443	  ((r4 = ((r0 >> 18) | #xF0))
444	   (write-multibyte-character r6 r4)
445	   (r4 = (((r0 >> 12) & #x3F) | #x80))
446	   (if (r4 < #xA0)
447	       (write-multibyte-character r5 r4)
448	     (write-multibyte-character r6 r4))))
449	(r4 = (((r0 >> 6) & #x3F) | #x80))
450	(if (r4 < #xA0)
451	    (write-multibyte-character r5 r4)
452	  (write-multibyte-character r6 r4))))
453     (r4 = ((r0 & #x3F) | #x80))
454     (if (r4 < #xA0)
455	 (write-multibyte-character r5 r4)
456       (write-multibyte-character r6 r4)))))
457
458(define-ccl-program ccl-decode-mule-utf-8
459  ;;
460  ;;        charset         | bytes in utf-8 | bytes in emacs
461  ;; -----------------------+----------------+---------------
462  ;;         ascii          |       1        |       1
463  ;; -----------------------+----------------+---------------
464  ;;    eight-bit-control   |       2        |       2
465  ;;    eight-bit-graphic   |       2        |       1
466  ;;     latin-iso8859-1    |       2        |       2
467  ;; -----------------------+----------------+---------------
468  ;; mule-unicode-0100-24ff |       2        |       4
469  ;;        (< 0800)        |                |
470  ;; -----------------------+----------------+---------------
471  ;; mule-unicode-0100-24ff |       3        |       4
472  ;;        (>= 8000)       |                |
473  ;; mule-unicode-2500-33ff |       3        |       4
474  ;; mule-unicode-e000-ffff |       3        |       4
475  ;; -----------------------+----------------+---------------
476  ;;      invalid byte      |       1        |       2
477  ;;
478  ;; Thus magnification factor is two.
479  ;;
480  `(2
481    ((r6 = ,(charset-id 'latin-iso8859-1))
482     (read r0)
483     (loop
484      (if (r0 < #x80)
485	  ;; 1-byte encoding, i.e., ascii
486	  (write-read-repeat r0))
487      (if (r0 < #xc2)
488	  ;; continuation byte (invalid here) or 1st byte of overlong
489	  ;; 2-byte sequence.
490	  ((call ccl-mule-utf-untrans)
491	   (r6 = ,(charset-id 'latin-iso8859-1))
492	   (read r0)
493	   (repeat)))
494
495      ;; Read the 2nd byte.
496      (read r1)
497      (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
498	  ((call ccl-mule-utf-untrans)
499	   (r6 = ,(charset-id 'latin-iso8859-1))
500	   ;; Handle it in the next loop.
501	   (r0 = r1)
502	   (repeat)))
503
504      (if (r0 < #xe0)
505	  ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
506	  ((r1 &= #x3F)
507	   (r1 |= ((r0 & #x1F) << 6))
508	   ;; Now r1 holds scalar value.  We don't have to check
509	   ;; `overlong sequence' because r0 >= 0xC2.
510
511	   (if (r1 >= 256)
512	       ;; mule-unicode-0100-24ff (< 0800)
513	       ((r0 = r1)
514		(lookup-integer utf-subst-table-for-decode r0 r1)
515		(if (r7 == 0)
516		    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
517		     (r1 -= #x0100)
518		     (r2 = (((r1 / 96) + 32) << 7))
519		     (r1 %= 96)
520		     (r1 += (r2 + 32))
521		     (translate-character
522		      utf-translation-table-for-decode r0 r1)))
523		(write-multibyte-character r0 r1)
524		(read r0)
525		(repeat))
526	     (if (r1 >= 160)
527		 ;; latin-iso8859-1
528		 ((r0 = r1)
529		  (lookup-integer utf-subst-table-for-decode r0 r1)
530		  (if (r7 == 0)
531		      ((r1 -= 128)
532		       (write-multibyte-character r6 r1))
533		    ((write-multibyte-character r0 r1)))
534		  (read r0)
535		  (repeat))
536	       ;; eight-bit-control
537	       ((r0 = ,(charset-id 'eight-bit-control))
538		(write-multibyte-character r0 r1)
539		(read r0)
540		(repeat))))))
541
542      ;; Read the 3rd bytes.
543      (read r2)
544      (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
545	  ((call ccl-mule-utf-untrans)
546	   (r0 = r1)
547	   (call ccl-mule-utf-untrans)
548	   (r6 = ,(charset-id 'latin-iso8859-1))
549	   ;; Handle it in the next loop.
550	   (r0 = r2)
551	   (repeat)))
552
553      (if (r0 < #xF0)
554	  ;; 3byte encoding
555	  ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
556	  ((r3 = ((r0 & #xF) << 12))
557	   (r3 |= ((r1 & #x3F) << 6))
558	   (r3 |= (r2 & #x3F))
559
560	   (if (r3 < #x800)		; `overlong sequence'
561	       ((call ccl-mule-utf-untrans)
562		(r0 = r1)
563		(call ccl-mule-utf-untrans)
564		(r0 = r2)
565		(call ccl-mule-utf-untrans)
566		(r6 = ,(charset-id 'latin-iso8859-1))
567		(read r0)
568		(repeat)))
569
570	   (if (r3 < #x2500)
571	       ;; mule-unicode-0100-24ff (>= 0800)
572	       ((r0 = r3)
573		(lookup-integer utf-subst-table-for-decode r0 r1)
574		(if (r7 == 0)
575		    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
576		     (r3 -= #x0100)
577		     (r3 //= 96)
578		     (r1 = (r7 + 32))
579		     (r1 += ((r3 + 32) << 7))
580		     (translate-character
581		      utf-translation-table-for-decode r0 r1)))
582		(write-multibyte-character r0 r1)
583		(read r0)
584		(repeat)))
585
586	   (if (r3 < #x3400)
587	       ;; mule-unicode-2500-33ff
588	       ((r0 = r3)		; don't zap r3
589		(lookup-integer utf-subst-table-for-decode r0 r1)
590		(if (r7 == 0)
591		    ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
592		     (r3 -= #x2500)
593		     (r3 //= 96)
594		     (r1 = (r7 + 32))
595		     (r1 += ((r3 + 32) << 7))))
596		(write-multibyte-character r0 r1)
597		(read r0)
598		(repeat)))
599
600	   (if (r3 < #xE000)
601	       ;; Try to convert to CJK chars, else
602	       ;; keep them as eight-bit-{control|graphic}.
603	       ((r0 = r3)
604		(lookup-integer utf-subst-table-for-decode r3 r1)
605		(if r7
606		    ;; got a translation
607		    ((write-multibyte-character r3 r1)
608		     (read r0)
609		     (repeat))
610		  ((call ccl-mule-utf-untrans)
611		   (r6 = ,(charset-id 'latin-iso8859-1))
612		   (read r0)
613		   (repeat)))))
614
615	   ;; mule-unicode-e000-ffff
616	   ;; Fixme: fffe and ffff are invalid.
617	   (r0 = r3)		; don't zap r3
618	   (lookup-integer utf-subst-table-for-decode r0 r1)
619	   (if (r7 == 0)
620	       ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
621		(r3 -= #xe000)
622		(r3 //= 96)
623		(r1 = (r7 + 32))
624		(r1 += ((r3 + 32) << 7))))
625	   (write-multibyte-character r0 r1)
626	   (read r0)
627	   (repeat)))
628
629      ;; Read the 4th bytes.
630      (read r3)
631      (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
632	  ((call ccl-mule-utf-untrans)
633	   (r0 = r1)
634	   (call ccl-mule-utf-untrans)
635	   (r0 = r2)
636	   (call ccl-mule-utf-untrans)
637	   (r6 = ,(charset-id 'latin-iso8859-1))
638	   ;; Handle it in the next loop.
639	   (r0 = r3)
640	   (repeat)))
641
642      (if (r0 < #xF8)
643	  ;; 4-byte encoding:
644	  ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
645	  ;; keep those bytes as eight-bit-{control|graphic}
646	  ;; Fixme: allow lookup in utf-subst-table-for-decode.
647	  ((r4 = ((r0 & #x7) << 18))
648	   (r4 |= ((r1 & #x3F) << 12))
649	   (r4 |= ((r2 & #x3F) << 6))
650	   (r4 |= (r3 & #x3F))
651
652	   (if (r4 < #x10000)		; `overlong sequence'
653	       ((call ccl-mule-utf-untrans)
654		(r0 = r1)
655		(call ccl-mule-utf-untrans)
656		(r0 = r2)
657		(call ccl-mule-utf-untrans)
658		(r0 = r3)
659		(call ccl-mule-utf-untrans))
660	     ((r0 = r4)
661	      (call ccl-mule-utf-untrans))))
662
663	;; Unsupported sequence.
664	((call ccl-mule-utf-untrans)
665	 (r0 = r1)
666	 (call ccl-mule-utf-untrans)
667	 (r0 = r2)
668	 (call ccl-mule-utf-untrans)
669	 (r0 = r3)
670	 (call ccl-mule-utf-untrans)))
671      (r6 = ,(charset-id 'latin-iso8859-1))
672      (read r0)
673      (repeat)))
674
675
676    ;; At EOF...
677    (if (r0 >= 0)
678	;; r0 >= #x80
679	((call ccl-mule-utf-untrans)
680	 (if (r1 >= 0)
681	     ((r0 = r1)
682	      (call ccl-mule-utf-untrans)
683	      (if (r2 >= 0)
684		  ((r0 = r2)
685		   (call ccl-mule-utf-untrans)
686		   (if (r3 >= 0)
687		       ((r0 = r3)
688			(call ccl-mule-utf-untrans))))))))))
689
690  "CCL program to decode UTF-8.
691Basic decoding is done into the charsets ascii, latin-iso8859-1 and
692mule-unicode-*, but see also `utf-fragmentation-table' and
693`ucs-mule-cjk-to-unicode'.
694Encodings of un-representable Unicode characters are decoded asis into
695eight-bit-control and eight-bit-graphic characters.")
696
697(define-ccl-program ccl-mule-utf-8-encode-untrans
698  ;; UTF-8 decoder generates an UTF-8 sequence represented by a
699  ;; sequence eight-bit-control/graphic chars for an untranslatable
700  ;; character and an invalid byte.
701  ;;
702  ;; This CCL parses that sequence (the first byte is already in r1),
703  ;; writes out the original bytes of that sequence, and sets r5 to
704  ;; -1.
705  ;;
706  ;; If the eight-bit-control/graphic sequence is shorter than what r1
707  ;; suggests, it sets r5 and r6 to the last character read that
708  ;; should be handled by the next loop of a caller.
709  ;;
710  ;; Note: For UTF-8 validation, we only check if a character is
711  ;; eight-bit-control/graphic or not.  It may result in incorrect
712  ;; handling of random binary data, but such a data can't be encoded
713  ;; by UTF-8 anyway.  At least, UTF-8 decoders doesn't generate such
714  ;; a sequence even if a source contains invalid byte-sequence.
715  `(0
716    (;; Read the 2nd byte.
717     (read-multibyte-character r5 r6)
718     (r0 = (r5 != ,(charset-id 'eight-bit-control)))
719     (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
720	 ((write r1)			; invalid UTF-8
721	  (r1 = -1)
722	  (end)))
723
724     (if (r1 <= #xC3)
725	 ;; 2-byte sequence for an originally invalid byte.
726	 ((r6 &= #x3F)
727	  (r6 |= ((r1 & #x1F) << 6))
728	  (write r6)
729	  (r5 = -1)
730	  (end)))
731
732     (write r1 r6)
733     (r2 = r1)
734     (r1 = -1)
735     ;; Read the 3rd byte.
736     (read-multibyte-character r5 r6)
737     (r0 = (r5 != ,(charset-id 'eight-bit-control)))
738     (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
739	 (end))				; invalid UTF-8
740     (write r6)
741     (if (r2 < #xF0)
742	 ;; 3-byte sequence for an untranslated character.
743	 ((r5 = -1)
744	  (end)))
745     ;; Read the 4th byte.
746     (read-multibyte-character r5 r6)
747     (r0 = (r5 != ,(charset-id 'eight-bit-control)))
748     (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
749	 (end))			; invalid UTF-8
750     ;; 4-byte sequence for an untranslated character.
751     (write r6)
752     (r5 = -1)
753     (end))
754
755    ;; At EOF...
756    ((r5 = -1)
757     (if (r1 >= 0)
758	 (write r1)))))
759
760(define-ccl-program ccl-encode-mule-utf-8
761  `(1
762    ((r5 = -1)
763     (loop
764      (if (r5 < 0)
765	  (read-multibyte-character r0 r1)
766	;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
767	((r0 = r5)
768	 (r1 = r6)
769	 (r5 = -1)))
770      (translate-character utf-translation-table-for-encode r0 r1)
771
772      (if (r0 == ,(charset-id 'ascii))
773	  (write-repeat r1))
774
775      (if (r0 == ,(charset-id 'latin-iso8859-1))
776	  ;; r1          scalar                  utf-8
777	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
778	  ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
779	  ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
780	  ((write ((r1 >> 6) | #xc2))
781	   (r1 &= #x3f)
782	   (r1 |= #x80)
783	   (write-repeat r1)))
784
785      (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
786	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
787	   ;; #x3f80 == (0011 1111 1000 0000)b
788	   (r1 &= #x7f)
789	   (r1 += (r0 + 224))		; 240 == -32 + #x0100
790	   ;; now r1 holds scalar value
791	   (if (r1 < #x0800)
792	       ;; 2byte encoding
793	       ((write ((r1 >> 6) | #xC0))
794		(r1 &= #x3F)
795		(r1 |= #x80)
796		(write-repeat r1))
797	     ;; 3byte encoding
798	     ((write ((r1 >> 12) | #xE0))
799	      (write  (((r1 & #x0FC0) >> 6) | #x80))
800	      (r1 &= #x3F)
801	      (r1 |= #x80)
802	      (write-repeat r1)))))
803
804      (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
805	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
806	   (r1 &= #x7f)
807	   (r1 += (r0 + 9440))		; 9440 == -32 + #x2500
808	   ;; now r1 holds scalar value
809	   (write ((r1 >> 12) | #xE0))
810	   (write  (((r1 & #x0FC0) >> 6) | #x80))
811	   (r1 &= #x3F)
812	   (r1 |= #x80)
813	   (write-repeat r1)))
814
815      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
816	  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
817	   (r1 &= #x7f)
818	   (r1 += (r0 + 57312))		; 57312 == -32 + #xe000
819	   ;; now r1 holds scalar value
820	   (write ((r1 >> 12) | #xE0))
821	   (write  (((r1 & #x0FC0) >> 6) | #x80))
822	   (r1 &= #x3F)
823	   (r1 |= #x80)
824	   (write-repeat r1)))
825
826      (if (r0 == ,(charset-id 'eight-bit-control))
827	  ;; r1          scalar                  utf-8
828	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
829	  ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
830	  ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
831	  ((write #xC2)
832	   (write-repeat r1)))
833
834      (if (r0 == ,(charset-id 'eight-bit-graphic))
835	  ;; r1          scalar                  utf-8
836	  ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
837	  ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
838	  ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
839	  ((r0 = (r1 >= #xC0))
840	   (r0 &= (r1 <= #xC3))
841	   (r4 = (r1 >= #xE1))
842	   (r4 &= (r1 <= #xF7))
843	   (r0 |= r4)
844	   (if r0
845	       ((call ccl-mule-utf-8-encode-untrans)
846		(repeat))
847	     (write-repeat r1))))
848
849      (lookup-character utf-subst-table-for-encode r0 r1)
850      (if r7		; lookup succeeded
851	  (if (r0 < #x800)
852	      ;; 2byte encoding
853	      ((write ((r0 >> 6) | #xC0))
854	       (r0 = ((r0 & #x3F) | #x80))
855	       (write-repeat r0))
856	    ;; 3byte encoding
857	    ((write ((r0 >> 12) | #xE0))
858	     (write  (((r0 & #x0FC0) >> 6) | #x80))
859	     (r0 = ((r0 & #x3F) | #x80))
860	     (write-repeat r0))))
861
862      ;; Unsupported character.
863      ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
864      (write #xef)
865      (write #xbf)
866      (write-repeat #xbd))))
867  "CCL program to encode into UTF-8.")
868
869
870(define-ccl-program ccl-untranslated-to-ucs
871  `(0
872    (if (r1 == 0)
873	nil
874      (if (r0 <= #xC3)			; 2-byte encoding
875	  ((r0 = ((r0 & #x3) << 6))
876	   (r0 |= (r1 & #x3F))
877	   (r1 = 2))
878	(if (r2 == 0)
879	    (r1 = 0)
880	  (if (r0 < #xF0)		; 3-byte encoding, as above
881	      ((r0 = ((r0 & #xF) << 12))
882	       (r0 |= ((r1 & #x3F) << 6))
883	       (r0 |= (r2 & #x3F))
884	       (r1 = 3))
885	    (if (r3 == 0)
886		(r1 = 0)
887	      ((r0 = ((r0 & #x7) << 18))
888	       (r0 |= ((r1 & #x3F) << 12))
889	       (r0 |= ((r2 & #x3F) << 6))
890	       (r0 |= (r3 & #x3F))
891	       (r1 = 4))))))))
892  "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
893Set r1 to the byte length.  r0 == 0 for invalid sequence.")
894
895(defvar utf-8-ccl-regs (make-vector 8 0))
896
897(defsubst utf-8-untranslated-to-ucs ()
898  "Return the UCS code for an untranslated sequence of raw bytes t point.
899Only for 3- or 4-byte sequences."
900  (aset utf-8-ccl-regs 0 (or (char-after) 0))
901  (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
902  (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
903  (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
904  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
905
906(defun utf-8-help-echo (window object position)
907  (format "Untranslated Unicode U+%04X"
908	  (get-char-property position 'untranslated-utf-8 object)))
909
910;; We compose the untranslatable sequences into a single character,
911;; and move point to the next character.
912;; This is infelicitous for editing, because there's currently no
913;; mechanism for treating compositions as atomic, but is OK for
914;; display.  They are composed to U+FFFD with help-echo which
915;; indicates the unicodes they represent.  This function GCs too much.
916
917;; If utf-translate-cjk-mode is non-nil, this function is called with
918;; HASH-TABLE which translates CJK characters into some of CJK
919;; charsets.
920
921(defsubst utf-8-compose (hash-table)
922  "Put a suitable composition on an untranslatable sequence at point.
923If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
924Move point to the end of the sequence."
925  (utf-8-untranslated-to-ucs)
926  (let ((l (aref utf-8-ccl-regs 1))
927	ch)
928    (if (> l 0)
929	(if (and hash-table
930		 (setq ch (gethash (aref utf-8-ccl-regs 0)  hash-table)))
931	    (progn
932	      (insert ch)
933	      (delete-region (point) (min (point-max) (+ l (point)))))
934	  (setq ch (aref utf-8-ccl-regs 0))
935	  (put-text-property (point) (min (point-max) (+ l (point)))
936			     'untranslated-utf-8 ch)
937	  (put-text-property (point) (min (point-max) (+ l (point)))
938			     'help-echo 'utf-8-help-echo)
939	  (if (= l 2)
940	      (put-text-property (point) (min (point-max) (+ l (point)))
941				 'display (propertize (format "\\%03o" ch)
942						      'face 'escape-glyph))
943	    (compose-region (point) (+ l (point)) ?$,3u=(B))
944	  (forward-char l))
945      (forward-char 1))))
946
947(defcustom utf-8-compose-scripts nil
948  "*Non-nil means compose various scripts on decoding utf-8 text."
949  :group 'mule
950  :version "22.1"
951  :type 'boolean)
952
953(defun utf-8-post-read-conversion (length)
954  "Compose untranslated utf-8 sequences into single characters.
955If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
956Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
957  (save-excursion
958    (save-restriction
959      (narrow-to-region (point) (+ (point) length))
960      ;; Can't do eval-when-compile to insert a multibyte constant
961      ;; version of the string in the loop, since it's always loaded as
962      ;; unibyte from a byte-compiled file.
963      (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
964	    (buffer-multibyte enable-multibyte-characters)
965	    hash-table ch)
966	(set-buffer-multibyte t)
967	(when utf-translate-cjk-mode
968	  (unless utf-translate-cjk-lang-env
969	    ;; Check these characters in utf-translate-cjk-range.
970	    ;; We may have to translate them to CJK charsets.
971	    (skip-chars-forward
972	     (concat range utf-translate-cjk-unicode-range-string))
973	    (unless (eobp)
974	      (utf-translate-cjk-load-tables)
975	      (setq range
976		    (concat range utf-translate-cjk-unicode-range-string)))
977	    (setq hash-table (get 'utf-subst-table-for-decode
978				  'translation-hash-table))))
979	(while (and (skip-chars-forward range)
980		    (not (eobp)))
981	  (setq ch (following-char))
982	  (if (< ch 256)
983	      (utf-8-compose hash-table)
984	    (if (and hash-table
985		     (setq ch (gethash (encode-char ch 'ucs) hash-table)))
986		(progn
987		  (insert ch)
988		  (delete-char 1))
989	      (forward-char 1))))
990	(or buffer-multibyte
991	    (set-buffer-multibyte nil)))
992
993      (when (and utf-8-compose-scripts (> length 1))
994	;; These currently have definitions which cover the relevant
995	;; unicodes.  We could avoid loading thai-util &c by checking
996	;; whether the region contains any characters with the appropriate
997	;; categories.  There aren't yet Unicode-based rules for Tibetan.
998	(diacritic-compose-region (point-max) (point-min))
999	(thai-compose-region (point-max) (point-min))
1000	(lao-compose-region (point-max) (point-min))
1001	(devanagari-compose-region (point-max) (point-min))
1002	(malayalam-compose-region (point-max) (point-min))
1003	(tamil-compose-region (point-max) (point-min)))
1004      (- (point-max) (point-min)))))
1005
1006(defun utf-8-pre-write-conversion (beg end)
1007  "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
1008This is used as a post-read-conversion of utf-8 coding system."
1009  (if (and utf-translate-cjk-mode
1010	   (not utf-translate-cjk-lang-env)
1011	   (if (stringp beg)
1012	       (string-match "\\cc\\|\\cj\\|\\ch" beg)
1013	     (save-excursion
1014	       (goto-char beg)
1015	       (re-search-forward "\\cc\\|\\cj\\|\\ch" end t))))
1016      (utf-translate-cjk-load-tables))
1017  nil)
1018
1019(make-coding-system
1020 'mule-utf-8 4 ?u
1021 "UTF-8 encoding for Emacs-supported Unicode characters.
1022It supports Unicode characters of these ranges:
1023    U+0000..U+33FF, U+E000..U+FFFF.
1024They correspond to these Emacs character sets:
1025    ascii, latin-iso8859-1, mule-unicode-0100-24ff,
1026    mule-unicode-2500-33ff, mule-unicode-e000-ffff
1027
1028On decoding (e.g. reading a file), Unicode characters not in the above
1029ranges are decoded into sequences of eight-bit-control and
1030eight-bit-graphic characters to preserve their byte sequences.  The
1031byte sequence is preserved on i/o for valid utf-8, but not necessarily
1032for invalid utf-8.
1033
1034On encoding (e.g. writing a file), Emacs characters not belonging to
1035any of the character sets listed above are encoded into the UTF-8 byte
1036sequence representing U+FFFD (REPLACEMENT CHARACTER)."
1037
1038 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
1039 `((safe-charsets
1040    ascii
1041    eight-bit-control
1042    eight-bit-graphic
1043    latin-iso8859-1
1044    mule-unicode-0100-24ff
1045    mule-unicode-2500-33ff
1046    mule-unicode-e000-ffff
1047    ,@(if utf-translate-cjk-mode
1048	  utf-translate-cjk-charsets))
1049   (mime-charset . utf-8)
1050   (coding-category . coding-category-utf-8)
1051   (valid-codes (0 . 255))
1052   (pre-write-conversion . utf-8-pre-write-conversion)
1053   (post-read-conversion . utf-8-post-read-conversion)
1054   (translation-table-for-encode . utf-translation-table-for-encode)
1055   (dependency unify-8859-on-encoding-mode
1056	       unify-8859-on-decoding-mode
1057	       utf-fragment-on-decoding
1058	       utf-translate-cjk-mode)))
1059
1060(define-coding-system-alias 'utf-8 'mule-utf-8)
1061
1062;; I think this needs special private charsets defined for the
1063;; untranslated sequences, if it's going to work well.
1064
1065;;; (defun utf-8-compose-function (pos to pattern &optional string)
1066;;;   (let* ((prop (get-char-property pos 'composition string))
1067;;; 	 (l (and prop (- (cadr prop) (car prop)))))
1068;;;     (cond ((and l (> l (- to pos)))
1069;;; 	   (delete-region pos to))
1070;;; 	  ((and (> (char-after pos) 224)
1071;;; 		(< (char-after pos) 256)
1072;;; 		(save-restriction
1073;;; 		  (narrow-to-region pos to)
1074;;; 		  (utf-8-compose)))
1075;;; 	   t))))
1076
1077;;; (dotimes (i 96)
1078;;;   (aset composition-function-table
1079;;; 	(+ 128 i)
1080;;; 	`((,(string-as-multibyte "[\200-\237\240-\377]")
1081;;; 	   . utf-8-compose-function))))
1082
1083;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
1084;;; utf-8.el ends here
1085