1;;; utf7.el --- UTF-7 encoding/decoding for Emacs   -*-coding: iso-8859-1;-*-
2
3;; Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004,
4;;   2005, 2006, 2007 Free Software Foundation, Inc.
5
6;; Author: Jon K Hellan <hellan@acm.org>
7;; Maintainer: bugs@gnus.org
8;; Keywords: mail
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING.  If not, write to
24;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25;; Boston, MA 02110-1301, USA.
26
27;;; Commentary:
28
29;; UTF-7 - A Mail-Safe Transformation Format of Unicode - RFC 2152
30;; This is a transformation format of Unicode that contains only 7-bit
31;; ASCII octets and is intended to be readable by humans in the limiting
32;; case that the document consists of characters from the US-ASCII
33;; repertoire.
34;; In short, runs of characters outside US-ASCII are encoded as base64
35;; inside delimiters.
36;; A variation of UTF-7 is specified in IMAP 4rev1 (RFC 2060) as the way
37;; to represent characters outside US-ASCII in mailbox names in IMAP.
38;; This library supports both variants, but the IMAP variation was the
39;; reason I wrote it.
40;; The routines convert UTF-7 -> UTF-16 (16 bit encoding of Unicode)
41;; -> current character set, and vice versa.
42;; However, until Emacs supports Unicode, the only Emacs character set
43;; supported here is ISO-8859.1, which can trivially be converted to/from
44;; Unicode.
45;; When decoding results in a character outside the Emacs character set,
46;; an error is thrown.  It is up to the application to recover.
47
48;; UTF-7 should be done by providing a coding system.  Mule-UCS does
49;; already, but I don't know if it does the IMAP version and it's not
50;; clear whether that should really be a coding system.  The UTF-16
51;; part of the conversion can be done with coding systems available
52;; with Mule-UCS or some versions of Emacs.  Unfortunately these were
53;; done wrongly (regarding handling of byte-order marks and how the
54;; variants were named), so we don't have a consistent name for the
55;; necessary coding system.  The code below doesn't seem to DTRT
56;; generally.  E.g.:
57;;
58;; (utf7-encode "a+�")
59;;   => "a+ACsAow-"
60;;
61;; $ echo "a+�"|iconv -f iso-8859-1 -t utf-7
62;; a+-+AKM
63;;
64;;  -- fx
65
66
67;;; Code:
68
69(require 'base64)
70(eval-when-compile (require 'cl))
71(require 'mm-util)
72
73(defconst utf7-direct-encoding-chars " -%'-*,-[]-}"
74  "Character ranges which do not need escaping in UTF-7.")
75
76(defconst utf7-imap-direct-encoding-chars
77  (concat utf7-direct-encoding-chars "+\\~")
78  "Character ranges which do not need escaping in the IMAP variant of UTF-7.")
79
80(defconst utf7-utf-16-coding-system
81  (cond ((mm-coding-system-p 'utf-16-be-no-signature) ; Mule-UCS
82	 'utf-16-be-no-signature)
83	((and (mm-coding-system-p 'utf-16-be) ; Emacs 21.3, Emacs 22
84	      ;; Avoid versions with BOM.
85	      (= 2 (length (encode-coding-string "a" 'utf-16-be))))
86	 'utf-16-be)
87	((mm-coding-system-p 'utf-16-be-nosig) ; ?
88	 'utf-16-be-nosig))
89  "Coding system which encodes big endian UTF-16 without a BOM signature.")
90
91(defsubst utf7-imap-get-pad-length (len modulus)
92  "Return required length of padding for IMAP modified base64 fragment."
93  (mod (- len) modulus))
94
95(defun utf7-encode-internal (&optional for-imap)
96  "Encode text in (temporary) buffer as UTF-7.
97Use IMAP modification if FOR-IMAP is non-nil."
98  (let ((start (point-min))
99	(end (point-max)))
100    (narrow-to-region start end)
101    (goto-char start)
102    (let* ((esc-char (if for-imap ?& ?+))
103	   (direct-encoding-chars
104	    (if for-imap utf7-imap-direct-encoding-chars
105	      utf7-direct-encoding-chars))
106	   (not-direct-encoding-chars (concat "^" direct-encoding-chars)))
107      (while (not (eobp))
108	(skip-chars-forward direct-encoding-chars)
109	(unless (eobp)
110	  (insert esc-char)
111	  (let ((p (point))
112		(fc (following-char))
113		(run-length
114		 (skip-chars-forward not-direct-encoding-chars)))
115	    (if (and (= fc esc-char)
116		     (= run-length 1))	; Lone esc-char?
117		(delete-backward-char 1) ; Now there's one too many
118	      (utf7-fragment-encode p (point) for-imap))
119	    (insert "-")))))))
120
121(defun utf7-fragment-encode (start end &optional for-imap)
122  "Encode text from START to END in buffer as UTF-7 escape fragment.
123Use IMAP modification if FOR-IMAP is non-nil."
124  (save-restriction
125    (narrow-to-region start end)
126    (funcall (utf7-get-u16char-converter 'to-utf-16))
127    (mm-with-unibyte-current-buffer
128      (base64-encode-region start (point-max)))
129    (goto-char start)
130    (let ((pm (point-max)))
131      (when for-imap
132	(while (search-forward "/" nil t)
133	  (replace-match ",")))
134      (skip-chars-forward "^= \t\n" pm)
135      (delete-region (point) pm))))
136
137(defun utf7-decode-internal (&optional for-imap)
138  "Decode UTF-7 text in (temporary) buffer.
139Use IMAP modification if FOR-IMAP is non-nil."
140  (let ((start (point-min))
141	(end (point-max)))
142    (goto-char start)
143    (let* ((esc-pattern (concat "^" (char-to-string (if for-imap ?& ?+))))
144	   (base64-chars (concat "A-Za-z0-9+"
145				 (char-to-string (if for-imap ?, ?/)))))
146      (while (not (eobp))
147	(skip-chars-forward esc-pattern)
148	(unless (eobp)
149	  (forward-char)
150	  (let ((p (point))
151		(run-length (skip-chars-forward base64-chars)))
152	    (when (and (not (eobp)) (= (following-char) ?-))
153	      (delete-char 1))
154	    (unless (= run-length 0)	; Encoded lone esc-char?
155	      (save-excursion
156		(utf7-fragment-decode p (point) for-imap)
157		(goto-char p)
158		(delete-backward-char 1)))))))))
159
160(defun utf7-fragment-decode (start end &optional for-imap)
161  "Decode base64 encoded fragment from START to END of UTF-7 text in buffer.
162Use IMAP modification if FOR-IMAP is non-nil."
163  (save-restriction
164    (narrow-to-region start end)
165    (when for-imap
166      (goto-char start)
167      (while (search-forward "," nil 'move-to-end) (replace-match "/")))
168    (let ((pl (utf7-imap-get-pad-length (- end start) 4)))
169      (insert (make-string pl ?=))
170      (base64-decode-region start (+ end pl)))
171    (funcall (utf7-get-u16char-converter 'from-utf-16))))
172
173(defun utf7-get-u16char-converter (which-way)
174  "Return a function to convert between UTF-16 and current character set."
175  (if utf7-utf-16-coding-system
176      (if (eq which-way 'to-utf-16)
177	  (lambda ()
178	    (encode-coding-region (point-min) (point-max)
179				  utf7-utf-16-coding-system))
180	(lambda ()
181	  (decode-coding-region (point-min) (point-max)
182				utf7-utf-16-coding-system)))
183    ;; Add test to check if we are really Latin-1.
184    (if (eq which-way 'to-utf-16)
185	'utf7-latin1-u16-char-converter
186      'utf7-u16-latin1-char-converter)))
187
188(defun utf7-latin1-u16-char-converter ()
189  "Convert latin 1 (ISO-8859.1) characters to 16 bit Unicode.
190Characters are converted to raw byte pairs in narrowed buffer."
191  (mm-encode-coding-region (point-min) (point-max) 'iso-8859-1)
192  (mm-disable-multibyte)
193  (goto-char (point-min))
194  (while (not (eobp))
195    (insert 0)
196    (forward-char)))
197
198(defun utf7-u16-latin1-char-converter ()
199  "Convert 16 bit Unicode characters to latin 1 (ISO-8859.1).
200Characters are in raw byte pairs in narrowed buffer."
201  (goto-char (point-min))
202  (while (not (eobp))
203    (if (= 0 (following-char))
204	(delete-char 1)
205	(error "Unable to convert from Unicode"))
206    (forward-char))
207  (mm-decode-coding-region (point-min) (point-max) 'iso-8859-1)
208  (mm-enable-multibyte))
209
210(defun utf7-encode (string &optional for-imap)
211  "Encode UTF-7 STRING.  Use IMAP modification if FOR-IMAP is non-nil."
212  (let ((default-enable-multibyte-characters t))
213    (with-temp-buffer
214      (insert string)
215      (utf7-encode-internal for-imap)
216      (buffer-string))))
217
218(defun utf7-decode (string &optional for-imap)
219  "Decode UTF-7 STRING.  Use IMAP modification if FOR-IMAP is non-nil."
220  (let ((default-enable-multibyte-characters nil))
221    (with-temp-buffer
222      (insert string)
223      (utf7-decode-internal for-imap)
224      (mm-enable-multibyte)
225      (buffer-string))))
226
227(provide 'utf7)
228
229;;; arch-tag: 96078b55-85c7-4161-aed2-932c24b282c7
230;;; utf7.el ends here
231