1;;; tibet-util.el --- utilities for Tibetan -*- coding: iso-2022-7bit; -*- 2 3;; Copyright (C) 1997, 2001, 2002, 2003, 2004, 2005, 2006, 2007 4;; Free Software Foundation, Inc. 5;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 6;; 2005, 2006, 2007 7;; National Institute of Advanced Industrial Science and Technology (AIST) 8;; Registration Number H14PRO021 9 10;; Keywords: multilingual, Tibetan 11 12;; This file is part of GNU Emacs. 13 14;; GNU Emacs is free software; you can redistribute it and/or modify 15;; it under the terms of the GNU General Public License as published by 16;; the Free Software Foundation; either version 2, or (at your option) 17;; any later version. 18 19;; GNU Emacs is distributed in the hope that it will be useful, 20;; but WITHOUT ANY WARRANTY; without even the implied warranty of 21;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22;; GNU General Public License for more details. 23 24;; You should have received a copy of the GNU General Public License 25;; along with GNU Emacs; see the file COPYING. If not, write to the 26;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 27;; Boston, MA 02110-1301, USA. 28 29;; Author: Toru TOMABECHI, <Toru.Tomabechi@orient.unil.ch> 30 31;; Created: Feb. 17. 1997 32 33;;; History: 34;; 1997.03.13 Modification in treatment of text properties; 35;; Support for some special signs and punctuations. 36;; 1999.10.25 Modification for a new composition way by K.Handa. 37 38;;; Commentary: 39 40;;; Code: 41 42(defconst tibetan-obsolete-glyphs 43 `(("$(7!=(B" . "$(8!=(B") ; 2 col <-> 1 col 44 ("$(7!?(B" . "$(8!?(B") 45 ("$(7!@(B" . "$(8!@(B") 46 ("$(7!A(B" . "$(8!A(B") 47 ("$(7"`(B" . "$(8"`(B") 48 ("$(7!;(B" . "$(8!;(B") 49 ("$(7!D(B" . "$(8!D(B") 50 ;; Yes these are dirty. But ... 51 ("$(7!>(B $(7!>(B" . ,(compose-string "$(7!>(B $(7!>(B" 0 3 [?$(7!>(B (Br . Bl) ? (Br . Bl) ?$(7!>(B])) 52 ("$(7!4!5!5(B" . ,(compose-string 53 "$(7#R#S#S#S(B" 0 4 54 [?$(7#R(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B])) 55 ("$(7!4!5(B" . ,(compose-string "$(7#R#S#S(B" 0 3 [?$(7#R(B (Br . Bl) ?$(7#S(B (Br . Bl) ?$(7#S(B])) 56 ("$(7!6(B" . ,(compose-string "$(7#R#S!I(B" 0 3 [?$(7#R(B (Br . Bl) ?$(7#S(B (br . tr) ?$(7!I(B])) 57 ("$(7!4(B" . ,(compose-string "$(7#R#S(B" 0 2 [?$(7#R(B (Br . Bl) ?$(7#S(B])))) 58 59;;;###autoload 60(defun tibetan-char-p (ch) 61 "Check if char CH is Tibetan character. 62Returns non-nil if CH is Tibetan. Otherwise, returns nil." 63 (memq (char-charset ch) '(tibetan tibetan-1-column))) 64 65;;; Functions for Tibetan <-> Tibetan-transcription. 66 67;;;###autoload 68(defun tibetan-tibetan-to-transcription (str) 69 "Transcribe Tibetan string STR and return the corresponding Roman string." 70 (let (;; Accumulate transcriptions here in reverse order. 71 (trans nil) 72 (len (length str)) 73 (i 0) 74 ch this-trans) 75 (while (< i len) 76 (let ((idx (string-match tibetan-precomposition-rule-regexp str i))) 77 (if (eq idx i) 78 ;; Ith character and the followings matches precomposable 79 ;; Tibetan sequence. 80 (setq i (match-end 0) 81 this-trans 82 (car (rassoc 83 (cdr (assoc (match-string 0 str) 84 tibetan-precomposition-rule-alist)) 85 tibetan-precomposed-transcription-alist))) 86 (setq ch (substring str i (1+ i)) 87 i (1+ i) 88 this-trans 89 (car (or (rassoc ch tibetan-consonant-transcription-alist) 90 (rassoc ch tibetan-vowel-transcription-alist) 91 (rassoc ch tibetan-subjoined-transcription-alist))))) 92 (setq trans (cons this-trans trans)))) 93 (apply 'concat (nreverse trans)))) 94 95;;;###autoload 96(defun tibetan-transcription-to-tibetan (str) 97 "Convert Tibetan Roman string STR to Tibetan character string. 98The returned string has no composition information." 99 (let (;; Case is significant. 100 (case-fold-search nil) 101 (idx 0) 102 ;; Accumulate Tibetan strings here in reverse order. 103 (t-str-list nil) 104 i subtrans) 105 (while (setq i (string-match tibetan-regexp str idx)) 106 (if (< idx i) 107 ;; STR contains a pattern that doesn't match Tibetan 108 ;; transcription. Include the pattern as is. 109 (setq t-str-list (cons (substring str idx i) t-str-list))) 110 (setq subtrans (match-string 0 str) 111 idx (match-end 0)) 112 (let ((t-char (cdr (assoc subtrans 113 tibetan-precomposed-transcription-alist)))) 114 (if t-char 115 ;; SUBTRANS corresponds to a transcription for 116 ;; precomposable Tibetan sequence. 117 (setq t-char (car (rassoc t-char 118 tibetan-precomposition-rule-alist))) 119 (setq t-char 120 (cdr 121 (or (assoc subtrans tibetan-consonant-transcription-alist) 122 (assoc subtrans tibetan-vowel-transcription-alist) 123 (assoc subtrans tibetan-modifier-transcription-alist) 124 (assoc subtrans tibetan-subjoined-transcription-alist))))) 125 (setq t-str-list (cons t-char t-str-list)))) 126 (if (< idx (length str)) 127 (setq t-str-list (cons (substring str idx) t-str-list))) 128 (apply 'concat (nreverse t-str-list)))) 129 130;;; 131;;; Functions for composing/decomposing Tibetan sequence. 132;;; 133;;; A Tibetan syllable is typically structured as follows: 134;;; 135;;; [Prefix] C [C+] V [M] [Suffix [Post suffix]] 136;;; 137;;; where C's are all vertically stacked, V appears below or above 138;;; consonant cluster and M is always put above the C[C+]V combination. 139;;; (Sanskrit visarga, though it is a vowel modifier, is considered 140;;; to be a punctuation.) 141;;; 142;;; Here are examples of the words "bsgrubs" and "hfauM" 143;;; 144;;; 4$(7"70"714%qx!"U0"G###C"U14"70"714"G0"G1(B 4$(7"Hx!"Rx!"Ur'"_0"H"R"U"_1(B 145;;; 146;;; M 147;;; b s b s h 148;;; g fa 149;;; r u 150;;; u 151;;; 152;;; Consonants `'' ($(7"A(B), `w' ($(7">(B), `y' ($(7"B(B), `r' ($(7"C(B) take special 153;;; forms when they are used as subjoined consonant. Consonant `r' 154;;; takes another special form when used as superjoined in such a case 155;;; as "rka", while it does not change its form when conjoined with 156;;; subjoined `'', `w' or `y' as in "rwa", "rya". 157 158;; Append a proper composition rule and glyph to COMPONENTS to compose 159;; CHAR with a composition that has COMPONENTS. 160 161(defun tibetan-add-components (components char) 162 (let ((last (last components)) 163 (stack-upper '(tc . bc)) 164 (stack-under '(bc . tc)) 165 rule comp-vowel tmp) 166 ;; Special treatment for 'a chung. 167 ;; If 'a follows a consonant, turn it into the subjoined form. 168 ;; * Disabled by Tomabechi 2000/06/09 * 169 ;; Because in Unicode, $(7"A(B may follow directly a consonant without 170 ;; any intervening vowel, as in 4$(7"90"914""0"""Q14"A0"A1!;(B=4$(7"90"91(B 4$(7""0""1(B 4$(7"A0"A1(B not 4$(7"90"91(B 4$(7""0""1(B $(7"Q(B 4$(7"A0"A1(B 171 ;;(if (and (= char ?$(7"A(B) 172 ;; (aref (char-category-set (car last)) ?0)) 173 ;; (setq char ?$(7"R(B)) ;; modified for new font by Tomabechi 1999/12/10 174 175 ;; Composite vowel signs are decomposed before being added 176 ;; Added by Tomabechi 2000/06/08 177 (if (memq char '(?$(7"T(B ?$(7"V(B ?$(7"W(B ?$(7"X(B ?$(7"Y(B ?$(7"Z(B ?$(7"b(B)) 178 (setq comp-vowel 179 (copy-sequence 180 (cddr (assoc (char-to-string char) 181 tibetan-composite-vowel-alist))) 182 char 183 (cadr (assoc (char-to-string char) 184 tibetan-composite-vowel-alist)))) 185 (cond 186 ;; Compose upper vowel sign vertically over. 187 ((aref (char-category-set char) ?2) 188 (setq rule stack-upper)) 189 190 ;; Compose lower vowel sign vertically under. 191 ((aref (char-category-set char) ?3) 192 (if (eq char ?$(7"Q(B) ;; `$(7"Q(B' should not visible when composed. 193 (setq rule nil) 194 (setq rule stack-under))) 195 ;; Transform ra-mgo (superscribed r) if followed by a subjoined 196 ;; consonant other than w, ', y, r. 197 ((and (= (car last) ?$(7"C(B) 198 (not (memq char '(?$(7#>(B ?$(7"R(B ?$(7#B(B ?$(7#C(B)))) 199 (setcar last ?$(7!"(B) ;; modified for newfont by Tomabechi 1999/12/10 200 (setq rule stack-under)) 201 ;; Transform initial base consonant if followed by a subjoined 202 ;; consonant but 'a. 203 (t 204 (let ((laststr (char-to-string (car last)))) 205 (if (and (/= char ?$(7"R(B) ;; modified for new font by Tomabechi 206 (string-match "[$(7"!(B-$(7"="?"@"D(B-$(7"J"K(B]" laststr)) 207 (setcar last (string-to-char 208 (cdr (assoc (char-to-string (car last)) 209 tibetan-base-to-subjoined-alist))))) 210 (setq rule stack-under)))) 211 212 (if rule 213 (setcdr last (list rule char))) 214 ;; Added by Tomabechi 2000/06/08 215 (if comp-vowel 216 (nconc last comp-vowel)) 217 )) 218 219;;;###autoload 220(defun tibetan-compose-string (str) 221 "Compose Tibetan string STR." 222 (let ((idx 0)) 223 ;; `$(7"A(B' is included in the pattern for subjoined consonants 224 ;; because we treat it specially in tibetan-add-components. 225 ;; (This feature is removed by Tomabechi 2000/06/08) 226 (while (setq idx (string-match tibetan-composable-pattern str idx)) 227 (let ((from idx) 228 (to (match-end 0)) 229 components) 230 (if (eq (string-match tibetan-precomposition-rule-regexp str idx) idx) 231 (setq idx (match-end 0) 232 components 233 (list (string-to-char 234 (cdr 235 (assoc (match-string 0 str) 236 tibetan-precomposition-rule-alist))))) 237 (setq components (list (aref str idx)) 238 idx (1+ idx))) 239 (while (< idx to) 240 (tibetan-add-components components (aref str idx)) 241 (setq idx (1+ idx))) 242 (compose-string str from to components)))) 243 str) 244 245;;;###autoload 246(defun tibetan-compose-region (beg end) 247 "Compose Tibetan text the region BEG and END." 248 (interactive "r") 249 (let (str result chars) 250 (save-excursion 251 (save-restriction 252 (narrow-to-region beg end) 253 (goto-char (point-min)) 254 ;; `$(7"A(B' is included in the pattern for subjoined consonants 255 ;; because we treat it specially in tibetan-add-components. 256 ;; (This feature is removed by Tomabechi 2000/06/08) 257 (while (re-search-forward tibetan-composable-pattern nil t) 258 (let ((from (match-beginning 0)) 259 (to (match-end 0)) 260 components) 261 (goto-char from) 262 (if (looking-at tibetan-precomposition-rule-regexp) 263 (progn 264 (setq components 265 (list (string-to-char 266 (cdr 267 (assoc (match-string 0) 268 tibetan-precomposition-rule-alist))))) 269 (goto-char (match-end 0))) 270 (setq components (list (char-after from))) 271 (forward-char 1)) 272 (while (< (point) to) 273 (tibetan-add-components components (following-char)) 274 (forward-char 1)) 275 (compose-region from to components))))))) 276 277(defvar tibetan-decompose-precomposition-alist 278 (mapcar (function (lambda (x) (cons (string-to-char (cdr x)) (car x)))) 279 tibetan-precomposition-rule-alist)) 280 281;;;###autoload 282(defun tibetan-decompose-region (from to) 283 "Decompose Tibetan text in the region FROM and TO. 284This is different from decompose-region because precomposed Tibetan characters 285are decomposed into normal Tibetan character sequences." 286 (interactive "r") 287 (save-restriction 288 (narrow-to-region from to) 289 (decompose-region from to) 290 (goto-char from) 291 (while (not (eobp)) 292 (let* ((char (following-char)) 293 (slot (assq char tibetan-decompose-precomposition-alist))) 294 (if slot 295 (progn 296 (delete-char 1) 297 (insert (cdr slot))) 298 (forward-char 1)))))) 299 300 301;;;###autoload 302(defun tibetan-decompose-string (str) 303 "Decompose Tibetan string STR. 304This is different from decompose-string because precomposed Tibetan characters 305are decomposed into normal Tibetan character sequences." 306 (let ((new "") 307 (len (length str)) 308 (idx 0) 309 char slot) 310 (while (< idx len) 311 (setq char (aref str idx) 312 slot (assq (aref str idx) tibetan-decompose-precomposition-alist) 313 new (concat new (if slot (cdr slot) (char-to-string char))) 314 idx (1+ idx))) 315 new)) 316 317;;;###autoload 318(defun tibetan-composition-function (from to pattern &optional string) 319 (if string 320 (tibetan-compose-string string) 321 (tibetan-compose-region from to)) 322 (- to from)) 323 324;;; 325;;; This variable is used to avoid repeated decomposition. 326;;; 327(setq-default tibetan-decomposed nil) 328 329;;;###autoload 330(defun tibetan-decompose-buffer () 331 "Decomposes Tibetan characters in the buffer into their components. 332See also the documentation of the function `tibetan-decompose-region'." 333 (interactive) 334 (make-local-variable 'tibetan-decomposed) 335 (cond ((not tibetan-decomposed) 336 (tibetan-decompose-region (point-min) (point-max)) 337 (setq tibetan-decomposed t)))) 338 339;;;###autoload 340(defun tibetan-compose-buffer () 341 "Composes Tibetan character components in the buffer. 342See also docstring of the function tibetan-compose-region." 343 (interactive) 344 (make-local-variable 'tibetan-decomposed) 345 (tibetan-compose-region (point-min) (point-max)) 346 (setq tibetan-decomposed nil)) 347 348;;;###autoload 349(defun tibetan-post-read-conversion (len) 350 (save-excursion 351 (save-restriction 352 (let ((buffer-modified-p (buffer-modified-p))) 353 (narrow-to-region (point) (+ (point) len)) 354 (tibetan-compose-region (point-min) (point-max)) 355 (set-buffer-modified-p buffer-modified-p) 356 (make-local-variable 'tibetan-decomposed) 357 (setq tibetan-decomposed nil) 358 (- (point-max) (point-min)))))) 359 360 361;;;###autoload 362(defun tibetan-pre-write-conversion (from to) 363 (setq tibetan-decomposed-temp tibetan-decomposed) 364 (let ((old-buf (current-buffer))) 365 (set-buffer (generate-new-buffer " *temp*")) 366 (if (stringp from) 367 (insert from) 368 (insert-buffer-substring old-buf from to)) 369 (if (not tibetan-decomposed-temp) 370 (tibetan-decompose-region (point-min) (point-max))) 371 ;; Should return nil as annotations. 372 nil)) 373 374 375;;; 376;;; Unicode-related definitions. 377;;; 378 379(defvar tibetan-canonicalize-for-unicode-alist 380 '(("$(7"Q(B" . "") ;; remove vowel a 381 ("$(7"T(B" . "$(7"R"S(B") ;; decompose vowels whose use is ``discouraged'' in Unicode 3.0 382 ("$(7"V(B" . "$(7"R"U(B") 383 ("$(7"W(B" . "$(7#C"a(B") 384 ("$(7"X(B" . "$(7#C"R"a(B") 385 ("$(7"Y(B" . "$(7#D"a(B") 386 ("$(7"Z(B" . "$(7#D"R"a(B") 387 ("$(7"b(B" . "$(7"R"a(B")) 388 "Rules for canonicalizing Tibetan vowels for Unicode.") 389 390(defvar tibetan-canonicalize-for-unicode-regexp 391 "[$(7"Q"T"V"W"X"Y"Z"b(B]" 392 "Regexp for Tibetan vowels to be canonicalized in Unicode.") 393 394(defun tibetan-canonicalize-for-unicode-region (from to) 395 (save-restriction 396 (narrow-to-region from to) 397 (goto-char from) 398 (while (re-search-forward tibetan-canonicalize-for-unicode-regexp nil t) 399 (let ( 400 ;;(from (match-beginning 0)) 401 ;;(to (match-end 0)) 402 (canonical-form 403 (cdr (assoc (match-string 0) 404 tibetan-canonicalize-for-unicode-alist)))) 405 ;;(goto-char from) 406 ;;(delete-region from to) 407 ;;(insert canonical-form) 408 (replace-match canonical-form) 409 )))) 410 411(defvar tibetan-strict-unicode t 412 "*Flag to control Tibetan canonicalizing for Unicode. 413 414If non-nil, the vowel a is removed and composite vowels are decomposed 415before writing buffer in Unicode. See also 416`tibetan-canonicalize-for-unicode-regexp' and 417`tibetan-canonicalize-for-unicode-alist'.") 418 419;;;###autoload 420(defun tibetan-pre-write-canonicalize-for-unicode (from to) 421 (let ((old-buf (current-buffer)) 422 (strict-unicode tibetan-strict-unicode)) 423 (set-buffer (generate-new-buffer " *temp*")) 424 (if (stringp from) 425 (insert from) 426 (insert-buffer-substring old-buf from to)) 427 (if strict-unicode 428 (tibetan-canonicalize-for-unicode-region (point-min) (point-max))) 429 ;; Should return nil as annotations. 430 nil)) 431 432(provide 'tibet-util) 433 434;;; arch-tag: 7a7333e8-1584-446c-b39c-a02b9def265d 435;;; tibet-util.el ends here 436