1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3 4# 5# Copyright 2015, NICTA 6# 7# This software may be distributed and modified according to the terms of 8# the BSD 2-Clause license. Note that NO WARRANTY is provided. 9# See "LICENSE_BSD2.txt" for details. 10# 11# @TAG(NICTA_BSD) 12# 13 14import codecs, collections, numbers, re, types 15from .exception import IsaSymbolsException 16 17class Symbol(object): 18 def __init__(self, ascii_text, code_point, group=None, font=None, 19 abbreviations=None): 20 assert isinstance(ascii_text, basestring) 21 assert isinstance(code_point, numbers.Number) 22 assert isinstance(group, (basestring, types.NoneType)) 23 assert isinstance(font, (basestring, types.NoneType)) 24 assert isinstance(abbreviations, (collections.Iterable, 25 types.NoneType)) 26 27 self.ascii_text = ascii_text 28 self.code_point = code_point 29 self.group = group 30 self.font = font 31 self.abbreviations = abbreviations or [] 32 33def _extract_property(prop): 34 # Values of the symbol fields can contain '���' which is intended to 35 # indicate a space. 36 return prop.replace(u'���', ' ') 37 38class Translator(object): 39 def __init__(self, symbols_text): 40 assert isinstance(symbols_text, basestring) 41 42 self.symbols = [] 43 44 for number, line in enumerate(x.strip() for x in 45 symbols_text.split('\n')): 46 47 if line.startswith('#'): 48 # Comment 49 continue 50 51 if line == '': 52 # Blank line 53 continue 54 55 items = line.split() 56 if len(items) < 3 or len(items) % 2 == 0: 57 raise IsaSymbolsException('%d: unexpected line format' % 58 number) 59 60 fields = {'ascii_text':items[0]} 61 62 for k, v in zip(*[iter(items[1:])] * 2): 63 64 if k == 'code:': 65 try: 66 code = int(_extract_property(v), 16) 67 except ValueError: 68 raise IsaSymbolsException('%d: invalid code field' % 69 number) 70 fields['code_point'] = code 71 72 elif k == 'group:': 73 fields['group'] = _extract_property(v) 74 75 elif k == 'font:': 76 fields['font'] = _extract_property(v) 77 78 elif k == 'abbrev:': 79 if 'abbreviations' not in fields: 80 fields['abbreviations'] = [] 81 fields['abbreviations'].append(_extract_property(v)) 82 83 else: 84 raise IsaSymbolsException('%d: unexpected field %s' % 85 (number, k)) 86 87 self.symbols.append(Symbol(**fields)) 88 89 # Translation dictionaries that we'll lazily initialise later. 90 self._utf8_to_ascii = None 91 self._utf8_to_tex = None 92 self._hshifts_tex = None # Handling for sub-/super-scripts 93 94 def encode(self, data): 95 for symbol in self.symbols: 96 if symbol.ascii_text.startswith('\\<^'): 97 continue 98 data = data.replace(symbol.ascii_text, unichr(symbol.code_point)) 99 return data 100 101 def decode(self, data): 102 103 if self._utf8_to_ascii is None: 104 # First time this function has been called; init the dictionary. 105 self._utf8_to_ascii = {unichr(s.code_point): s.ascii_text 106 for s in self.symbols} 107 108 return ''.join(self._utf8_to_ascii.get(c, c) for c in data) 109 110 def to_tex(self, data): 111 ''' 112 Translate a string of Isabelle text into its TeX representation. This 113 assumes the input is in unicode form. You probably do *not* want to use 114 this functionality to go straight from THY files to TeX. The motivation 115 for this is translating inline code in Markdown into something TeX-able. 116 In particular, we assume all relevant Isabelle styles and preamble are 117 already in scope. 118 ''' 119 120 if self._utf8_to_tex is None: 121 # First time this function has been called; init the dictionary. 122 # XXX: Excuse this horror (don't worry; it used to be worse). 123 self._utf8_to_tex = { 124 unichr(s.code_point): 125 '{\\isasym%s}' % s.ascii_text[2:-1] 126 for s in self.symbols 127 if s.ascii_text.startswith('\\<') and 128 s.ascii_text.endswith('>')} 129 130 if self._hshifts_tex is None: 131 # XXX: Hardcoded 132 self._hshifts_tex = ( 133 (re.compile(r'\\<\^sub>(.)'), r'\\textsubscript{\1}'), 134 (re.compile(r'\\<\^sup>(.)'), r'\\textsuperscript{\1}'), 135 (re.compile(r'\\<\^bold>(.)'), r'\\textbf{\1}'), 136 (re.compile(r'\\<\^bsub>'), r'\\textsubscript{'), 137 (re.compile(r'\\<\^bsup>'), r'\\textsuperscript{'), 138 (re.compile(r'\\<\^esu\(b|p\)>'), '}'), 139 ) 140 141 return ''.join(self._utf8_to_tex.get(c, c) for c in 142 reduce(lambda a, (regex, rep): regex.sub(rep, a), self._hshifts_tex, 143 data)) 144 145def make_translator(symbols_file): 146 assert isinstance(symbols_file, basestring) 147 with codecs.open(symbols_file, 'r', 'utf-8') as f: 148 return Translator(f.read()) 149