1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4#
5# Copyright 2015, NICTA
6#
7# This software may be distributed and modified according to the terms of
8# the BSD 2-Clause license. Note that NO WARRANTY is provided.
9# See "LICENSE_BSD2.txt" for details.
10#
11# @TAG(NICTA_BSD)
12#
13
14import codecs, collections, numbers, re, types
15from .exception import IsaSymbolsException
16
17class Symbol(object):
18    def __init__(self, ascii_text, code_point, group=None, font=None,
19            abbreviations=None):
20        assert isinstance(ascii_text, basestring)
21        assert isinstance(code_point, numbers.Number)
22        assert isinstance(group, (basestring, types.NoneType))
23        assert isinstance(font, (basestring, types.NoneType))
24        assert isinstance(abbreviations, (collections.Iterable,
25            types.NoneType))
26
27        self.ascii_text = ascii_text
28        self.code_point = code_point
29        self.group = group
30        self.font = font
31        self.abbreviations = abbreviations or []
32
33def _extract_property(prop):
34    # Values of the symbol fields can contain '���' which is intended to
35    # indicate a space.
36    return prop.replace(u'���', ' ')
37
38class Translator(object):
39    def __init__(self, symbols_text):
40        assert isinstance(symbols_text, basestring)
41
42        self.symbols = []
43
44        for number, line in enumerate(x.strip() for x in
45                symbols_text.split('\n')):
46
47            if line.startswith('#'):
48                # Comment
49                continue
50
51            if line == '':
52                # Blank line
53                continue
54
55            items = line.split()
56            if len(items) < 3 or len(items) % 2 == 0:
57                raise IsaSymbolsException('%d: unexpected line format' %
58                    number)
59
60            fields = {'ascii_text':items[0]}
61
62            for k, v in zip(*[iter(items[1:])] * 2):
63
64                if k == 'code:':
65                    try:
66                        code = int(_extract_property(v), 16)
67                    except ValueError:
68                        raise IsaSymbolsException('%d: invalid code field' %
69                            number)
70                    fields['code_point'] = code
71
72                elif k == 'group:':
73                    fields['group'] = _extract_property(v)
74
75                elif k == 'font:':
76                    fields['font'] = _extract_property(v)
77
78                elif k == 'abbrev:':
79                    if 'abbreviations' not in fields:
80                        fields['abbreviations'] = []
81                    fields['abbreviations'].append(_extract_property(v))
82
83                else:
84                    raise IsaSymbolsException('%d: unexpected field %s' %
85                        (number, k))
86
87            self.symbols.append(Symbol(**fields))
88
89        # Translation dictionaries that we'll lazily initialise later.
90        self._utf8_to_ascii = None
91        self._utf8_to_tex = None
92        self._hshifts_tex = None # Handling for sub-/super-scripts
93
94    def encode(self, data):
95        for symbol in self.symbols:
96            if symbol.ascii_text.startswith('\\<^'):
97                continue
98            data = data.replace(symbol.ascii_text, unichr(symbol.code_point))
99        return data
100
101    def decode(self, data):
102
103        if self._utf8_to_ascii is None:
104            # First time this function has been called; init the dictionary.
105            self._utf8_to_ascii = {unichr(s.code_point): s.ascii_text
106                for s in self.symbols}
107
108        return ''.join(self._utf8_to_ascii.get(c, c) for c in data)
109
110    def to_tex(self, data):
111        '''
112        Translate a string of Isabelle text into its TeX representation. This
113        assumes the input is in unicode form. You probably do *not* want to use
114        this functionality to go straight from THY files to TeX. The motivation
115        for this is translating inline code in Markdown into something TeX-able.
116        In particular, we assume all relevant Isabelle styles and preamble are
117        already in scope.
118        '''
119
120        if self._utf8_to_tex is None:
121            # First time this function has been called; init the dictionary.
122            # XXX: Excuse this horror (don't worry; it used to be worse).
123            self._utf8_to_tex = {
124                unichr(s.code_point):
125                    '{\\isasym%s}' % s.ascii_text[2:-1]
126                for s in self.symbols
127                if s.ascii_text.startswith('\\<') and
128                    s.ascii_text.endswith('>')}
129
130        if self._hshifts_tex is None:
131            # XXX: Hardcoded
132            self._hshifts_tex = (
133                (re.compile(r'\\<\^sub>(.)'), r'\\textsubscript{\1}'),
134                (re.compile(r'\\<\^sup>(.)'), r'\\textsuperscript{\1}'),
135                (re.compile(r'\\<\^bold>(.)'), r'\\textbf{\1}'),
136                (re.compile(r'\\<\^bsub>'), r'\\textsubscript{'),
137                (re.compile(r'\\<\^bsup>'), r'\\textsuperscript{'),
138                (re.compile(r'\\<\^esu\(b|p\)>'), '}'),
139            )
140
141        return ''.join(self._utf8_to_tex.get(c, c) for c in
142            reduce(lambda a, (regex, rep): regex.sub(rep, a), self._hshifts_tex,
143                data))
144
145def make_translator(symbols_file):
146    assert isinstance(symbols_file, basestring)
147    with codecs.open(symbols_file, 'r', 'utf-8') as f:
148        return Translator(f.read())
149