1#include "vterm_internal.h"
2
3#include <stdio.h>
4
5#define UNICODE_INVALID 0xFFFD
6
7#ifdef DEBUG
8# define DEBUG_PRINT_UTF8
9#endif
10
// Persistent state of the incremental UTF-8 decoder; it survives between
// decode calls so that a multi-byte sequence may be split across buffers.
struct UTF8DecoderData {
  // continuation bytes still expected before the current codepoint is
  // complete; zero means no sequence is in progress
  int bytes_remaining;

  // full encoded length, in bytes, of the sequence being assembled; compared
  // against the decoded value afterwards to reject overlong encodings
  int bytes_total;

  // value bits of the codepoint accumulated so far
  int this_cp;
};
21
22static void init_utf8(VTermEncoding *enc, void *data_)
23{
24  struct UTF8DecoderData *data = data_;
25
26  data->bytes_remaining = 0;
27  data->bytes_total     = 0;
28}
29
30static void decode_utf8(VTermEncoding *enc, void *data_,
31                        uint32_t cp[], int *cpi, int cplen,
32                        const char bytes[], size_t *pos, size_t bytelen)
33{
34  struct UTF8DecoderData *data = data_;
35
36#ifdef DEBUG_PRINT_UTF8
37  printf("BEGIN UTF-8\n");
38#endif
39
40  for( ; *pos < bytelen; (*pos)++) {
41    unsigned char c = bytes[*pos];
42
43#ifdef DEBUG_PRINT_UTF8
44    printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
45#endif
46
47    if(c < 0x20)
48      return;
49
50    else if(c >= 0x20 && c < 0x80) {
51      if(data->bytes_remaining)
52        cp[(*cpi)++] = UNICODE_INVALID;
53
54      cp[(*cpi)++] = c;
55#ifdef DEBUG_PRINT_UTF8
56      printf(" UTF-8 char: U+%04x\n", c);
57#endif
58      data->bytes_remaining = 0;
59    }
60
61    else if(c >= 0x80 && c < 0xc0) {
62      if(!data->bytes_remaining) {
63        cp[(*cpi)++] = UNICODE_INVALID;
64        continue;
65      }
66
67      data->this_cp <<= 6;
68      data->this_cp |= c & 0x3f;
69      data->bytes_remaining--;
70
71      if(!data->bytes_remaining) {
72#ifdef DEBUG_PRINT_UTF8
73        printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
74#endif
75        // Check for overlong sequences
76        switch(data->bytes_total) {
77        case 2:
78          if(data->this_cp <  0x0080) data->this_cp = UNICODE_INVALID; break;
79        case 3:
80          if(data->this_cp <  0x0800) data->this_cp = UNICODE_INVALID; break;
81        case 4:
82          if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break;
83        case 5:
84          if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break;
85        case 6:
86          if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break;
87        }
88        // Now look for plain invalid ones
89        if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) ||
90           data->this_cp == 0xFFFE ||
91           data->this_cp == 0xFFFF)
92          data->this_cp = UNICODE_INVALID;
93#ifdef DEBUG_PRINT_UTF8
94        printf(" char: U+%04x\n", data->this_cp);
95#endif
96        cp[(*cpi)++] = data->this_cp;
97      }
98    }
99
100    else if(c >= 0xc0 && c < 0xe0) {
101      if(data->bytes_remaining)
102        cp[(*cpi)++] = UNICODE_INVALID;
103
104      data->this_cp = c & 0x1f;
105      data->bytes_total = 2;
106      data->bytes_remaining = 1;
107    }
108
109    else if(c >= 0xe0 && c < 0xf0) {
110      if(data->bytes_remaining)
111        cp[(*cpi)++] = UNICODE_INVALID;
112
113      data->this_cp = c & 0x0f;
114      data->bytes_total = 3;
115      data->bytes_remaining = 2;
116    }
117
118    else if(c >= 0xf0 && c < 0xf8) {
119      if(data->bytes_remaining)
120        cp[(*cpi)++] = UNICODE_INVALID;
121
122      data->this_cp = c & 0x07;
123      data->bytes_total = 4;
124      data->bytes_remaining = 3;
125    }
126
127    else if(c >= 0xf8 && c < 0xfc) {
128      if(data->bytes_remaining)
129        cp[(*cpi)++] = UNICODE_INVALID;
130
131      data->this_cp = c & 0x03;
132      data->bytes_total = 5;
133      data->bytes_remaining = 4;
134    }
135
136    else if(c >= 0xfc && c < 0xfe) {
137      if(data->bytes_remaining)
138        cp[(*cpi)++] = UNICODE_INVALID;
139
140      data->this_cp = c & 0x01;
141      data->bytes_total = 6;
142      data->bytes_remaining = 5;
143    }
144
145    else {
146      cp[(*cpi)++] = UNICODE_INVALID;
147    }
148  }
149}
150
151static VTermEncoding encoding_utf8 = {
152  .init   = &init_utf8,
153  .decode = &decode_utf8,
154};
155
156static void decode_usascii(VTermEncoding *enc, void *data,
157                           uint32_t cp[], int *cpi, int cplen,
158                           const char bytes[], size_t *pos, size_t bytelen)
159{
160  for(; *pos < bytelen; (*pos)++) {
161    unsigned char c = bytes[*pos];
162
163    if(c < 0x20 || c >= 0x80)
164      return;
165
166    cp[(*cpi)++] = c;
167  }
168}
169
170static VTermEncoding encoding_usascii = {
171  .decode = &decode_usascii,
172};
173
174struct StaticTableEncoding {
175  const VTermEncoding enc;
176  const uint32_t chars[128];
177};
178
179static void decode_table(VTermEncoding *enc, void *data,
180                         uint32_t cp[], int *cpi, int cplen,
181                         const char bytes[], size_t *pos, size_t bytelen)
182{
183  struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;
184
185  for(; *pos < bytelen; (*pos)++) {
186    unsigned char c = (bytes[*pos]) & 0x7f;
187
188    if(c < 0x20)
189      return;
190
191    if(table->chars[c])
192      cp[(*cpi)++] = table->chars[c];
193    else
194      cp[(*cpi)++] = c;
195  }
196}
197
198#include "encoding/DECdrawing.inc"
199#include "encoding/uk.inc"
200
201static struct {
202  VTermEncodingType type;
203  char designation;
204  VTermEncoding *enc;
205}
206encodings[] = {
207  { ENC_UTF8,      'u', &encoding_utf8 },
208  { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
209  { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
210  { ENC_SINGLE_94, 'B', &encoding_usascii },
211  { 0, 0 },
212};
213
214VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
215{
216  int i;
217  for(i = 0; encodings[i].designation; i++)
218    if(encodings[i].type == type && encodings[i].designation == designation)
219      return encodings[i].enc;
220  return NULL;
221}
222