1#include "vterm_internal.h" 2 3#define UNICODE_INVALID 0xFFFD 4 5#if defined(DEBUG) && DEBUG > 1 6# include <stdio.h> 7# define DEBUG_PRINT_UTF8 8#endif 9 10struct UTF8DecoderData { 11 // number of bytes remaining in this codepoint 12 int bytes_remaining; 13 14 // number of bytes total in this codepoint once it's finished 15 // (for detecting overlongs) 16 int bytes_total; 17 18 int this_cp; 19}; 20 21static void init_utf8(VTermEncoding *enc, void *data_) 22{ 23 struct UTF8DecoderData *data = data_; 24 25 data->bytes_remaining = 0; 26 data->bytes_total = 0; 27} 28 29static void decode_utf8(VTermEncoding *enc, void *data_, 30 uint32_t cp[], int *cpi, int cplen, 31 const char bytes[], size_t *pos, size_t bytelen) 32{ 33 struct UTF8DecoderData *data = data_; 34 35#ifdef DEBUG_PRINT_UTF8 36 printf("BEGIN UTF-8\n"); 37#endif 38 39 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 40 unsigned char c = bytes[*pos]; 41 42#ifdef DEBUG_PRINT_UTF8 43 printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining); 44#endif 45 46 if(c < 0x20) 47 return; 48 49 else if(c >= 0x20 && c < 0x80) { 50 if(data->bytes_remaining) 51 cp[(*cpi)++] = UNICODE_INVALID; 52 53 cp[(*cpi)++] = c; 54#ifdef DEBUG_PRINT_UTF8 55 printf(" UTF-8 char: U+%04x\n", c); 56#endif 57 data->bytes_remaining = 0; 58 } 59 60 else if(c >= 0x80 && c < 0xc0) { 61 if(!data->bytes_remaining) { 62 cp[(*cpi)++] = UNICODE_INVALID; 63 continue; 64 } 65 66 data->this_cp <<= 6; 67 data->this_cp |= c & 0x3f; 68 data->bytes_remaining--; 69 70 if(!data->bytes_remaining) { 71#ifdef DEBUG_PRINT_UTF8 72 printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total); 73#endif 74 // Check for overlong sequences 75 switch(data->bytes_total) { 76 case 2: 77 if(data->this_cp < 0x0080) data->this_cp = UNICODE_INVALID; 78 break; 79 case 3: 80 if(data->this_cp < 0x0800) data->this_cp = UNICODE_INVALID; 81 break; 82 case 4: 83 if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; 84 break; 85 case 5: 86 if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; 87 break; 88 case 6: 89 if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; 90 break; 91 } 92 // Now look for plain invalid ones 93 if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) || 94 data->this_cp == 0xFFFE || 95 data->this_cp == 0xFFFF) 96 data->this_cp = UNICODE_INVALID; 97#ifdef DEBUG_PRINT_UTF8 98 printf(" char: U+%04x\n", data->this_cp); 99#endif 100 cp[(*cpi)++] = data->this_cp; 101 } 102 } 103 104 else if(c >= 0xc0 && c < 0xe0) { 105 if(data->bytes_remaining) 106 cp[(*cpi)++] = UNICODE_INVALID; 107 108 data->this_cp = c & 0x1f; 109 data->bytes_total = 2; 110 data->bytes_remaining = 1; 111 } 112 113 else if(c >= 0xe0 && c < 0xf0) { 114 if(data->bytes_remaining) 115 cp[(*cpi)++] = UNICODE_INVALID; 116 117 data->this_cp = c & 0x0f; 118 data->bytes_total = 3; 119 data->bytes_remaining = 2; 120 } 121 122 else if(c >= 0xf0 && c < 0xf8) { 123 if(data->bytes_remaining) 124 cp[(*cpi)++] = UNICODE_INVALID; 125 126 data->this_cp = c & 0x07; 127 data->bytes_total = 4; 128 data->bytes_remaining = 3; 129 } 130 131 else if(c >= 0xf8 && c < 0xfc) { 132 if(data->bytes_remaining) 133 cp[(*cpi)++] = UNICODE_INVALID; 134 135 data->this_cp = c & 0x03; 136 data->bytes_total = 5; 137 data->bytes_remaining = 4; 138 } 139 140 else if(c >= 0xfc && c < 0xfe) { 141 if(data->bytes_remaining) 142 cp[(*cpi)++] = UNICODE_INVALID; 143 144 data->this_cp = c & 0x01; 145 data->bytes_total = 6; 146 data->bytes_remaining = 5; 147 } 148 149 else { 150 cp[(*cpi)++] = UNICODE_INVALID; 151 } 152 } 153} 154 155static VTermEncoding encoding_utf8 = { 156 .init = &init_utf8, 157 .decode = &decode_utf8, 158}; 159 160static void decode_usascii(VTermEncoding *enc, void *data, 161 uint32_t cp[], int *cpi, int cplen, 162 const char bytes[], size_t *pos, size_t bytelen) 163{ 164 int is_gr = bytes[*pos] & 0x80; 165 166 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 167 unsigned char c = bytes[*pos] ^ is_gr; 168 169 if(c < 0x20 || c >= 0x80) 170 return; 171 172 cp[(*cpi)++] = c; 173 } 174} 175 176static VTermEncoding encoding_usascii = { 177 .decode = &decode_usascii, 178}; 179 180struct StaticTableEncoding { 181 const VTermEncoding enc; 182 const uint32_t chars[128]; 183}; 184 185static void decode_table(VTermEncoding *enc, void *data, 186 uint32_t cp[], int *cpi, int cplen, 187 const char bytes[], size_t *pos, size_t bytelen) 188{ 189 struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc; 190 int is_gr = bytes[*pos] & 0x80; 191 192 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 193 unsigned char c = bytes[*pos] ^ is_gr; 194 195 if(c < 0x20 || c >= 0x80) 196 return; 197 198 if(table->chars[c]) 199 cp[(*cpi)++] = table->chars[c]; 200 else 201 cp[(*cpi)++] = c; 202 } 203} 204 205#include "encoding/DECdrawing.inc" 206#include "encoding/uk.inc" 207 208static struct { 209 VTermEncodingType type; 210 char designation; 211 VTermEncoding *enc; 212} 213encodings[] = { 214 { ENC_UTF8, 'u', &encoding_utf8 }, 215 { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing }, 216 { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk }, 217 { ENC_SINGLE_94, 'B', &encoding_usascii }, 218 { 0 }, 219}; 220 221/* This ought to be INTERNAL but isn't because it's used by unit testing */ 222VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation) 223{ 224 int i; 225 for(i = 0; encodings[i].designation; i++) 226 if(encodings[i].type == type && encodings[i].designation == designation) 227 return encodings[i].enc; 228 return NULL; 229} 230