1#include "vterm_internal.h" 2 3#include <stdio.h> 4 5#define UNICODE_INVALID 0xFFFD 6 7#ifdef DEBUG 8# define DEBUG_PRINT_UTF8 9#endif 10 11struct UTF8DecoderData { 12 // number of bytes remaining in this codepoint 13 int bytes_remaining; 14 15 // number of bytes total in this codepoint once it's finished 16 // (for detecting overlongs) 17 int bytes_total; 18 19 int this_cp; 20}; 21 22static void init_utf8(VTermEncoding *enc, void *data_) 23{ 24 struct UTF8DecoderData *data = data_; 25 26 data->bytes_remaining = 0; 27 data->bytes_total = 0; 28} 29 30static void decode_utf8(VTermEncoding *enc, void *data_, 31 uint32_t cp[], int *cpi, int cplen, 32 const char bytes[], size_t *pos, size_t bytelen) 33{ 34 struct UTF8DecoderData *data = data_; 35 36#ifdef DEBUG_PRINT_UTF8 37 printf("BEGIN UTF-8\n"); 38#endif 39 40 for( ; *pos < bytelen; (*pos)++) { 41 unsigned char c = bytes[*pos]; 42 43#ifdef DEBUG_PRINT_UTF8 44 printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining); 45#endif 46 47 if(c < 0x20) 48 return; 49 50 else if(c >= 0x20 && c < 0x80) { 51 if(data->bytes_remaining) 52 cp[(*cpi)++] = UNICODE_INVALID; 53 54 cp[(*cpi)++] = c; 55#ifdef DEBUG_PRINT_UTF8 56 printf(" UTF-8 char: U+%04x\n", c); 57#endif 58 data->bytes_remaining = 0; 59 } 60 61 else if(c >= 0x80 && c < 0xc0) { 62 if(!data->bytes_remaining) { 63 cp[(*cpi)++] = UNICODE_INVALID; 64 continue; 65 } 66 67 data->this_cp <<= 6; 68 data->this_cp |= c & 0x3f; 69 data->bytes_remaining--; 70 71 if(!data->bytes_remaining) { 72#ifdef DEBUG_PRINT_UTF8 73 printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total); 74#endif 75 // Check for overlong sequences 76 switch(data->bytes_total) { 77 case 2: 78 if(data->this_cp < 0x0080) data->this_cp = UNICODE_INVALID; break; 79 case 3: 80 if(data->this_cp < 0x0800) data->this_cp = UNICODE_INVALID; break; 81 case 4: 82 if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break; 83 case 5: 84 if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break; 85 case 6: 86 if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break; 87 } 88 // Now look for plain invalid ones 89 if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) || 90 data->this_cp == 0xFFFE || 91 data->this_cp == 0xFFFF) 92 data->this_cp = UNICODE_INVALID; 93#ifdef DEBUG_PRINT_UTF8 94 printf(" char: U+%04x\n", data->this_cp); 95#endif 96 cp[(*cpi)++] = data->this_cp; 97 } 98 } 99 100 else if(c >= 0xc0 && c < 0xe0) { 101 if(data->bytes_remaining) 102 cp[(*cpi)++] = UNICODE_INVALID; 103 104 data->this_cp = c & 0x1f; 105 data->bytes_total = 2; 106 data->bytes_remaining = 1; 107 } 108 109 else if(c >= 0xe0 && c < 0xf0) { 110 if(data->bytes_remaining) 111 cp[(*cpi)++] = UNICODE_INVALID; 112 113 data->this_cp = c & 0x0f; 114 data->bytes_total = 3; 115 data->bytes_remaining = 2; 116 } 117 118 else if(c >= 0xf0 && c < 0xf8) { 119 if(data->bytes_remaining) 120 cp[(*cpi)++] = UNICODE_INVALID; 121 122 data->this_cp = c & 0x07; 123 data->bytes_total = 4; 124 data->bytes_remaining = 3; 125 } 126 127 else if(c >= 0xf8 && c < 0xfc) { 128 if(data->bytes_remaining) 129 cp[(*cpi)++] = UNICODE_INVALID; 130 131 data->this_cp = c & 0x03; 132 data->bytes_total = 5; 133 data->bytes_remaining = 4; 134 } 135 136 else if(c >= 0xfc && c < 0xfe) { 137 if(data->bytes_remaining) 138 cp[(*cpi)++] = UNICODE_INVALID; 139 140 data->this_cp = c & 0x01; 141 data->bytes_total = 6; 142 data->bytes_remaining = 5; 143 } 144 145 else { 146 cp[(*cpi)++] = UNICODE_INVALID; 147 } 148 } 149} 150 151static VTermEncoding encoding_utf8 = { 152 .init = &init_utf8, 153 .decode = &decode_utf8, 154}; 155 156static void decode_usascii(VTermEncoding *enc, void *data, 157 uint32_t cp[], int *cpi, int cplen, 158 const char bytes[], size_t *pos, size_t bytelen) 159{ 160 for(; *pos < bytelen; (*pos)++) { 161 unsigned char c = bytes[*pos]; 162 163 if(c < 0x20 || c >= 0x80) 164 return; 165 166 cp[(*cpi)++] = c; 167 } 168} 169 170static VTermEncoding encoding_usascii = { 171 .decode = &decode_usascii, 172}; 173 174struct StaticTableEncoding { 175 const VTermEncoding enc; 176 const uint32_t chars[128]; 177}; 178 179static void decode_table(VTermEncoding *enc, void *data, 180 uint32_t cp[], int *cpi, int cplen, 181 const char bytes[], size_t *pos, size_t bytelen) 182{ 183 struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc; 184 185 for(; *pos < bytelen; (*pos)++) { 186 unsigned char c = (bytes[*pos]) & 0x7f; 187 188 if(c < 0x20) 189 return; 190 191 if(table->chars[c]) 192 cp[(*cpi)++] = table->chars[c]; 193 else 194 cp[(*cpi)++] = c; 195 } 196} 197 198#include "encoding/DECdrawing.inc" 199#include "encoding/uk.inc" 200 201static struct { 202 VTermEncodingType type; 203 char designation; 204 VTermEncoding *enc; 205} 206encodings[] = { 207 { ENC_UTF8, 'u', &encoding_utf8 }, 208 { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing }, 209 { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk }, 210 { ENC_SINGLE_94, 'B', &encoding_usascii }, 211 { 0, 0 }, 212}; 213 214VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation) 215{ 216 int i; 217 for(i = 0; encodings[i].designation; i++) 218 if(encodings[i].type == type && encodings[i].designation == designation) 219 return encodings[i].enc; 220 return NULL; 221} 222