" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Usage: Vim -S <this-file>
"
" Author:	Bram Moolenaar
" Last Update:	2010 Jan 12

" Convert a string of hexadecimal digits (no "0x" prefix needed) to a Number.
func! s:HexToNr(hex)
  return str2nr(a:hex, 16)
endfunc

" Split every line of the current buffer on ';' (surrounding white space is
" dropped) and append the resulting list of fields to the List a:props.
" a:nfields       number of fields each parsed line must have
" a:skipcomments  when non-zero, lines starting with '#' and blank lines are
"                 ignored
" On the first line with an unexpected field count an error is given and
" parsing stops; a:props then holds the lines parsed so far.
func! s:ParseLines(props, nfields, skipcomments)
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if a:skipcomments && (text =~ '^#' || text =~ '^\s*$')
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != a:nfields
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected ' . a:nfields
      return
    endif
    call add(a:props, fields)
  endfor
endfunc

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  let s:dataprops = []
  call s:ParseLines(s:dataprops, 15, 0)
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  let s:foldprops = []
  call s:ParseLines(s:foldprops, 4, 1)
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  let s:widthprops = []
  call s:ParseLines(s:widthprops, 2, 1)
endfunc

" Append one convertStruct initializer line to the List a:ranges.
" A step of zero (a range holding a single character) is written as -1.
func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Turn a list of [from, to] character number pairs into convertStruct
" initializer lines.  Consecutive pairs are merged into one range as long as
" the offset (to - from) stays the same and the distance between the "from"
" values stays constant.
" Returns the list of lines; every line ends in a comma.
func! s:ConvertRanges(pairs)
  let ranges = []
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  for [n, nl] in a:pairs
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " continue with same range.
      let step = n - end
      let end = n
    else
      if start >= 0
        " produce previous range
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif
  return ranges
endfunc

" Put a table in a new buffer named a:bufname and go back to the previous
" window.
" a:header  first line (the declaration)
" a:opener  second line (the opening brace)
" a:ranges  list of initializer lines, each ending in a comma
" a:closer  last line (the closing brace)
" The comma is removed from the last initializer line.  This is skipped when
" a:ranges is empty, otherwise the opening brace line would be truncated.
func! s:WriteTable(bufname, header, opener, ranges, closer)
  new
  exe 'file ' . a:bufname
  call setline(1, a:header)
  call setline(2, a:opener)
  if !empty(a:ranges)
    call append('$', a:ranges)
    call setline('$', getline('$')[:-2])  " remove last comma
  endif
  call setline(line('$') + 1, a:closer)
  wincmd p
endfunc

" Build the toLower or toUpper table in a new buffer.
" a:name   "Lower" or "Upper", appended to "to" for the table name
" a:index  index of the field in a UnicodeData.txt line that holds the
"          converted character
" Uses s:dataprops.
func! BuildCaseTable(name, index)
  let pairs = []
  for p in s:dataprops
    " Use "#" comparisons so that 'ignorecase' has no influence.
    if p[a:index] !=# ''
      call add(pairs, [s:HexToNr(p[0]), s:HexToNr(p[a:index])])
    endif
  endfor

  " New buffer to put the result in.
  call s:WriteTable('to' . a:name,
        \ "static convertStruct to" . a:name . "[] =",
        \ "{", s:ConvertRanges(pairs), "};")
endfunc

" Build the foldCase table in a new buffer.
" Only entries with status 'C' or 'S' are used.
" Uses s:foldprops.
func! BuildFoldTable()
  let pairs = []
  for p in s:foldprops
    if p[1] ==# 'C' || p[1] ==# 'S'
      call add(pairs, [s:HexToNr(p[0]), s:HexToNr(p[2])])
    endif
  endfor

  " New buffer to put the result in.
  call s:WriteTable('foldCase',
        \ "static convertStruct foldCase[] =",
        \ "{", s:ConvertRanges(pairs), "};")
endfunc

" Return non-zero when the field list a:props (one UnicodeData.txt line) has
" one of the categories Mn, Mc or Me in its third field.
func! s:IsComposing(props)
  return a:props[2] ==# 'Mn' || a:props[2] ==# 'Mc' || a:props[2] ==# 'Me'
endfunc

" Build the combining table.
" Adjacent character numbers are merged into one interval.
" Uses s:dataprops.
" NOTE(review): the leading white space in the emitted declaration and brace
" lines is kept as found here; confirm it matches the indent used in mbyte.c.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if s:IsComposing(p)
      let n = s:HexToNr(p[0])
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  call s:WriteTable('combining',
        \ " static struct interval combining[] =",
        \ " {", ranges, " };")
endfunc

" Build the double width or ambiguous width table in a new buffer.
" a:pattern    pattern matched against the first character of the width
"              field of each EastAsianWidth.txt entry
" a:tableName  name of the buffer and of the generated table
" Uses s:widthprops and s:dataprops.  Both are walked in one forward pass, so
" this relies on both lists being sorted on character number.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~# a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range.  we don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Stop here, otherwise rng[1] below fails or yields a wrong range.
        return
      endif
      let n = s:HexToNr(rng[0])
      let n_last = s:HexToNr(rng[1])
    else
      let n = s:HexToNr(p[0])
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table instead
    " of indexing past it.
    let dn = -1
    while dataidx < datalen
      let dn = s:HexToNr(s:dataprops[dataidx][0])
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the data table was exhausted there is no entry to check, the char
    " is used then.
    if n_last > n || dataidx >= datalen || !s:IsComposing(s:dataprops[dataidx])
      if start < 0 || end + 1 != n
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
      endif
      " Otherwise: continue with same range.
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  call s:WriteTable(a:tableName,
        \ " static struct interval " . a:tableName . "[] =",
        \ " {", ranges, " };")
endfunc



" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
call BuildWidthTable('A', 'ambiguous')