1" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2" The format of the UnicodeData.txt file is explained here:
3" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4" For the other files see the header.
5"
6" Usage: Vim -S <this-file>
7"
8" Author: Bram Moolenaar
9" Last Update: 2010 Jan 12
10
11" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  " Split every line of the current buffer (expected to hold UnicodeData.txt)
  " on ';' and store the resulting rows in s:dataprops.
  " Each row must have exactly 15 fields; on the first row that does not, an
  " error is reported and parsing stops, leaving the rows collected so far.
  let s:dataprops = []
  let rownr = 0
  for text in getline(1, '$')
    let rownr += 1
    " Keep empty fields so the field positions stay fixed.
    let fields = split(text, '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields == 15
      call add(s:dataprops, fields)
    else
      echoerr 'Found ' . nfields . ' items in line ' . rownr . ', expected 15'
      return
    endif
  endfor
endfunc
25
26" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  " Split the data lines of the current buffer (expected to hold
  " CaseFolding.txt) on ';' and store the resulting rows in s:foldprops.
  " Lines starting with '#' and blank lines are skipped.  Each data line must
  " have exactly 4 fields; otherwise an error is reported and parsing stops.
  let s:foldprops = []
  let rownr = 0
  for text in getline(1, '$')
    let rownr += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields != 4
      echoerr 'Found ' . nfields . ' items in line ' . rownr . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
43
44" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  " Split the data lines of the current buffer (expected to hold
  " EastAsianWidth.txt) on ';' and store the resulting rows in s:widthprops.
  " Lines starting with '#' and blank lines are skipped.  Each data line must
  " have exactly 2 fields; otherwise an error is reported and parsing stops.
  let s:widthprops = []
  let rownr = 0
  for text in getline(1, '$')
    let rownr += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    let nfields = len(fields)
    if nfields != 2
      echoerr 'Found ' . nfields . ' items in line ' . rownr . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
61
62" Build the toLower or toUpper table in a new buffer.
63" Uses s:dataprops.
func! BuildCaseTable(name, index)
  " Build the "to<name>" case conversion table in a new buffer of that name.
  "   a:name   suffix of the table and buffer name, e.g. "Lower" or "Upper"
  "   a:index  index of the field in each s:dataprops row that holds the
  "            hexadecimal code point the character maps to
  " Uses s:dataprops and Range().  When done the previous window is made
  " current again.
  "
  " Characters are merged into one range as long as they have the same
  " offset to their mapped character and are the same distance apart.
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] == ''
      " No mapping for this character.
      continue
    endif
    let n = str2nr(p[0], 16)
    let nl = str2nr(p[a:index], 16)
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " Continue with the same range.  A step of zero means the range has
      " only one entry so far, thus any distance is accepted.
      let step = n - end
      let end = n
    else
      if start >= 0
        " Produce the previous range.
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  if !empty(ranges)
    call append('$', ranges)
    " Remove the comma after the last entry.  Only done when there are
    " entries, otherwise the "{" line itself would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
104
105" Build the foldCase table in a new buffer.
106" Uses s:foldprops.
func! BuildFoldTable()
  " Build the foldCase table in a new buffer named "foldCase".
  " Uses s:foldprops and Range().  Only rows with status 'C' or 'S' in the
  " second field are used; the third field is the hexadecimal code point the
  " character folds to.  When done the previous window is made current again.
  "
  " Characters are merged into one range as long as they have the same
  " offset to their folded character and are the same distance apart.
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] != 'C' && p[1] != 'S'
      continue
    endif
    let n = str2nr(p[0], 16)
    let nl = str2nr(p[2], 16)
    if start >= 0 && add == nl - n && (step == 0 || n - end == step)
      " Continue with the same range.  A step of zero means the range has
      " only one entry so far, thus any distance is accepted.
      let step = n - end
      let end = n
    else
      if start >= 0
        " Produce the previous range.
        call Range(ranges, start, end, step, add)
      endif
      let start = n
      let end = n
      let step = 0
      let add = nl - n
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  if !empty(ranges)
    call append('$', ranges)
    " Remove the comma after the last entry.  Only done when there are
    " entries, otherwise the "{" line itself would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "};")
  wincmd p
endfunc
147
func! Range(ranges, start, end, step, add)
  " Append one "{start, end, step, add}," initializer line to a:ranges.
  " a:start and a:end are written in hex, a:step and a:add in decimal.
  " A step of zero is written as -1.
  if a:step == 0
    let stepval = -1
  else
    let stepval = a:step
  endif
  let entry = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, stepval, a:add)
  call add(a:ranges, entry)
endfunc
152
153" Build the combining table.
154" Uses s:dataprops.
func! BuildCombiningTable()
  " Build the table of combining characters in a new buffer named
  " "combining".  Uses s:dataprops: every row whose third field is 'Mn', 'Mc'
  " or 'Me' is included, and adjacent code points are merged into one
  " {first, last} interval.  When done the previous window is made current
  " again.
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] != 'Mn' && p[2] != 'Mc' && p[2] != 'Me'
      continue
    endif
    let n = str2nr(p[0], 16)
    if start >= 0 && end + 1 == n
      " Continue with the same range.
      let end = n
    else
      if start >= 0
        " Produce the previous range.
        call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
      endif
      let start = n
      let end = n
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, "    static struct interval combining[] =")
  call setline(2, "    {")
  if !empty(ranges)
    call append('$', ranges)
    " Remove the comma after the last entry.  Only done when there are
    " entries, otherwise the "{" line itself would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
189
190" Build the double width or ambiguous width table in a new buffer.
191" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  " Build a table of character width intervals in a new buffer.
  "   a:pattern    pattern matched against the first character of the width
  "                property, e.g. '[WF]' or 'A'
  "   a:tableName  name of the generated table and of the new buffer
  " Uses s:widthprops and s:dataprops; both are walked in file order, the
  " position in s:dataprops only moves forward.  Adjacent code points are
  " merged into one {first, last} interval.  When done the previous window is
  " made current again.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing chars then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Skip this entry, there is no usable start and end.
        continue
      endif
      let n = str2nr(rng[0], 16)
      let n_last = str2nr(rng[1], 16)
    else
      let n = str2nr(p[0], 16)
      let n_last = n
    endif

    " Find this char in the data table.  Stop at the end of the table, so
    " that a code point past the last entry cannot cause an index error.
    let dn = -1
    while dataidx < datalen
      let dn = str2nr(s:dataprops[dataidx][0], 16)
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    if n_last > n
      let use = 1
    elseif dataidx >= datalen
      " Past the end of the data table, no entry to get the category from.
      let use = 0
    else
      let gc = s:dataprops[dataidx][2]
      let use = gc != 'Mn' && gc != 'Mc' && gc != 'Me'
    endif
    if !use
      continue
    endif

    if start < 0 || end + 1 != n
      if start >= 0
        " Produce the previous range.
        call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
      endif
      let start = n
    endif
    " Either a new range was started or the current one is extended.
    let end = n_last
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, "    static struct interval " . a:tableName . "[] =")
  call setline(2, "    {")
  if !empty(ranges)
    call append('$', ranges)
    " Remove the comma after the last entry.  Only done when there are
    " entries, otherwise the "{" line itself would be truncated.
    call setline('$', getline('$')[:-2])
  endif
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc
253
254
255
256" Edit the Unicode text file.  Requires the netrw plugin.
" --- UnicodeData.txt: case conversion and combining tables ---
" Fetching the file over http requires the netrw plugin.
execute 'edit http://unicode.org/Public/UNIDATA/UnicodeData.txt'
call ParseDataToProps()

" toLower is built from field 13 of each row, toUpper from field 12.
call BuildCaseTable('Lower', 13)
call BuildCaseTable('Upper', 12)

" Ranges of composing chars.
call BuildCombiningTable()

" --- CaseFolding.txt: foldCase table ---
" Fetching the file over http requires the netrw plugin.
execute 'edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt'
call ParseFoldProps()
call BuildFoldTable()

" --- EastAsianWidth.txt: double width and ambiguous width tables ---
" Fetching the file over http requires the netrw plugin.
execute 'edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt'
call ParseWidthProps()
call BuildWidthTable('[WF]', 'doublewidth')
call BuildWidthTable('A', 'ambiguous')
291