1#! /usr/bin/perl
2use strict;
3
4my $USAGE = <<__EOF__;
5   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
6          -n = take non-matching types
7      -f = zero-based type field (default 2)
8__EOF__
9
10use vars qw( $opt_f $opt_n );
11use Getopt::Std;
12my $type_field = 2;
13
14# Override Unicode tables for certain control chars
15# that are expected to be found in normal text files.
16my %force_space = (
17    0x08 => 1, # backspace
18    0x09 => 1, # tab
19    0x0a => 1, # newline
20    0x0c => 1, # form feed
21    0x0d => 1, # carriage return
22);
23
24exit (main() ? 0 : 1);
25
26sub main {
27    my $date = `date`;
28    chomp $date;
29    my $args = join ' ', @ARGV;
30    my $header = "/* Generated by \"$0 $args\" on $date */\n";
31
32    die $USAGE if not getopts('f:n');
33    $type_field = $opt_f if $opt_f;
34    my %types;
35    my $arg;
36    while ($arg = shift @ARGV) {
37        last if $arg eq '--';
38        $types{$arg} = 1;
39    }
40    my %out = ( 'types' => \%types );
41
42    print $header;
43    my $last_code = 0;
44    while (<>) {
45        chomp;
46        s/#.*//;
47        my @fields = split /;/;
48        next if not @fields;
49        my ($lo_code, $hi_code);
50        my $codes = $fields[0];
51        if ($codes =~ /(\w+)\.\.(\w+)/) {
52            $lo_code = hex $1;
53            $hi_code = hex $2;
54        } else {
55            $lo_code = $hi_code = hex $fields[0];
56        }
57        my $type = $fields[$type_field];
58        $type =~ s/\s//g;
59        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
60            $type = 'Zs' if $force_space{$last_code};
61            output(\%out, $last_code, $type);
62        }
63    }
64    output(\%out, $last_code);
65    return 1;
66}
67
68sub output {
69    my ($out, $code, $type) = @_;
70    my $type_ok = ($type and ${${$out}{types}}{$type});
71    $type_ok = not $type_ok if $opt_n;
72    my $prev_code = $$out{prev_code};
73
74    if (not $type_ok) {
75        end_run($out, $prev_code);
76    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
77        end_run($out, $prev_code);
78        start_run($out, $code, $type);
79    }
80    $$out{prev_code} = $code;
81}
82
83sub start_run {
84    my ($out, $code, $type) = @_;
85    $$out{start_code} = $code;
86    $$out{prev_code} = $code;
87    $$out{run_type} = $type;
88    $$out{in_run} = 1;
89}
90
91sub end_run {
92    my ($out, $code) = @_;
93    return if not $$out{in_run};
94    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
95    $$out{in_run} = 0;
96}
97