1#!./perl
2use 5.008001;
3BEGIN { pop @INC if $INC[-1] eq '.' }
4use strict;
5use warnings;
6use Encode;
7use Getopt::Std;
8use Carp;
9use Encode::Guess;
10$Getopt::Std::STANDARD_HELP_VERSION = 1;
11
# Command-line driver.
#   -h        show the usage text and exit
#   -S        list every encoding name accepted by -s and exit
#   -u        do not report files whose encoding could not be guessed
#   -s LIST   suspect encodings, separated by ':' or ','
my %opt;
getopts( "huSs:", \%opt );
my @suspect_list;
list_valid_suspects() and exit if $opt{S};

# A character class is needed so that EITHER separator on its own splits
# the list ("euc-jp,shiftjis" as well as "euc-jp:shiftjis"). Empty fields
# produced by a leading or doubled separator are dropped so they are not
# mistaken for encoding names.
@suspect_list = grep { length } split /[:,]/, $opt{s} if $opt{s};
HELP_MESSAGE() if $opt{h};
HELP_MESSAGE() unless @ARGV;
do_guess($_) for @ARGV;
20
# Slurp the whole of a file as raw bytes.
#
# Argument: path of the file to read.
# Returns:  the complete file content as a single string; the ':raw'
#           layer means no decoding or newline translation is applied.
# Croaks with "<path>:<OS error>" when the file cannot be opened.
sub read_file {
    my ($path) = @_;
    open( my $in, '<:raw', $path ) or croak "$path:$!";

    # Undefining the input record separator for this one read makes
    # readline return everything up to end-of-file in a single call.
    my $bytes = do { local $/ = undef; <$in> };
    close $in;
    return $bytes;
}
29
# Guess the encoding of one file and report it on STDOUT as
# "<filename>\t<encoding>\n".
#
# Argument: $filename - path of the file to examine.
# Returns:  1 always.
#
# The suspect encodings come from the file-level @suspect_list and the -u
# switch from %opt. A reference returned by guess_encoding() is treated as
# a successful guess (an encoding object); anything else means the file
# could not be identified. Such files are reported as "unknown", or
# skipped silently when -u is in effect.
sub do_guess {
    my $filename = shift;
    my $data     = read_file($filename);
    my $enc      = guess_encoding( $data, @suspect_list );

    my $label = 'unknown';
    if ( ref $enc ) {

        # Not every encoding has a registered MIME name; fall back to the
        # canonical Encode name instead of printing an undefined value.
        $label = $enc->mime_name();
        $label = $enc->name() unless defined $label;
    }
    elsif ( $opt{u} ) {
        return 1;
    }

    print "$filename\t$label\n";
    return 1;
}
47
# Print every encoding name Encode reports for the ":all" set, one per
# line, to STDOUT. Always returns 1 so the caller can chain it with "and".
sub list_valid_suspects {
    my @names = Encode->encodings(":all");
    print join( "\n", @names ) . "\n";
    return 1;
}
53
# Show the usage text by replacing this process with pod2usage, run on
# this script itself ($0). Control only comes back here when exec fails,
# in which case the OS error is reported via die.
sub HELP_MESSAGE {
    my @usage_command = ( 'pod2usage', $0 );
    exec(@usage_command) or die "pod2usage: $!";
}
57__END__
58=head1 NAME
59
60encguess - guess character encodings of files
61
62=head1 VERSION
63
64$Id: encguess,v 0.3 2020/12/02 01:28:17 dankogai Exp $
65
66=head1 SYNOPSIS
67
68  encguess [switches] filename...
69
70=head2 SWITCHES
71
72=over 2
73
74=item -h
75
76show this message and exit.
77
78=item -s
79
80specify a list of "suspect encoding types" to test, 
81separated by either C<:> or C<,>
82
83=item -S
84
85output a list of all acceptable encoding types that can be used with
86the -s param
87
88=item -u
89
90suppress display of unidentified types
91
92=back
93
94=head2 EXAMPLES:
95
96=over 2
97
98=item *
99
100Guess encoding of a file named C<test.txt>, using only the default
101suspect types.
102
103   encguess test.txt
104
105=item *
106
107Guess the encoding type of a file named C<test.txt>, using the suspect
108types C<euc-jp,shiftjis,7bit-jis>.
109
110   encguess -s euc-jp,shiftjis,7bit-jis test.txt
111   encguess -s euc-jp:shiftjis:7bit-jis test.txt
112
113=item *
114
115Guess the encoding type of several files, do not display results for
116unidentified files.
117
118   encguess -us euc-jp,shiftjis,7bit-jis test*.txt
119
120=back
121
122=head1 DESCRIPTION
123
124The encoding identification is done by checking one encoding type at a
125time until all but the right type are eliminated. The set of encoding
126types to try is defined by the -s parameter and defaults to ascii,
127utf8 and UTF-16/32 with BOM. This can be overridden by passing one or
more encoding types via the -s parameter. If you need to pass in
multiple suspect encoding types, separate them with C<:> or C<,>, as
in C<-s euc-jp,shiftjis,7bit-jis>.
131
132=head1 SEE ALSO
133
134L<Encode::Guess>, L<Encode::Detect>
135
136=head1 LICENSE AND COPYRIGHT
137
138Copyright 2015 Michael LaGrasta and Dan Kogai.
139
140This program is free software; you can redistribute it and/or modify it
under the terms of the Artistic License (2.0). You may obtain a
142copy of the full license at:
143
144L<http://www.perlfoundation.org/artistic_license_2_0>
145
146=cut
147