Main.java revision 3170:dc017a37aac5
1/*
2 * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 */
23
24package tidystats;
25
26import java.io.IOException;
27import java.nio.charset.Charset;
28import java.nio.file.FileSystem;
29import java.nio.file.FileSystems;
30import java.nio.file.Files;
31import java.nio.file.Path;
32import java.util.ArrayList;
33import java.util.Comparator;
34import java.util.HashMap;
35import java.util.List;
36import java.util.Map;
37import java.util.Set;
38import java.util.TreeMap;
39import java.util.TreeSet;
40import java.util.regex.Matcher;
41import java.util.regex.Pattern;
42
/**
 * Generate statistics from the files generated by tidy.sh.
 *
 * <p>The tidy.sh script is used to run tidy on all the HTML files
 * in a directory, creating files in a new directory, and for each
 * HTML file, it writes the console output from tidy into a file
 * beside the fixed up file, with an additional .tidy extension.
 *
 * <p>This program will scan a directory for *.tidy files and
 * analyze the messages reported by tidy, in order to generate a
 * report with statistics on the various messages that were
 * reported by tidy.
 *
 * <p>Typical usage:
 * <pre>
 * $ bash /path/to/tidy.sh /path/to/htmldir
 * $ javac -d /path/to/classes /path/to/Main.java
 * $ java -cp /path/to/classes tidystats.Main /path/to/htmldir.tidy
 * </pre>
 *
 * <p>Internally, the program works by matching lines in the *.tidy
 * files against a series of regular expressions that are used to
 * categorize the messages.  The set of regular expressions was
 * empirically determined by running the program on the output from
 * running tidy.sh on all the generated JDK documentation. It is
 * possible that tidy may generate more/different messages on other
 * doc sets, in which case, the set of regexes in the program should
 * be updated.
 *
 * <p>Each message line is attributed to the <em>first</em> entry of
 * {@link #patterns} that matches the whole line, so the order of that
 * array matters: a specific pattern must be listed before any more
 * general pattern that would also match the same line.
 */
public class Main {
    /**
     * Command-line entry point.
     *
     * @param args files and/or directories to scan for *.tidy files
     * @throws IOException if a file or directory cannot be read
     */
    public static void main(String... args) throws IOException {
        new Main().run(args);
    }

    /**
     * Scans every path given on the command line and prints the report
     * to standard output: first the overall totals, then one line per
     * message category, most frequent first.
     *
     * @param args files and/or directories to scan
     * @throws IllegalArgumentException if the first argument starts with "-"
     *         (no options are supported)
     * @throws IOException if a file or directory cannot be read
     */
    void run(String... args) throws IOException {
        FileSystem fs = FileSystems.getDefault();
        List<Path> paths = new ArrayList<>();

        // No options are supported. Only the leading argument is checked
        // for option syntax; everything else is treated as a path.
        if (args.length > 0 && args[0].startsWith("-")) {
            throw new IllegalArgumentException(args[0]);
        }

        for (String arg: args) {
            paths.add(fs.getPath(arg));
        }

        for (Path p: paths) {
            scan(p);
        }

        print("%6d files read", files);
        print("%6d files had no errors or warnings", ok);
        print("%6d files reported \"Not all warnings/errors were shown.\"", overflow);
        print("%6d errors found", errs);
        print("%6d warnings found", warns);
        print("%6d recommendations to use CSS", css);
        print("");

        // Group the category patterns by their count, highest count first.
        // Categories sharing a count are listed in lexicographic order.
        Map<Integer, Set<String>> sortedCounts = new TreeMap<>(
                new Comparator<Integer>() {
                    @Override
                    public int compare(Integer o1, Integer o2) {
                        return o2.compareTo(o1);
                    }
                });

        for (Map.Entry<Pattern, Integer> e: counts.entrySet()) {
            Integer n = e.getValue();
            Set<String> set = sortedCounts.get(n);
            if (set == null) {
                set = new TreeSet<>();
                sortedCounts.put(n, set);
            }
            set.add(e.getKey().toString());
        }

        for (Map.Entry<Integer, Set<String>> e: sortedCounts.entrySet()) {
            for (String p: e.getValue()) {
                // Drop the leading ".*" that only serves to skip the
                // "line N column M - " prefix of each message.
                if (p.startsWith(".*")) {
                    p = p.substring(2);
                }
                print("%6d: %s", e.getKey(), p);
            }
        }
    }

    /**
     * Scans a path: directories are traversed recursively, regular files
     * whose name ends in ".tidy" are analyzed, and anything else is ignored.
     *
     * @param p the file or directory to scan
     * @throws IOException if the directory or file cannot be read
     */
    void scan(Path p) throws IOException {
        if (Files.isDirectory(p)) {
            // The directory stream holds an open handle on the directory and
            // must be closed, otherwise one handle leaks per directory visited.
            try (java.nio.file.DirectoryStream<Path> children = Files.newDirectoryStream(p)) {
                for (Path c: children) {
                    scan(c);
                }
            }
        } else if (isTidyFile(p)) {
            scan(Files.readAllLines(p, Charset.defaultCharset()));
        }
    }

    /**
     * Tests whether a path denotes a tidy output file.
     *
     * @param p the path to test
     * @return true if {@code p} is a regular file whose name ends in ".tidy"
     */
    boolean isTidyFile(Path p) {
        return Files.isRegularFile(p) && p.getFileName().toString().endsWith(".tidy");
    }

    /**
     * Analyzes the lines of a single *.tidy file and updates the counters.
     * Message lines that match the guard pattern but none of the known
     * category patterns are reported on standard error.
     *
     * @param lines the content of one *.tidy file
     */
    void scan(List<String> lines) {
        files++;
        for (String line: lines) {
            Matcher m;
            if (okPattern.matcher(line).matches()) {
                ok++;
            } else if ((m = countPattern.matcher(line)).matches()) {
                warns += Integer.parseInt(m.group(1));
                errs += Integer.parseInt(m.group(2));
                // Group 3 is only set when tidy truncated its output.
                if (m.group(3) != null) {
                    overflow++;
                }
            } else if (guardPattern.matcher(line).matches()) {
                boolean found = false;
                for (Pattern p: patterns) {
                    // First match wins; see the ordering note on patterns.
                    if (p.matcher(line).matches()) {
                        found = true;
                        count(p);
                        break;
                    }
                }
                if (!found) {
                    System.err.println("Unrecognized line: " + line);
                }
            } else if (cssPattern.matcher(line).matches()) {
                css++;
            }
        }
    }

    /** Number of message lines seen so far for each category pattern. */
    Map<Pattern, Integer> counts = new HashMap<>();

    /**
     * Increments the count for a category pattern.
     *
     * @param p the pattern that matched a message line
     */
    void count(Pattern p) {
        Integer i = counts.get(p);
        counts.put(p, (i == null) ? 1 : i + 1);
    }

    /**
     * Prints a formatted line on standard output.
     *
     * @param format a {@link String#format} format string
     * @param args the arguments for the format string
     */
    void print(String format, Object... args) {
        System.out.println(String.format(format, args));
    }

    /** Line written by tidy when a file is clean. */
    Pattern okPattern = Pattern.compile("No warnings or errors were found.");

    /**
     * Summary line giving the number of warnings (group 1) and errors
     * (group 2); group 3 is set if tidy did not show all messages.
     * Both the singular and the plural form of "warning" and "error"
     * are accepted, so a summary such as "1 warning, 0 errors were found!"
     * is counted as well.
     */
    Pattern countPattern = Pattern.compile("([0-9]+) warnings?, ([0-9]+) errors? were found!.*?(Not all warnings/errors were shown.)?");

    /** Line recommending the use of CSS. */
    Pattern cssPattern = Pattern.compile("You are recommended to use CSS.*");

    /** Any individual error or warning message line. */
    Pattern guardPattern = Pattern.compile("line [0-9]+ column [0-9]+ - (Error|Warning):.*");

    /**
     * The known message categories.
     *
     * <p>A line is attributed to the first pattern that matches it in full,
     * so wherever a general pattern would also match the lines of a more
     * specific one, the specific pattern is listed first.
     */
    Pattern[] patterns = {
        Pattern.compile(".*Error: <.*> is not recognized!"),
        Pattern.compile(".*Error: missing quote mark for attribute value"),
        Pattern.compile(".*Warning: <.*> anchor \".*\" already defined"),
        Pattern.compile(".*Warning: <.*> attribute \".*\" has invalid value \".*\""),
        Pattern.compile(".*Warning: <.*> attribute \".*\" lacks value"),
        Pattern.compile(".*Warning: <.*> attribute with missing trailing quote mark"),
        Pattern.compile(".*Warning: <.*> dropping value \".*\" for repeated attribute \".*\""),
        Pattern.compile(".*Warning: <.*> inserting \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> is probably intended as </.*>"),
        Pattern.compile(".*Warning: <.*> isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: <.*> lacks \".*\" attribute"),
        Pattern.compile(".*Warning: <.*> missing '>' for end of tag"),
        // Must precede the general "proprietary attribute" pattern below.
        Pattern.compile(".*Warning: <blockquote> proprietary attribute \"pre\""),
        Pattern.compile(".*Warning: <.*> proprietary attribute \".*\""),
        Pattern.compile(".*Warning: <.*> unexpected or duplicate quote mark"),
        Pattern.compile(".*Warning: <a> cannot copy name attribute to id"),
        Pattern.compile(".*Warning: <a> escaping malformed URI reference"),
        // End tags must precede the general "discarding unexpected <.*>"
        // pattern, which would otherwise match "</x>" as well.
        Pattern.compile(".*Warning: discarding unexpected </.*>"),
        Pattern.compile(".*Warning: discarding unexpected <.*>"),
        Pattern.compile(".*Warning: entity \".*\" doesn't end in ';'"),
        Pattern.compile(".*Warning: inserting implicit <.*>"),
        Pattern.compile(".*Warning: inserting missing 'title' element"),
        Pattern.compile(".*Warning: missing <!DOCTYPE> declaration"),
        // Must precede the general "missing <.*>" pattern, which would
        // otherwise match "missing </x> before <y>" as well.
        Pattern.compile(".*Warning: missing </.*> before <.*>"),
        Pattern.compile(".*Warning: missing <.*>"),
        Pattern.compile(".*Warning: nested emphasis <.*>"),
        Pattern.compile(".*Warning: plain text isn't allowed in <.*> elements"),
        Pattern.compile(".*Warning: replacing <p> by <br>"),
        Pattern.compile(".*Warning: replacing invalid numeric character reference .*"),
        Pattern.compile(".*Warning: replacing unexpected .* by </.*>"),
        Pattern.compile(".*Warning: trimming empty <.*>"),
        Pattern.compile(".*Warning: unescaped & or unknown entity \".*\""),
        Pattern.compile(".*Warning: unescaped & which should be written as &amp;"),
        Pattern.compile(".*Warning: using <br> in place of <p>")
    };

    /** Number of *.tidy files analyzed. */
    int files;
    /** Number of files for which tidy reported no warnings or errors. */
    int ok;
    /** Total number of warnings, as given by tidy's summary lines. */
    int warns;
    /** Total number of errors, as given by tidy's summary lines. */
    int errs;
    /** Number of recommendations to use CSS. */
    int css;
    /** Number of files for which tidy did not show all warnings/errors. */
    int overflow;
}
234
235