do_uniq.py revision 141477
1#!/usr/local/bin/python
2#
3# $FreeBSD: head/games/fortune/tools/do_uniq.py 141477 2005-02-07 21:15:16Z ru $
4#
5# an aggressive little script for trimming duplicate cookies
6
7import re, sys
8
# Filler words and contraction fragments that hash() deletes from a cookie
# before comparing.  hash() applies the entries in list order, so the groups
# run longest-first: 'hadnot' is removed as a whole before the shorter 'had'
# gets a chance to split it.
wordlist = (
    ['hadnot']                                       # 6 letters
    + ['donot', 'hadnt']                             # 5 letters
    + ['dont', 'have', 'more', 'will', 'your']       # 4 letters
    + ['and', 'are', 'had', 'the', 'you']            # 3 letters
    + ['am', 'an', 'is', 'll', 've', 'we']           # 2 letters
    + ['a', 'd', 'i', 'm', 's']                      # 1 letter
)
17
def hash(fortune, words=None, length=30):
    """Return a fuzzy fingerprint of *fortune* for duplicate detection.

    Cookies that produce the same fingerprint are treated as likely
    duplicates.  The fingerprint is built by:

    1. lower-casing the text;
    2. deleting every non-word character and every underscore, which
       also removes all whitespace, so word boundaries disappear;
    3. deleting each entry of *words*, in order, wherever it occurs in
       what is left (each entry is applied as a regular expression);
    4. keeping only the first *length* characters.

    fortune -- text of one cookie.
    words   -- patterns to delete; defaults to the module-level wordlist.
    length  -- number of leading characters to keep (default 30).

    NOTE: inside this module the name shadows the built-in hash().
    """
    if words is None:
        words = wordlist
    f = fortune.lower()
    # Raw string: backslash-W must reach the regex engine untouched.
    f = re.sub(r'[\W_]', '', f)
    for word in words:
        f = re.sub(word, '', f)
    # Alternative reductions, left disabled:
    #    f = re.sub('[aeiouy]', '', f)
    #    f = re.sub('[^aeiouy]', '', f)
    #    f = f[-30:]
    return f[:length]
29
def _read_fortunes(datfile):
    """Return the cookies stored in *datfile*, in file order.

    A cookie is the text between separator lines consisting of a single
    '%'; the separators themselves are not part of the returned strings.
    """
    fortunes = []
    pending = []
    src = open(datfile)
    try:
        for line in src:
            # rstrip() so a final '%' lacking its newline is still a separator.
            if line.rstrip("\n") == "%":
                fortunes.append("".join(pending))
                pending = []
            else:
                pending.append(line)
    finally:
        src.close()
    tail = "".join(pending)
    if tail.strip():
        # Text after the last '%' is a cookie too; skipping it would
        # silently drop it from the rewritten file.
        if not tail.endswith("\n"):
            tail += "\n"
        fortunes.append(tail)
    return fortunes


def edit(datfile):
    """Interactively weed likely-duplicate cookies out of *datfile*.

    Every cookie is fingerprinted with hash().  Cookies are then copied,
    in their original order, to a new file named datfile + '~'; the input
    file itself is only read.  Whenever a cookie shares its fingerprint
    with at least one other cookie, its look-alikes and then the cookie
    itself are printed and the user is asked whether to remove it.
    Answering 'y' drops that cookie from the output and stops further
    questions about the same group; any other answer keeps it.

    datfile -- path of the cookie file ('%'-separated).

    Errors from open() and an EOFError/KeyboardInterrupt at the prompt
    propagate to the caller; the output file is closed in either case.
    """
    # raw_input() was renamed input() in Python 3; use whichever exists.
    try:
        ask = raw_input
    except NameError:
        ask = input

    # Fingerprint each cookie once and group cookies by fingerprint.
    keyed = [(hash(fortune), fortune) for fortune in _read_fortunes(datfile)]
    groups = {}
    for key, fortune in keyed:
        groups.setdefault(key, []).append(fortune)
    # Only fingerprints shared by two or more cookies are of interest.
    dups = dict((key, group) for key, group in groups.items()
                if len(group) > 1)

    out = open(datfile + '~', "w")
    try:
        for key, fortune in keyed:
            if key in dups:
                # Push the previous candidates off the screen.
                sys.stdout.write('\n' * 51)
                for other in dups[key]:
                    if other != fortune:
                        sys.stdout.write(other + '%\n')
                sys.stdout.write(fortune + '%\n')
                if ask("Remove last fortune? ") == 'y':
                    # One removal settles this group: forget it so the
                    # remaining members are kept without asking again.
                    del dups[key]
                    continue
            out.write(fortune + "%\n")
    finally:
        out.close()
61
if __name__ == '__main__':
    # Exactly one argument is expected: the cookie file to process.
    # An explicit check (rather than assert) still works under "python -O".
    if len(sys.argv) != 2:
        sys.exit("usage: %s datfile" % sys.argv[0])
    edit(sys.argv[1])
64