1#!/usr/bin/python -u
2#
3# imports the API description and fills up a database with
4# name relevance to modules, functions or web pages
5#
6# Operation needed:
7# =================
8#
9# install mysqld, the python wrappers for mysql and libxml2, start mysqld
10# Change the root passwd of mysql:
11#    mysqladmin -u root password new_password
12# Create the new database xmlsoft
13#    mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
15# change veillard and abcde with the right user name and passwd
16#    mysql -p
17#    password:
18#    mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
19#           IDENTIFIED BY 'abcde' WITH GRANT OPTION;
20#
21# As the user check the access:
22#    mysql -p xmlsoft
23#    Enter password:
24#    Welcome to the MySQL monitor....
25#    mysql> use xmlsoft
26#    Database changed
27#    mysql> quit
28#    Bye
29#
# Then run the script in the doc subdir, it will create the XSLTsymbols and
# word tables and populate them with information extracted from
# the libxslt-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
34#
35# On the Apache configuration, make sure you have php support enabled
36#
37
38import MySQLdb
39import libxml2
40import sys
41import string
42import os
43
44#
45# We are not interested in parsing errors here
46#
def callback(ctx, str):
    """Swallow a libxml2 error report.

    Registered below as the libxml2 error handler so that parsing errors
    are not reported; both arguments are ignored.
    """
    pass


libxml2.registerErrorHandler(callback, None)
50
51#
52# The dictionary of tables required and the SQL command needed
53# to create them
54#
# Maps each table name to the CREATE TABLE statement used by createTable();
# checkTables() iterates over the keys to find missing tables.
TABLES={
  # one row per API symbol (name, defining module, kind, short description)
  "XSLTsymbols" : """CREATE TABLE XSLTsymbols (
           name varchar(255) BINARY NOT NULL,
	   module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
	   descr varchar(255),
	   UNIQUE KEY name (name),
	   KEY module (module))""",
  # word -> API symbol associations with a relevance score
  "XSLTwords" : """CREATE TABLE XSLTwords (
           name varchar(50) BINARY NOT NULL,
	   symbol varchar(255) BINARY NOT NULL,
           relevance int,
	   KEY name (name),
	   KEY symbol (symbol),
	   UNIQUE KEY ID (name, symbol))""",
  # word -> HTML page associations (with section title and anchor id)
  "XSLTwordsHTML" : """CREATE TABLE XSLTwordsHTML (
           name varchar(50) BINARY NOT NULL,
	   resource varchar(255) BINARY NOT NULL,
	   section varchar(255),
	   id varchar(50),
           relevance int,
	   KEY name (name),
	   KEY resource (resource),
	   UNIQUE KEY ref (name, resource))""",
  # word -> archived message associations, keyed by the archives.ID value
  "XSLTwordsArchive" : """CREATE TABLE XSLTwordsArchive (
           name varchar(50) BINARY NOT NULL,
	   ID int(11) NOT NULL,
           relevance int,
	   KEY name (name),
	   UNIQUE KEY ref (name, ID))""",
  # HTML page resource -> page title
  "XSLTpages" : """CREATE TABLE XSLTpages (
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY name (resource))""",
  # archived mail messages (URL and title), numbered by auto-increment ID
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
	   title varchar(255) BINARY NOT NULL,
	   UNIQUE KEY id (ID,resource(255)),
	   INDEX (ID),
	   INDEX (resource))""",
  # not filled by the functions visible here; checkTables() grants
  # INSERT/SELECT/UPDATE on it to nobody@localhost
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
	   Value varchar(50) NOT NULL,
	   Count int(11) NOT NULL,
	   UNIQUE KEY id (ID,Value(35)),
	   INDEX (ID))""",
}
103
104#
105# The XML API description file to parse
106#
API="libxslt-api.xml"
# Shared MySQL connection; stays None until openMySQL() stores one here.
DB=None
109
110#########################################################################
111#									#
112#                  MySQL database interfaces				#
113#									#
114#########################################################################
def createTable(db, name):
    """Drop and re-create one of the tables described in TABLES.

    db is an open MySQLdb connection, name a key of TABLES.
    Returns the result of the CREATE TABLE execute() call, or -1 when an
    argument is missing, the table is unknown or the creation fails.
    """
    if db is None or name is None:
        return -1
    # Table names cannot be bound as query parameters, so only names this
    # script knows are accepted.  Checking first also avoids dropping a
    # table that could not be re-created afterwards.
    if name not in TABLES:
        print("Failed to create table %s" % (name))
        return -1
    c = db.cursor()

    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except (MySQLdb.Error, MySQLdb.Warning):
        print("Failed to create table %s" % (name))
        return -1
    return ret
134
def checkTables(db):
    """Make sure every table listed in TABLES exists and can be read.

    Missing tables are created with createTable(); a table whose row count
    cannot be read is repaired and counted again.  Finally the read access
    expected for nobody@localhost is granted on a best-effort basis.
    Returns 0 when done, -1 when db is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    print("Found %d tables" % (nbtables))
    existing = {}
    for found in c.fetchall():
        existing[found[0]] = {}

    for table in TABLES:
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
        except (MySQLdb.Error, MySQLdb.Warning):
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        c.execute("GRANT INSERT,SELECT,UPDATE  ON xmlsoft.Queries TO nobody@localhost")
    except (MySQLdb.Error, MySQLdb.Warning):
        # Deliberately best-effort: a failing GRANT must not abort indexing.
        pass
    return 0
175
def openMySQL(db="xmlsoft", passwd=None):
    """Open the MySQL connection, store it in the global DB and check tables.

    When passwd is None the password is read from the MySQL_PASS environment
    variable; the script exits with status 1 if that variable is not set.
    Returns the result of checkTables(), or -1 if no connection was obtained.
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB)
191
def updateWord(name, symbol, relevance):
    """Record the relevance of word name for the API symbol in XSLTwords.

    An INSERT is tried first; if it fails (typically because the
    (name, symbol) pair already exists) the existing row is updated.
    Values are passed as query parameters so the driver does the quoting.
    Returns the execute() result, or -1 on missing arguments or failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO XSLTwords (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE XSLTwords SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except (MySQLdb.Error, MySQLdb.Warning):
            print("Update word (%s, %s, %s) failed command" % (name, symbol, relevance))
            print("UPDATE XSLTwords SET relevance = %s where name = '%s' and symbol = '%s'" % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
221
def updateSymbol(name, module, type, desc):
    """Insert or update the XSLTsymbols row describing an API symbol.

    The symbol name is first indexed as a word for itself with relevance
    50.  The stored description is the first sentence of desc (text before
    the first '.'), with single quotes replaced by spaces and cut to 99
    characters; an unusable desc is stored as "".
    Returns the execute() result, or -1 on missing arguments or failure.
    """
    global DB

    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except (AttributeError, TypeError):
        # desc is None or not a string
        desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO XSLTsymbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE XSLTsymbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except (MySQLdb.Error, MySQLdb.Warning):
            print("Update symbol (%s, %s, %s) failed command" % (name, module, type))
            print("""UPDATE XSLTsymbols SET module='%s', type='%s', descr='%s' where name='%s'""" % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
262
def addFunction(name, module, desc = ""):
    """Register name as a 'function' symbol of module via updateSymbol()."""
    kind = 'function'
    return updateSymbol(name, module, kind, desc)
265
def addMacro(name, module, desc = ""):
    """Register name as a 'macro' symbol of module via updateSymbol()."""
    kind = 'macro'
    return updateSymbol(name, module, kind, desc)
268
def addEnum(name, module, desc = ""):
    """Register name as an 'enum' symbol of module via updateSymbol()."""
    kind = 'enum'
    return updateSymbol(name, module, kind, desc)
271
def addStruct(name, module, desc = ""):
    """Register name as a 'struct' symbol of module via updateSymbol()."""
    kind = 'struct'
    return updateSymbol(name, module, kind, desc)
274
def addConst(name, module, desc = ""):
    """Register name as a 'const' symbol of module via updateSymbol()."""
    kind = 'const'
    return updateSymbol(name, module, kind, desc)
277
def addType(name, module, desc = ""):
    """Register name as a 'type' symbol of module via updateSymbol()."""
    kind = 'type'
    return updateSymbol(name, module, kind, desc)
280
def addFunctype(name, module, desc = ""):
    """Register name as a 'functype' symbol of module via updateSymbol()."""
    kind = 'functype'
    return updateSymbol(name, module, kind, desc)
283
def addPage(resource, title):
    """Insert or update the title of an HTML page in XSLTpages.

    Values are passed as query parameters, so titles containing quotes are
    stored correctly.  Returns the execute() result, or -1 on a missing
    resource or on failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO XSLTpages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE XSLTpages SET title=%s WHERE resource=%s",
                (title, resource))
        except (MySQLdb.Error, MySQLdb.Warning):
            # The previous message referenced names that do not exist in
            # this function and raised NameError instead of reporting.
            print("Update page (%s, %s) failed command" % (resource, title))
            print("""UPDATE XSLTpages SET title='%s' WHERE resource='%s'""" % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
311
def updateWordHTML(name, resource, desc, id, relevance):
    """Record the relevance of word name for an HTML page in XSLTwordsHTML.

    desc is stored as the section (single quotes replaced by spaces, cut to
    99 characters, "" when missing or unusable) and id as the anchor id
    ("" when None).  An INSERT is tried first, then an UPDATE of the
    existing (name, resource) row.
    Returns the execute() result, or -1 on missing arguments or failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = ""

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO XSLTwordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE XSLTwordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except (MySQLdb.Error, MySQLdb.Warning):
            print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
            print("""UPDATE XSLTwordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
351
def checkXMLMsgArchive(url):
    """Return the archives.ID recorded for the message at url.

    Returns -1 when url is None, no database is available, the query fails
    or the URL has not been recorded.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except (MySQLdb.Error, MySQLdb.Warning):
        return -1
    if row is None:
        return -1

    return row[0]
373
def addXMLMsgArchive(url, title):
    """Record an archived message and return its archives.ID as an int.

    The title has single quotes replaced by spaces and is cut to 99
    characters ("" when None).  Returns -1 when url is None, no database
    is available, a query fails or the new ID cannot be read back.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # cmd tracks the statement being run so a failure can report it.
    cmd = "INSERT INTO archives (resource, title) VALUES ('%s','%s')" % (url, title)
    try:
        c.execute("INSERT INTO archives (resource, title) VALUES (%s, %s)",
                  (url, title))
        cmd = "SELECT ID FROM archives WHERE resource='%s'" % (url)
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except (MySQLdb.Error, MySQLdb.Warning):
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])
404
def updateWordArchive(name, id, relevance):
    """Record the relevance of word name for archived message id.

    An INSERT into XSLTwordsArchive is tried first, then an UPDATE of the
    existing (name, ID) row.
    Returns the execute() result, or -1 on missing arguments or failure.
    """
    global DB

    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO XSLTwordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except (MySQLdb.Error, MySQLdb.Warning):
        try:
            ret = c.execute(
                "UPDATE XSLTwordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except (MySQLdb.Error, MySQLdb.Warning):
            print("Update word archive (%s, %s, %s) failed command" % (name, id, relevance))
            print("""UPDATE XSLTwordsArchive SET relevance='%s' where name='%s' and ID='%s'""" % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
434
435#########################################################################
436#									#
437#                  Word dictionary and analysis routines		#
438#									#
439#########################################################################
440
441#
# top 100 English words, minus those shorter than 3 letters, plus our own set
443#
# Words never indexed: every key maps to 0, only membership is tested.
dropWords = dict.fromkeys([
    'the', 'this', 'can', 'man', 'had', 'him', 'only',
    'and', 'not', 'been', 'other', 'even', 'are', 'was',
    'new', 'most', 'but', 'when', 'some', 'made', 'from',
    'who', 'could', 'after', 'that', 'will', 'time', 'also',
    'have', 'more', 'these', 'did', 'two', 'many',
    'they', 'may', 'before', 'for', 'which', 'out', 'then',
    'must', 'one', 'through', 'with', 'you', 'said',
    'first', 'back', 'were', 'what', 'any', 'years', 'his',
    'her', 'where', 'all', 'its', 'now', 'much', 'she',
    'about', 'such', 'your', 'there', 'into', 'like',
    'would', 'than', 'our', 'well', 'their', 'them', 'over',
    'down',
    # own set
    'net', 'www', 'bad', 'Okay', 'bin', 'cur',
], 0)
459
# word -> {(module, symbol): relevance}; addWord() replaces an entry by None
# once a word has more than 500 associations.
wordsDict = {}
# word -> {resource: (relevance, id, section)}, filled by addWordHTML()
wordsDictHTML = {}
# word -> {archive message ID: relevance}, filled by addWordArchive()
wordsDictArchive = {}
463
def cleanupWordsString(str):
    """Return str with punctuation and line breaks replaced by spaces.

    Each of the listed separator characters (including the "\\xc2" and
    "\\xa0" characters) becomes one space, so the result can be split on
    whitespace to get the individual words.
    """
    separators = (".", "!", "?", ",", "'", '"', ";", "(", ")", "{", "}",
                  "<", ">", "=", "/", "*", ":", "#", "\\", "\n", "\r",
                  "\xc2", "\xa0")
    for sep in separators:
        str = str.replace(sep, " ")
    return str
489
def cleanupDescrString(str):
    """Return str as a single line with normalized spacing.

    Single quotes, line breaks and the "\\xc2" / "\\xa0" characters are
    replaced by spaces, then runs of whitespace are collapsed to one space
    and leading/trailing whitespace is dropped.
    """
    for junk in ("'", "\n", "\r", "\xc2", "\xa0"):
        str = str.replace(junk, " ")
    # Join the split words; joining the string itself would put a space
    # between every character.
    return " ".join(str.split())
499
def splitIdentifier(str):
    """Split an identifier into its lower-cased component words.

    A word starts at a letter, continues over a run of upper-case letters
    and then a run of lower-case letters; digits directly following the
    word are dropped and any other character is skipped.  For example
    "xsltParseStylesheetFile" gives ['xslt', 'parse', 'stylesheet', 'file'].
    """
    words = []
    pos = 0
    end = len(str)
    # Scan with an index rather than re-slicing the string for every
    # character consumed.
    while pos < end:
        head = str[pos].lower()
        pos = pos + 1
        if head < 'a' or head > 'z':
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos = pos + 1
        upper_run = str[start:pos].lower()
        start = pos
        while pos < end and 'a' <= str[pos] <= 'z':
            pos = pos + 1
        lower_run = str[start:pos]
        while pos < end and '0' <= str[pos] <= '9':
            pos = pos + 1
        words.append(head + upper_run + lower_run)
    return words
517
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of word for (module, symbol) in wordsDict.

    Returns -1 for a missing or too short (< 3 characters) word or a
    missing module/symbol, 0 when the word is ignored (listed in dropWords,
    first character code above 0x80, or already dropped for having more
    than 500 associations), otherwise the accumulated relevance.
    """
    global wordsDict

    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    key = (module, symbol)
    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            # too common to be useful: stop tracking this word
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get(key, 0)
    else:
        wordsDict[word] = {}
    wordsDict[word][key] = relevance
    return relevance
545
def addString(str, module, symbol, relevance):
    """Index every word of str for (module, symbol) with the given relevance.

    Returns -1 when str is None or shorter than 3 characters, otherwise the
    sum of the addWord() results for the words longer than 2 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Use the caller's relevance; it used to be ignored in favour
            # of a hard-coded 5.
            ret = ret + addWord(word, module, symbol, relevance)

    return ret
557
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of word for an HTML page in wordsDictHTML.

    The entry for (word, resource) keeps the first id and section recorded
    (unless they were None) and sums the relevance.
    Returns -1 for a missing or too short word or a missing
    resource/section, 0 when the word is ignored, otherwise the accumulated
    relevance.
    """
    global wordsDictHTML

    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
591
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of str for an HTML page via addWordHTML().

    Returns -1 when str is None or shorter than 3 characters, otherwise the
    sum of the addWordHTML() results; failures on single words are
    reported and do not stop the indexing.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print("addWordHTML failed: %s %s" % (word, resource))
            ret = ret + r
        except Exception:
            print("addWordHTML failed: %s %s %s" % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])

    return ret
610
def addWordArchive(word, id, relevance):
    """Accumulate the relevance of word for archived message id.

    Returns -1 for a missing or too short word or a missing/-1 id, 0 when
    the word is ignored, otherwise the relevance accumulated in
    wordsDictArchive.
    """
    global wordsDictArchive

    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
638
def addStringArchive(str, id, relevance):
    """Index every word of str for archived message id via addWordArchive().

    Returns -1 when str is None or shorter than 3 characters, otherwise the
    sum of the non-negative addWordArchive() results; failures on single
    words are reported and do not stop the indexing.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print("addWordArchive failed: %s %s" % (word, id))
            else:
                ret = ret + r
        except Exception:
            print("addWordArchive failed: %s %s %s" % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return ret
658
659#########################################################################
660#									#
661#                  XML API description analysis				#
662#									#
663#########################################################################
664
def loadAPI(filename):
    """Parse the XML file filename with libxml2 and return the document."""
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc
669
def foundExport(file, symbol):
    """Register an exported symbol of file as a function and index its name.

    Returns 1 when the symbol was registered, 0 when file or symbol is None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for part in splitIdentifier(symbol):
        addWord(part, file, symbol, 10)
    return 1
680
def analyzeAPIFile(top):
    """Register the <exports> children of a <file> element.

    Returns the number of exported symbols registered; other element
    children are reported as unexpected.
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Two placeholders need two values: the unexpected element
                # and the enclosing file name.
                print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count
695
def analyzeAPIFiles(top):
    """Analyze every <file> child of the <files> element.

    Returns the sum of the analyzeAPIFile() results; other element children
    are reported as unexpected.
    """
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            if cur.name == "file":
                count = count + analyzeAPIFile(cur)
            else:
                print("unexpected element %s in API doc <files>" % (cur.name))
        cur = cur.next
    return count
710
def analyzeAPIEnum(top):
    """Register the enum described by element top and index its name.

    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    enum_name = top.prop("name")
    if enum_name is None:
        return 0

    addEnum(enum_name, module)
    for part in splitIdentifier(enum_name):
        addWord(part, module, enum_name, 10)

    return 1
725
def analyzeAPIConst(top):
    """Register the constant described by element top and index its name.

    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    const_name = top.prop("name")
    if const_name is None:
        return 0

    addConst(const_name, module)
    for part in splitIdentifier(const_name):
        addWord(part, module, const_name, 10)

    return 1
740
def analyzeAPIType(top):
    """Register the type described by element top and index its name.

    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    type_name = top.prop("name")
    if type_name is None:
        return 0

    addType(type_name, module)
    for part in splitIdentifier(type_name):
        addWord(part, module, type_name, 10)
    return 1
754
def analyzeAPIFunctype(top):
    """Register the function type described by top and index its name.

    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    functype_name = top.prop("name")
    if functype_name is None:
        return 0

    addFunctype(functype_name, module)
    for part in splitIdentifier(functype_name):
        addWord(part, module, functype_name, 10)
    return 1
768
def analyzeAPIStruct(top):
    """Register the struct described by element top and index its words.

    The parts of the struct name get relevance 10; the words longer than 2
    characters of the optional "info" attribute get relevance 5.
    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    struct_name = top.prop("name")
    if struct_name is None:
        return 0

    addStruct(struct_name, module)
    for part in splitIdentifier(struct_name):
        addWord(part, module, struct_name, 10)

    info = top.prop("info")
    if info is None:
        return 1
    for word in info.replace("'", " ").strip().split():
        if len(word) > 2:
            addWord(word, module, struct_name, 5)
    return 1
791
def analyzeAPIMacro(top):
    """Register the macro described by element top and index its words.

    The parts of the macro name get relevance 10 and the words of the first
    <info> child relevance 5.  Returns 1 when the macro has an <info>
    description; returns 0 when the "file" or "name" attribute is missing
    or when there is no <info> (the macro is still registered then).
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    # content of the first <info> child element, if any
    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    if info is None:
        addMacro(symbol, module)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, module, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, module, symbol, 5)
    return 1
830
def analyzeAPIFunction(top):
    """Register the function described by element top and index its words.

    Indexed text: the "info" attribute of <return> (relevance 7), the
    "info" (5) and "name" (7) attributes of each <arg>, the content of
    <info> (5) and the parts of the function name (10).
    Returns 1 on success, 0 when the "file" or "name" attribute is missing.
    """
    module = top.prop("file")
    if module is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "info":
                info = cur.content
            elif cur.name == "return":
                rinfo = cur.prop("info")
                if rinfo is not None:
                    rinfo = rinfo.replace("'", " ").strip()
                    addString(rinfo, module, symbol, 7)
            elif cur.name == "arg":
                ainfo = cur.prop("info")
                if ainfo is not None:
                    ainfo = ainfo.replace("'", " ").strip()
                    addString(ainfo, module, symbol, 5)
                name = cur.prop("name")
                if name is not None:
                    name = name.replace("'", " ").strip()
                    addWord(name, module, symbol, 7)
        cur = cur.next

    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, module, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, module, info)
        addString(info, module, symbol, 5)

    for part in splitIdentifier(symbol):
        addWord(part, module, symbol, 10)

    return 1
881
def analyzeAPISymbols(top):
    """Analyze every child of the <symbols> element.

    Each known element kind is handed to its analyzeAPI* function and the
    results are summed; unknown elements are reported.
    """
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type != 'text':
            handler = handlers.get(cur.name)
            if handler is not None:
                count = count + handler(cur)
            else:
                # this walks <symbols>, not <files>
                print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count
908
def analyzeAPI(doc):
    """Analyze a parsed API description document.

    Only the <symbols> section is analyzed; the <files> section is
    recognized but skipped.  Returns the number of symbols counted by
    analyzeAPISymbols(), or -1 when doc is None or its root element is not
    <api>.
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    cur = root.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "files":
                # analyzeAPIFiles(cur) is intentionally not run here
                pass
            elif cur.name == "symbols":
                count = count + analyzeAPISymbols(cur)
            else:
                print("unexpected element %s in API doc" % (cur.name))
        cur = cur.next
    return count
931
932#########################################################################
933#									#
934#                  Web pages parsing and analysis			#
935#									#
936#########################################################################
937
938import glob
939
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of text node p for page resource (relevance 5).

    Returns the addStringHTML() result, or -1 if the node cannot be
    processed.  doc is unused.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
948
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of paragraph node p for page resource (relevance 5).

    Returns the addStringHTML() result, or -1 if the node cannot be
    processed.  doc is unused.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
957
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of <pre> node p for page resource (relevance 5).

    Returns the addStringHTML() result, or -1 if the node cannot be
    processed.  doc is unused.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
966
# NOTE(review): this five-argument analyzeHTML is rebound by the two-argument
# definition that immediately follows it, so it can never be called under
# this name; it is kept only to leave the module's definitions untouched.
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node p for page resource (relevance 5).

    Returns the addStringHTML() result, or -1 if the node cannot be
    processed.  doc is unused.
    """
    try:
        return 0 + addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1
975
def analyzeHTML(doc, resource):
    """Index the text of a parsed HTML page.

    The page title (or "Page <resource>" when none is found) is recorded
    with addPage().  Headings h1/h2/h3 set the current section and anchor
    id; every text node is indexed under the current section.
    Returns the number of nodes indexed.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval("//head/title")[0].content
        except Exception:
            title = "Page %s" % (resource)
        addPage(resource, title)
        try:
            section = title
            id = ""
            for item in ctxt.xpathEval("//h1 | //h2 | //h3 | //text()"):
                if item.name in ('h1', 'h2', 'h3'):
                    section = item.content
                    if item.prop("id"):
                        id = item.prop("id")
                    elif item.prop("name"):
                        id = item.prop("name")
                elif item.type == 'text':
                    analyzeHTMLText(doc, resource, item, section, id)
                    para = para + 1
                # The two branches below cannot match nodes selected by the
                # XPath expression above; kept in case it is extended.
                elif item.name == 'p':
                    analyzeHTMLPara(doc, resource, item, section, id)
                    para = para + 1
                elif item.name == 'pre':
                    analyzeHTMLPre(doc, resource, item, section, id)
                    para = para + 1
                else:
                    print("Page %s, unexpected %s element" % (resource, item.name))
        except Exception:
            print("Page %s: problem analyzing" % (resource))
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # XPath contexts of the libxml2 bindings must be released explicitly
        ctxt.xpathFreeContext()

    return para
1012
def analyzeHTMLPages():
    """Index the HTML pages of the current directory and of tutorial/.

    Files whose name starts with "API" and the file "xslt.html" are
    skipped.  Returns the number of pages successfully analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xslt.html":
            continue
        try:
            doc = libxml2.htmlParseFile(html, None)
        except Exception:
            print("could not parse %s" % (html))
            continue
        try:
            try:
                res = analyzeHTML(doc, html)
                print("Parsed %s : %d paragraphs" % (html, res))
                ret = ret + 1
            except Exception:
                print("could not parse %s" % (html))
        finally:
            # libxml2 documents are not garbage collected; free each page
            doc.freeDoc()
    return ret
1029
1030#########################################################################
1031#									#
1032#                  Mail archives parsing and analysis			#
1033#									#
1034#########################################################################
1035
1036import time
1037
def getXMLDateArchive(t = None):
    """Return the URL of the monthly by-date index of the xslt mail archive.

    t is a time in seconds since the epoch (default: now); the month and
    year are taken from its UTC representation.
    """
    if t is None:
        t = time.time()
    stamp = time.gmtime(t)
    return "http://mail.gnome.org/archives/xslt/%d-%s/date.html" % (
        stamp[0], time.strftime("%B", stamp))
1046
def scanXMLMsgArchive(url, title, force = 0):
    """Index one archived mail message.

    The message is skipped when it is already recorded in the archives
    table, unless force is non-zero.  The title words get relevance 20 and
    the text found inside <pre> elements relevance 5.
    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    print("Loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    # The document and its XPath context must be released explicitly.
    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval("//pre//text()"):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return 1
1076
def scanXMLDateArchive(t = None, force = 0):
    """Index the messages linked from a monthly archive index page.

    wordsDictArchive is reset first.  Every link whose href starts with
    "msg" is resolved against the index URL and passed to
    scanXMLMsgArchive(), after stripping one leading 'Re: ', '[xml] ' and
    '[xslt] ' from its title.
    Returns the number of messages indexed, or -1 when the index page
    cannot be parsed.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1

    newmsg = 0
    # The document and its XPath context must be released explicitly.
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval("//a[@href]"):
                href = anchor.prop("href")
                if href is None or href[0:3] != "msg":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == 'Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == '[xml] ':
                        title = title[6:]
                    if title is not None and title[0:7] == '[xslt] ':
                        title = title[7:]
                    newmsg = newmsg + scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # best effort: report the failure and go on with the
                    # next message
                    print("Failed to index message %s" % (href))
                    print("%s %s" % sys.exc_info()[:2])
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()

    return newmsg
1116
1117
1118#########################################################################
1119#									#
1120#          Main code: open the DB, the API XML and analyze it		#
1121#									#
1122#########################################################################
# Open the database connection when this point of the module is executed;
# on failure report the error and terminate with exit status 1.
try:
    openMySQL()
except Exception:
    print("Failed to open the database")
    # sys.exc_info() replaces the deprecated sys.exc_type / sys.exc_value
    # globals; the printed text is the same "<type> <value>" pair.
    print("%s %s" % sys.exc_info()[:2])
    sys.exit(1)
1129
def analyzeArchives(t = None, force = 0):
    """Index one month of the mail archive and store the word associations.

    t and force are passed unchanged to scanXMLDateArchive().  Afterwards
    every entry of the global wordsDictArchive, which maps a word to a
    dictionary of {id: relevance}, is handed to updateWordArchive().
    Words whose entry is None are ignored.
    """
    global wordsDictArchive

    ret = scanXMLDateArchive(t, force)
    if ret < 0:
        # scanXMLDateArchive() returns -1 after printing its own failure
        # message; there is nothing to store and -1 is not a page count.
        return
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    count = 0
    for word, refs in wordsDictArchive.items():
        if refs is None:
            continue
        for ident, relevance in refs.items():
            updateWordArchive(word, ident, relevance)
            count = count + 1

    # These associations come from the mail archive, not from the HTML
    # documentation pages handled by analyzeHTMLTop().
    print("Found %d associations in archive pages" % (count))
1149
def analyzeHTMLTop():
    """Index the HTML pages and store the resulting word associations.

    Runs analyzeHTMLPages(), then walks the global wordsDictHTML, which
    maps a word to a dictionary of {resource: (relevance, id, section)},
    and passes every entry to updateWordHTML().  Words whose entry is
    None are ignored.
    """
    global wordsDictHTML

    pages = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), pages))

    found = 0
    for word, refs in wordsDictHTML.items():
        if refs is None:
            continue
        for resource, entry in refs.items():
            relevance, ident, section = entry
            updateWordHTML(word, resource, section, ident, relevance)
            found += 1

    print("Found %d associations in HTML pages" % (found))
1169
def analyzeAPITop():
    """Analyze the API description and store the resulting word associations.

    The document named by the global API is loaded with loadAPI() and
    processed by analyzeAPI(); any failure there is reported and ends the
    program with exit status 1.  Then every entry of the global wordsDict,
    which maps a word to a dictionary keyed by (module, symbol), is passed
    to updateWord().  Words whose entry is None are counted as skipped.
    """
    global wordsDict
    global API

    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        # sys.exc_info() replaces the deprecated sys.exc_type /
        # sys.exc_value globals; the printed text is unchanged.
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    count = 0
    skipped = 0
    for word, refs in wordsDict.items():
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol), relevance in refs.items():
            updateWord(word, symbol, relevance)
            count = count + 1

    print("Found %d associations, skipped %d words" % (count, skipped))
1197
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = "Usage index.py [--force] [--archive]  [--archive-year year] [--archive-month month] [--API] [--docs]"
    print(synopsis)
    sys.exit(1)
1201
def _indexArchiveMonth(spec, force):
    """Index the mail archive of the month named by spec, e.g. "2003-January".

    Any failure is reported on standard output and swallowed so that the
    caller can continue with its remaining work.
    """
    try:
        T = time.strptime(spec, "%Y-%B")
        # Ten days are added to the first day of the month; presumably so
        # that the local-time mktime() here and the gmtime() used by
        # getXMLDateArchive() cannot land in different months.
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Process the command line options in the order they are given.

    --force                 set the force flag for the options after it
    --archive               index the current month of the mail archive
    --archive-year year     index the twelve months of that year
    --archive-month month   index one month, given as e.g. 2003-January
    --API                   analyze the API description
    --docs                  analyze the HTML pages

    An empty command line, an unknown option or an option missing its
    value prints the usage message; usage() ends the program.
    """
    months = ("January", "February", "March", "April", "May", "June",
              "July", "August", "September", "October", "November",
              "December")
    args = sys.argv[1:]
    if not args:
        usage()

    force = 0
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # The year is missing: report it instead of failing with
                # an IndexError traceback.
                usage()
            year = args[i]
            for month in months:
                _indexArchiveMonth("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                # Same guard as above for a missing month value.
                usage()
            _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1
1246
# Run the command line driver only when this file is executed as a script.
if __name__ == "__main__":
    main()
1249