#!/usr/bin/python -u
#
# imports the API description and fills up a database with
# name relevance to modules, functions or web pages
#
# Operation needed:
# =================
#
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
# Change the root passwd of mysql:
#          mysqladmin -u root password new_password
# Create the new database xmlsoft
#          mysqladmin -p create xmlsoft
# Create a database user 'veillard' and give him password access
# change veillard and abcde with the right user name and passwd
#          mysql -p
#          password:
#          mysql> GRANT ALL PRIVILEGES ON xmlsoft TO veillard@localhost
#                 IDENTIFIED BY 'abcde' WITH GRANT OPTION;
#
# As the user check the access:
#          mysql -p xmlsoft
#          Enter password:
#          Welcome to the MySQL monitor....
#          mysql> use xmlsoft
#          Database changed
#          mysql> quit
#          Bye
#
# Then run the script in the doc subdir, it will create the XSLTsymbols and
# word tables and populate them with information extracted from
# the libxslt-api.xml API description, and make them accessible read-only
# by nobody@localhost the user expected to be Apache's one
#
# On the Apache configuration, make sure you have php support enabled
#

import MySQLdb
import libxml2
import sys
import string
import os

#
# We are not interested in parsing errors here
#
def callback(ctx, str):
    """libxml2 error handler that discards every message."""
    return
libxml2.registerErrorHandler(callback, None)

#
# The dictionary of tables required and the SQL command needed
# to create them
#
TABLES={
  "XSLTsymbols" : """CREATE TABLE XSLTsymbols (
           name varchar(255) BINARY NOT NULL,
           module varchar(255) BINARY NOT NULL,
           type varchar(25) NOT NULL,
           descr varchar(255),
           UNIQUE KEY name (name),
           KEY module (module))""",
  "XSLTwords" : """CREATE TABLE XSLTwords (
           name varchar(50) BINARY NOT NULL,
           symbol varchar(255) BINARY NOT NULL,
           relevance int,
           KEY name (name),
           KEY symbol (symbol),
           UNIQUE KEY ID (name, symbol))""",
  "XSLTwordsHTML" : """CREATE TABLE XSLTwordsHTML (
           name varchar(50) BINARY NOT NULL,
           resource varchar(255) BINARY NOT NULL,
           section varchar(255),
           id varchar(50),
           relevance int,
           KEY name (name),
           KEY resource (resource),
           UNIQUE KEY ref (name, resource))""",
  "XSLTwordsArchive" : """CREATE TABLE XSLTwordsArchive (
           name varchar(50) BINARY NOT NULL,
           ID int(11) NOT NULL,
           relevance int,
           KEY name (name),
           UNIQUE KEY ref (name, ID))""",
  "XSLTpages" : """CREATE TABLE XSLTpages (
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY name (resource))""",
  "archives" : """CREATE TABLE archives (
           ID int(11) NOT NULL auto_increment,
           resource varchar(255) BINARY NOT NULL,
           title varchar(255) BINARY NOT NULL,
           UNIQUE KEY id (ID,resource(255)),
           INDEX (ID),
           INDEX (resource))""",
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
           Value varchar(50) NOT NULL,
           Count int(11) NOT NULL,
           UNIQUE KEY id (ID,Value(35)),
           INDEX (ID))""",
}

#
# The XML API description file to parse
#
API="libxslt-api.xml"
DB=None

#########################################################################
#                                                                       #
#                  MySQL database interfaces                            #
#                                                                       #
#########################################################################
def createTable(db, name):
    """Drop table `name` if it exists and recreate it from TABLES.

    Returns the value of the CREATE `execute()` call, or -1 when an
    argument is None or the creation fails (including an unknown name).
    """
    if db is None:
        return -1
    if name is None:
        return -1
    c = db.cursor()

    # Table names cannot be bound as query parameters; `name` is expected
    # to be one of the keys of TABLES.
    ret = c.execute("DROP TABLE IF EXISTS %s" % (name))
    if ret == 1:
        print("Removed table %s" % (name))
    print("Creating table %s" % (name))
    try:
        ret = c.execute(TABLES[name])
    except Exception:
        print("Failed to create table %s" % (name))
        return -1
    return ret

def checkTables(db):
    """Make sure every table of TABLES exists and is readable.

    Missing tables are created, unreadable ones are repaired, and read
    access is granted to nobody@localhost.  Returns 0, or -1 if `db` is
    None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute("show tables")
    print("Found %d tables" % (nbtables))
    existing = [row[0] for row in c.fetchall()]

    for table in TABLES.keys():
        if table not in existing:
            print("table %s missing" % (table))
            createTable(db, table)
        try:
            ret = c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
        except Exception:
            print("Troubles with table %s : repairing" % (table))
            ret = c.execute("repair table %s" % table)
            print("repairing returned %d" % (ret))
            ret = c.execute("SELECT count(*) from %s" % table)
            row = c.fetchone()
            print("Table %s contains %d records" % (table, row[0]))
    print("checkTables finished")

    # make sure apache can access the tables read-only
    try:
        ret = c.execute("GRANT SELECT ON xmlsoft.* TO nobody@localhost")
        ret = c.execute("GRANT INSERT,SELECT,UPDATE ON xmlsoft.Queries TO nobody@localhost")
    except Exception:
        # Best effort only: indexing still works without the grants, but
        # say so instead of hiding the failure.
        print("Could not grant access to nobody@localhost")
        print("%s %s" % sys.exc_info()[:2])
    return 0

def openMySQL(db="xmlsoft", passwd=None):
    """Connect to database `db`, store the connection in DB and check tables.

    When `passwd` is None the password is read from the MySQL_PASS
    environment variable; the script exits with status 1 if it is unset.
    Returns the result of checkTables(), or -1 if no connection was made.
    """
    global DB

    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print("No password available, set environment MySQL_PASS")
            sys.exit(1)

    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    ret = checkTables(DB)
    return ret

def _getDB():
    """Return the shared connection, opening it on first use."""
    if DB is None:
        openMySQL()
    return DB

def _insertOrUpdate(what, insert, insertArgs, update, updateArgs):
    """Run `insert`; if it fails (row already present) run `update`.

    Both statements use bound parameters so values never need quoting.
    Returns the `execute()` result of the statement that succeeded, or -1
    after printing a diagnostic when both fail.
    """
    c = DB.cursor()
    try:
        return c.execute(insert, insertArgs)
    except Exception:
        # Expected when the unique key already exists: fall back to UPDATE.
        pass
    try:
        return c.execute(update, updateArgs)
    except Exception:
        print("%s failed command" % (what))
        print("%s with arguments %s" % (update, repr(updateArgs)))
        print("%s %s" % sys.exc_info()[:2])
        return -1

def updateWord(name, symbol, relevance):
    """Store the relevance of word `name` for `symbol` in XSLTwords.

    Returns the `execute()` result, or -1 on missing arguments or failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if symbol is None:
        return -1

    return _insertOrUpdate(
        "Update word (%s, %s, %s)" % (name, symbol, relevance),
        "INSERT INTO XSLTwords (name, symbol, relevance) VALUES (%s, %s, %s)",
        (name, symbol, relevance),
        "UPDATE XSLTwords SET relevance = %s where name = %s and symbol = %s",
        (relevance, name, symbol))

def updateSymbol(name, module, type, desc):
    """Record symbol `name` of kind `type` from `module` in XSLTsymbols.

    The symbol name itself is first indexed as a word with relevance 50.
    Only the first sentence of `desc` (up to the first '.', at most 99
    characters, quotes replaced by spaces) is stored.
    Returns the `execute()` result, or -1 on missing arguments or failure.
    """
    updateWord(name, name, 50)
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if module is None:
        return -1
    if type is None:
        return -1

    try:
        desc = desc.replace("'", " ")
        desc = desc.split(".")[0]
        desc = desc[0:99]
    except (AttributeError, TypeError):
        # desc was None or not a string
        desc = ""

    return _insertOrUpdate(
        "Update symbol (%s, %s, %s)" % (name, module, type),
        "INSERT INTO XSLTsymbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
        (name, module, type, desc),
        "UPDATE XSLTsymbols SET module=%s, type=%s, descr=%s where name=%s",
        (module, type, desc, name))

def addFunction(name, module, desc = ""):
    """Record `name` as a 'function' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'function', desc)

def addMacro(name, module, desc = ""):
    """Record `name` as a 'macro' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'macro', desc)

def addEnum(name, module, desc = ""):
    """Record `name` as an 'enum' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'enum', desc)

def addStruct(name, module, desc = ""):
    """Record `name` as a 'struct' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'struct', desc)

def addConst(name, module, desc = ""):
    """Record `name` as a 'const' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'const', desc)

def addType(name, module, desc = ""):
    """Record `name` as a 'type' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'type', desc)

def addFunctype(name, module, desc = ""):
    """Record `name` as a 'functype' symbol; see updateSymbol()."""
    return updateSymbol(name, module, 'functype', desc)

def addPage(resource, title):
    """Record the title of HTML page `resource` in XSLTpages.

    Returns the `execute()` result, or -1 on missing resource or failure.
    """
    if _getDB() is None:
        return -1
    if resource is None:
        return -1

    # The failure label used to format undefined names (name, module),
    # which raised NameError inside the error handler.
    return _insertOrUpdate(
        "Update page (%s, %s)" % (resource, title),
        "INSERT INTO XSLTpages (resource, title) VALUES (%s, %s)",
        (resource, title),
        "UPDATE XSLTpages SET title=%s WHERE resource=%s",
        (title, resource))

def updateWordHTML(name, resource, desc, id, relevance):
    """Store the relevance of word `name` for HTML page `resource`.

    `desc` is the section title (quotes replaced by spaces, truncated to
    99 characters) and `id` the anchor of that section; both default to
    "" when None.
    Returns the `execute()` result, or -1 on missing arguments or failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if resource is None:
        return -1
    if id is None:
        id = ""
    if desc is None:
        desc = ""
    else:
        try:
            desc = desc.replace("'", " ")
            desc = desc[0:99]
        except (AttributeError, TypeError):
            desc = ""

    return _insertOrUpdate(
        "Update symbol (%s, %s, %s)" % (name, resource, relevance),
        "INSERT INTO XSLTwordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)",
        (name, resource, desc, id, relevance),
        "UPDATE XSLTwordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s",
        (desc, id, relevance, name, resource))

def checkXMLMsgArchive(url):
    """Return the archives.ID recorded for `url`, or -1 if there is none."""
    if _getDB() is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except Exception:
        return -1
    if row is None:
        return -1
    return row[0]

def addXMLMsgArchive(url, title):
    """Insert archived message `url` into archives and return its ID.

    `title` has quotes replaced by spaces and is truncated to 99
    characters.  Returns -1 on missing url or database failure.
    """
    if _getDB() is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = ""
    else:
        title = title.replace("'", " ")
        title = title[0:99]

    c = DB.cursor()
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
    except Exception:
        print("addXMLMsgArchive failed command: %s" % (cmd))
        return -1
    if row is None:
        print("addXMLMsgArchive failed to get the ID: %s" % (url))
        return -1

    return int(row[0])

def updateWordArchive(name, id, relevance):
    """Store the relevance of word `name` for archived message `id`.

    Returns the `execute()` result, or -1 on missing arguments or failure.
    """
    if _getDB() is None:
        return -1
    if name is None:
        return -1
    if id is None:
        return -1

    return _insertOrUpdate(
        "Update word archive (%s, %s, %s)" % (name, id, relevance),
        "INSERT INTO XSLTwordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
        (name, id, relevance),
        "UPDATE XSLTwordsArchive SET relevance=%s where name=%s and ID=%s",
        (relevance, name, id))

#########################################################################
#                                                                       #
#                  Word dictionary and analysis routines                #
#                                                                       #
#########################################################################

#
# top 100 english word without the one len < 3 + own set
# (kept as a dictionary mapping each word to 0, as before)
#
dropWords = dict.fromkeys("""
    the this can man had him only
    and not been other even are was
    new most but when some made from
    who could after that will time also
    have more these did two many
    they may before for which out then
    must one through with you said
    first back were what any years his
    her where all its now much she
    about such your there into like
    would than our well their them over
    down
    net www bad Okay bin cur
""".split(), 0)

wordsDict = {}
wordsDictHTML = {}
wordsDictArchive = {}

# Every character that cleanupWordsString() turns into a blank
_WORD_SEPARATORS = ".!?,'\";(){}<>=/*:#\\\n\r\xc2\xa0"

def cleanupWordsString(str):
    """Return `str` with punctuation and line breaks replaced by spaces."""
    text = str
    for sep in _WORD_SEPARATORS:
        text = text.replace(sep, " ")
    return text

def cleanupDescrString(str):
    """Return `str` with quotes removed and whitespace runs collapsed.

    Quotes, line breaks and the "\\xc2"/"\\xa0" characters become blanks,
    then the words are re-joined with single spaces.
    """
    text = str
    for sep in "'\n\r\xc2\xa0":
        text = text.replace(sep, " ")
    # Join the split words; the old code joined the characters of the
    # string itself, producing "T i t l e".
    return " ".join(text.split())

def splitIdentifier(str):
    """Split a C identifier into its lower-cased alphabetic words.

    A word starts on a letter, continues over a run of upper-case letters
    followed by a run of lower-case letters; digits after it are dropped
    and any other character is skipped.
    """
    ret = []
    rest = str
    while rest != "":
        cur = rest[0].lower()
        rest = rest[1:]
        if (cur < 'a') or (cur > 'z'):
            continue
        while (rest != "") and (rest[0] >= 'A') and (rest[0] <= 'Z'):
            cur = cur + rest[0].lower()
            rest = rest[1:]
        while (rest != "") and (rest[0] >= 'a') and (rest[0] <= 'z'):
            cur = cur + rest[0]
            rest = rest[1:]
        while (rest != "") and (rest[0] >= '0') and (rest[0] <= '9'):
            rest = rest[1:]
        ret.append(cur)
    return ret

def addWord(word, module, symbol, relevance):
    """Accumulate `relevance` for `word` against (module, symbol) in wordsDict.

    Returns the accumulated relevance, 0 when the word is ignored
    (drop word, non-ASCII start, or already used by more than 500
    symbols) and -1 on invalid arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            # Too common to be useful: mark the word as dropped for good
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get((module, symbol), 0)
    else:
        wordsDict[word] = {}
    wordsDict[word][(module, symbol)] = relevance
    return relevance

def addString(str, module, symbol, relevance):
    """Index every word of `str` for (module, symbol) with `relevance`.

    Returns the sum of the addWord() results, or -1 if `str` is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Honour the caller's relevance (it used to be forced to 5)
            ret = ret + addWord(word, module, symbol, relevance)

    return ret

def addWordHTML(word, resource, id, section, relevance):
    """Accumulate `relevance` for `word` on page `resource` in wordsDictHTML.

    The first non-None id and section seen for a (word, resource) pair
    are kept.  Returns the accumulated relevance, 0 when the word is
    ignored and -1 on invalid arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance

def addStringHTML(str, resource, id, section, relevance):
    """Index every word of `str` for HTML page `resource`.

    Words for which addWordHTML() fails are reported and not counted.
    Returns the sum of the successful results, or -1 if `str` is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordHTML(word, resource, id, section, relevance)
                if r < 0:
                    print("addWordHTML failed: %s %s" % (word, resource))
                else:
                    ret = ret + r
            except Exception:
                print("addWordHTML failed: %s %s %d" % (word, resource, relevance))
                print("%s %s" % sys.exc_info()[:2])

    return ret

def addWordArchive(word, id, relevance):
    """Accumulate `relevance` for `word` in archived message `id`.

    Returns the accumulated relevance, 0 when the word is ignored and -1
    on invalid arguments (including id == -1).
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print("skipped %s" % (word))
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance

def addStringArchive(str, id, relevance):
    """Index every word of `str` for archived message `id`.

    Words for which addWordArchive() fails are reported and not counted.
    Returns the sum of the successful results, or -1 if `str` is None or
    shorter than 3 characters.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            try:
                r = addWordArchive(word, id, relevance)
                if r < 0:
                    print("addWordArchive failed: %s %s" % (word, id))
                else:
                    ret = ret + r
            except Exception:
                print("addWordArchive failed: %s %s %d" % (word, id, relevance))
                print("%s %s" % sys.exc_info()[:2])
    return ret

#########################################################################
#                                                                       #
#                  XML API description analysis                         #
#                                                                       #
#########################################################################

def loadAPI(filename):
    """Parse the XML API description `filename` and return the document."""
    doc = libxml2.parseFile(filename)
    print("loaded %s" % (filename))
    return doc

def foundExport(file, symbol):
    """Record `symbol` as a function exported by `file` and index its name.

    Returns 1 if the symbol was recorded, 0 if an argument is None.
    """
    if file is None:
        return 0
    if symbol is None:
        return 0
    addFunction(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1

def analyzeAPIFile(top):
    """Index the <exports> children of a <file> element; return their count."""
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "exports":
            count = count + foundExport(name, cur.prop("symbol"))
        else:
            # Two values for two %s: the old single argument raised TypeError
            print("unexpected element %s in API doc <file name='%s'>" % (cur.name, name))
        cur = cur.next
    return count

def analyzeAPIFiles(top):
    """Index every <file> child of a <files> element; return the export count."""
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "file":
            count = count + analyzeAPIFile(cur)
        else:
            print("unexpected element %s in API doc <files>" % (cur.name))
        cur = cur.next
    return count

def _indexSimpleSymbol(top, register):
    """Register the symbol described by `top` and index the words of its name.

    `register` is one of the add*() functions.  Returns 1 on success, 0
    when the element lacks a "file" or "name" attribute.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    register(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1

def analyzeAPIEnum(top):
    """Index an <enum> element; returns 1 if recorded, else 0."""
    return _indexSimpleSymbol(top, addEnum)

def analyzeAPIConst(top):
    """Index a <const> element; returns 1 if recorded, else 0."""
    return _indexSimpleSymbol(top, addConst)

def analyzeAPIType(top):
    """Index a <typedef> element; returns 1 if recorded, else 0."""
    return _indexSimpleSymbol(top, addType)

def analyzeAPIFunctype(top):
    """Index a <functype> element; returns 1 if recorded, else 0."""
    return _indexSimpleSymbol(top, addFunctype)

def analyzeAPIStruct(top):
    """Index a <struct> element, including the words of its "info" attribute.

    Returns 1 if recorded, 0 when "file" or "name" is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        info = info.replace("'", " ").strip()
        for word in info.split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1

def analyzeAPIMacro(top):
    """Index a <macro> element and the words of its <info> child.

    Returns 1 when the macro has an <info> description, 0 otherwise
    (the macro is still recorded if only <info> is missing).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print("Macro %s description has no <info>" % (symbol))
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1

def analyzeAPIFunction(top):
    """Index a <function> element: its name, description, return and args.

    Returns 1 if recorded, 0 when "file" or "name" is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    symbol = symbol.replace("'", " ").strip()
    info = None
    cur = top.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "info":
            info = cur.content
        elif cur.name == "return":
            rinfo = cur.prop("info")
            if rinfo is not None:
                rinfo = rinfo.replace("'", " ").strip()
                addString(rinfo, file, symbol, 7)
        elif cur.name == "arg":
            ainfo = cur.prop("info")
            if ainfo is not None:
                ainfo = ainfo.replace("'", " ").strip()
                addString(ainfo, file, symbol, 5)
            name = cur.prop("name")
            if name is not None:
                name = name.replace("'", " ").strip()
                addWord(name, file, symbol, 7)
        cur = cur.next
    if info is None:
        print("Function %s description has no <info>" % (symbol))
        addFunction(symbol, file, "")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    return 1

def analyzeAPISymbols(top):
    """Index every child of a <symbols> element; return how many were recorded."""
    handlers = {
        "macro": analyzeAPIMacro,
        "function": analyzeAPIFunction,
        "const": analyzeAPIConst,
        "typedef": analyzeAPIType,
        "struct": analyzeAPIStruct,
        "enum": analyzeAPIEnum,
        "functype": analyzeAPIFunctype,
    }
    count = 0
    cur = top.children

    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        handler = handlers.get(cur.name)
        if handler is not None:
            count = count + handler(cur)
        else:
            print("unexpected element %s in API doc <symbols>" % (cur.name))
        cur = cur.next
    return count

def analyzeAPI(doc):
    """Index the <symbols> section of the parsed API document.

    The <files> section is deliberately skipped.  Returns the number of
    symbols recorded, or -1 if `doc` is None or its root is not <api>.
    """
    count = 0
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != "api":
        print("Unexpected root name")
        return -1
    cur = root.children
    while cur is not None:
        if cur.type == 'text':
            cur = cur.next
            continue
        if cur.name == "files":
            # not indexed; analyzeAPIFiles(cur) is intentionally not called
            pass
        elif cur.name == "symbols":
            count = count + analyzeAPISymbols(cur)
        else:
            print("unexpected element %s in API doc" % (cur.name))
        cur = cur.next
    return count

#########################################################################
#                                                                       #
#                  Web pages parsing and analysis                       #
#                                                                       #
#########################################################################

import glob

def _indexHTMLNode(resource, p, section, id):
    """Index the text content of node `p` with relevance 5; -1 on any error."""
    try:
        return addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        return -1

def analyzeHTMLText(doc, resource, p, section, id):
    """Index the words of text node `p`; returns the word score or -1."""
    return _indexHTMLNode(resource, p, section, id)

def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the words of paragraph `p`; returns the word score or -1."""
    return _indexHTMLNode(resource, p, section, id)

def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the words of <pre> block `p`; returns the word score or -1."""
    return _indexHTMLNode(resource, p, section, id)

# A five-argument analyzeHTML() used to be defined here; it was dead code,
# immediately replaced by the definition below.

def analyzeHTML(doc, resource):
    """Record page `resource` and index its text under the current heading.

    The page title (or "Page <resource>") is stored with addPage(); each
    h1/h2/h3 starts a new section whose id/name attribute is remembered.
    Returns the number of text nodes handed to the indexer.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        res = ctxt.xpathEval("//head/title")
        title = res[0].content
    except Exception:
        title = "Page %s" % (resource)
    addPage(resource, title)
    try:
        items = ctxt.xpathEval("//h1 | //h2 | //h3 | //text()")
        section = title
        id = ""
        for item in items:
            if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                section = item.content
                if item.prop("id"):
                    id = item.prop("id")
                elif item.prop("name"):
                    id = item.prop("name")
            elif item.type == 'text':
                analyzeHTMLText(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'p':
                analyzeHTMLPara(doc, resource, item, section, id)
                para = para + 1
            elif item.name == 'pre':
                analyzeHTMLPre(doc, resource, item, section, id)
                para = para + 1
            else:
                print("Page %s, unexpected %s element" % (resource, item.name))
    except Exception:
        print("Page %s: problem analyzing" % (resource))
        print("%s %s" % sys.exc_info()[:2])
    # XPath contexts of the libxml2 bindings are not garbage collected
    ctxt.xpathFreeContext()

    return para

def analyzeHTMLPages():
    """Index *.html and tutorial/*.html, skipping API* pages and xslt.html.

    Returns the number of pages successfully analyzed.
    """
    ret = 0
    HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
    for html in HTMLfiles:
        if html[0:3] == "API":
            continue
        if html == "xslt.html":
            continue
        doc = None
        try:
            doc = libxml2.htmlParseFile(html, None)
            res = analyzeHTML(doc, html)
            print("Parsed %s : %d paragraphs" % (html, res))
            ret = ret + 1
        except Exception:
            print("could not parse %s" % (html))
        if doc is not None:
            # release the parsed tree, it is not needed once indexed
            doc.freeDoc()
    return ret

#########################################################################
#                                                                       #
#                  Mail archives parsing and analysis                   #
#                                                                       #
#########################################################################

import time

def getXMLDateArchive(t = None):
    """Return the URL of the by-date archive index for the month of `t`.

    `t` is a time in seconds since the epoch; the current time is used
    when it is None.
    """
    if t is None:
        t = time.time()
    T = time.gmtime(t)
    month = time.strftime("%B", T)
    year = T[0]
    url = "http://mail.gnome.org/archives/xslt/%d-%s/date.html" % (year, month)
    return url

def scanXMLMsgArchive(url, title, force = 0):
    """Fetch archived message `url` and index its title and <pre> text.

    Messages already present in the archives table are skipped unless
    `force` is non-zero.  Returns 1 if the message was indexed, else 0.
    """
    if url is None or title is None:
        return 0

    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0

    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print("Loading %s" % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return 0

    addStringArchive(title, ID, 20)
    ctxt = doc.xpathNewContext()
    try:
        for text in ctxt.xpathEval("//pre//text()"):
            addStringArchive(text.content, ID, 5)
    finally:
        # libxml2 objects must be released explicitly
        ctxt.xpathFreeContext()
        doc.freeDoc()

    return 1

def scanXMLDateArchive(t = None, force = 0):
    """Index every message linked from the monthly archive index for `t`.

    Resets wordsDictArchive first.  Returns the number of newly indexed
    messages, or -1 if the index page cannot be loaded.
    """
    global wordsDictArchive

    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print("loading %s" % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print("Failed to parse %s" % (url))
        return -1
    ctxt = doc.xpathNewContext()
    links = 0
    newmsg = 0
    try:
        anchors = ctxt.xpathEval("//a[@href]")
        for anchor in anchors:
            href = anchor.prop("href")
            if href is None or href[0:3] != "msg":
                continue
            try:
                links = links + 1

                msg = libxml2.buildURI(href, url)
                title = anchor.content
                # strip the usual reply and list prefixes from the subject
                if title is not None and title[0:4] == 'Re: ':
                    title = title[4:]
                if title is not None and title[0:6] == '[xml] ':
                    title = title[6:]
                if title is not None and title[0:7] == '[xslt] ':
                    title = title[7:]
                newmsg = newmsg + scanXMLMsgArchive(msg, title, force)

            except Exception:
                # keep going with the other messages, but report the failure
                print("Failed to index message %s" % (href))
                print("%s %s" % sys.exc_info()[:2])
    finally:
        ctxt.xpathFreeContext()
        doc.freeDoc()

    return newmsg


#########################################################################
#                                                                       #
#          Main code: open the DB, the API XML and analyze it           #
#                                                                       #
#########################################################################
try:
    openMySQL()
except Exception:
    print("Failed to open the database")
    print("%s %s" % sys.exc_info()[:2])
    sys.exit(1)

def analyzeArchives(t = None, force = 0):
    """Index the mail archive for the month of `t` and store the words."""
    ret = scanXMLDateArchive(t, force)
    print("Indexed %d words in %d archive pages" % (len(wordsDictArchive), ret))

    i = 0
    skipped = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for id in refs.keys():
            relevance = refs[id]
            updateWordArchive(word, id, relevance)
            i = i + 1

    print("Found %d associations in archive pages" % (i))

def analyzeHTMLTop():
    """Index the HTML documentation pages and store the words."""
    ret = analyzeHTMLPages()
    print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

    i = 0
    skipped = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for resource in refs.keys():
            (relevance, id, section) = refs[resource]
            updateWordHTML(word, resource, section, id, relevance)
            i = i + 1

    print("Found %d associations in HTML pages" % (i))

def analyzeAPITop():
    """Parse the API description file and store symbols and words.

    Exits with status 1 if the file cannot be parsed or analyzed.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print("Analyzed %d blocs" % (ret))
        doc.freeDoc()
    except Exception:
        print("Failed to parse and analyze %s" % (API))
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print("Indexed %d words" % (len(wordsDict)))
    i = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            skipped = skipped + 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            i = i + 1

    print("Found %d associations, skipped %d words" % (i, skipped))

def usage():
    """Print the command line help and exit with status 1."""
    print("Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs]")
    sys.exit(1)

_MONTHS = ["January" , "February", "March", "April", "May",
           "June", "July", "August", "September", "October",
           "November", "December"]

def _indexArchiveMonth(spec, force):
    """Index the archive of month `spec` ("<year>-<full month name>")."""
    try:
        T = time.strptime(spec, "%Y-%B")
        # aim 10 days into the month; presumably so the local/GMT offset
        # cannot select the previous month -- TODO confirm
        t = time.mktime(T) + 3600 * 24 * 10
        analyzeArchives(t, force)
    except Exception:
        print("Failed to index month archive:")
        print("%s %s" % sys.exc_info()[:2])

def main():
    """Run the actions requested on the command line, in order.

    --force only affects the archive actions that follow it.  Unknown
    options, a missing option value or an empty command line print the
    usage and exit.
    """
    args = sys.argv[1:]
    force = 0
    if not args:
        usage()
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '--force':
            force = 1
        elif arg == '--archive':
            analyzeArchives(None, force)
        elif arg == '--archive-year':
            i = i + 1
            if i >= len(args):
                # option given without its value
                usage()
            year = args[i]
            for month in _MONTHS:
                _indexArchiveMonth("%s-%s" % (year, month), force)
        elif arg == '--archive-month':
            i = i + 1
            if i >= len(args):
                usage()
            _indexArchiveMonth(args[i], force)
        elif arg == '--API':
            analyzeAPITop()
        elif arg == '--docs':
            analyzeHTMLTop()
        else:
            usage()
        i = i + 1

if __name__ == "__main__":
    main()