Package pylal :: Module scrapeHtmlUtils
[hide private]
[frames] | no frames]

Source Code for Module pylal.scrapeHtmlUtils

  1  """ 
  2  followup web page scraping utilities 
  3  """ 
  4   
  5  __author__ = 'Cristina Valeria Torres <cristina.torres@ligo.org>' 
  6   
  7  ###################################################################### 
  8   
  9  import os 
 10  import string 
 11  import sys 
 12  import time 
 13   
class scrapePage:
    """
    Split an HTML follow-up page around one table of interest.

    The page is cut into three pieces using two whole-line context
    keys: the text up to and including the start key, the text between
    the keys (which holds the table), and the text from the end key
    onwards.  The middle piece is parsed into ``self.tableObject`` (a
    list of rows, each a list of cell strings) so that cells can be
    read and replaced as ``table[row][col]`` while the rest of the page
    is kept for writing back to disk.

    The code runs unchanged on Python 2.6+ and Python 3: every
    ``print`` call passes a single parenthesised string, which both
    dialects print identically.
    """

    def __init__(self):
        """Create an empty scraper with the default context keys."""
        self.filename = ''
        self.fileRead = False
        self.originalText = list()
        # Whole-line markers that bracket the table of interest.
        self.startKey = "<h3>Follow-up tests</h3>\n"
        self.endKey = "<h3>Parameter estimation</h3>\n"
        self.saveLines = list()
        self.tableHead = str()
        self.tableFoot = str()
        self.topOfPage = list()
        self.middleOfPage = list()
        self.endOfPage = list()
        self.tableObject = list()
        self.rowNames = list()
        self.colNames = list()
        # Ends up as ["<table", "<tbody", "/table", "/tbody"].
        self.removeKey = ["<table", "<tbody"]
        self.removeKey.extend(
            ["/" + key.strip("<") for key in self.removeKey])
        self.tmpTableText = str()

    def setContextKeys(self, newStartKey="", newEndKey=""):
        """
        Replace the context keys used to select a single table.

        Each key must be one entire line of the source html file
        (including its trailing newline).  The text between the two
        keys is parsed as the table and the surrounding html is kept
        untouched.  Do not use partial-line matches or non-printing
        characters; the page will then not be split where expected.
        An empty string leaves the corresponding key unchanged.
        """
        if newStartKey != "":
            self.startKey = newStartKey
        if newEndKey != "":
            self.endKey = newEndKey

    def readfile(self, filename):
        """
        Read an html file and build the table object from it.

        The page is split around ``self.startKey`` and ``self.endKey``.
        If the start key is missing the page head is empty; if the end
        key is missing the page tail is empty.  In both cases a warning
        is printed and everything not assigned to head or tail is
        treated as the table section, so nothing is lost or duplicated
        when the page is written back.

        Any state left by a previous call is discarded first.
        """
        with open(filename) as fp:
            self.originalText = fp.readlines()
        # Start from a clean slate so a second call does not append to
        # the pieces and rows parsed by the first one.
        self.topOfPage = list()
        self.middleOfPage = list()
        self.endOfPage = list()
        self.tableObject = list()
        self.rowNames = list()
        self.colNames = list()
        allHTMLPage = str().join(self.originalText)
        # Cut off everything up to and including the start key.
        if self.startKey and self.startKey in allHTMLPage:
            pageHead, pageRemainder = allHTMLPage.split(self.startKey, 1)
            pageHead = pageHead + self.startKey
        else:
            print("Problem finding start key.")
            pageHead = ""
            pageRemainder = allHTMLPage
        # Cut off everything from the end key onwards.
        if self.endKey and self.endKey in pageRemainder:
            pageMiddle, pageTail = pageRemainder.split(self.endKey, 1)
            pageTail = self.endKey + pageTail
        else:
            print("Problem finding end key.")
            pageMiddle = pageRemainder
            pageTail = ""
        self.topOfPage.append(pageHead)
        self.middleOfPage.append(pageMiddle)
        self.endOfPage.append(pageTail)
        self.__createTableObject__()
        # self.rowNames is a list of [index, column-1 text, cell count];
        # rows too short to carry a label get ["\n", -1] placeholders.
        for rowNumber, row in enumerate(self.tableObject):
            if len(row) > 2:
                self.rowNames.append([rowNumber, row[1], len(row)])
            else:
                self.rowNames.append([rowNumber, "\n", -1])
        # Element 0 of the table object is the (empty) text before the
        # first <tr, so the first real row, used for the column labels,
        # is element 1.  It is absent when the section has no rows.
        if len(self.tableObject) > 1:
            for colNumber, cell in enumerate(self.tableObject[1]):
                self.colNames.append([colNumber, cell])

    def __createTableObject__(self, inputHTML=None):
        """
        Parse a chunk of html into ``self.tableObject``.

        If inputHTML is None the joined ``self.middleOfPage`` is used.
        Newlines are removed, the text before the first ``<tr`` is
        saved in ``self.tableHead`` and the text after the last
        ``</tr>`` in ``self.tableFoot``.  What remains is split into
        rows at every ``<tr`` and into cells at every ``<th``/``<td``;
        the rows are appended to ``self.tableObject``.

        Tables embedded in cells are temporarily replaced by
        ``<MARKITn>`` labels so their rows are not split, then put back
        into the cells.  Each sub-table is taken to run from a
        ``<table`` to the first following ``</table...>``.

        Raises SyntaxError if an opening ``<table`` has no closing tag
        after it; ``self.tmpTableText`` then holds the text that was
        being parsed.
        """
        if inputHTML is None:
            inputHTML = str().join(self.middleOfPage)
        tableText = inputHTML.replace("\n", "")
        # Split off the table head: everything before the first <tr.
        tableHeadString, tableSep, tableBodyString = \
            tableText.partition("<tr")
        tableText = tableSep + tableBodyString
        # Split off the table foot: everything after the last </tr>.
        tableText, tableSep, tableFootString = tableText.rpartition("</tr>")
        tableText = tableText + tableSep
        self.tableHead = tableHeadString
        self.tableFoot = tableFootString
        # subTableList -> [[MARKIT0,TXT],[MARKIT1,TXT]...[MARKITn,TXT]]
        subTableList = list()
        self.tmpTableText = tableText
        while "<table" in tableText:
            tableLabel = "<MARKIT%i>" % (len(subTableList))
            tableStart = tableText.find("<table")
            tableStop = tableText.find("</table")
            # Locate the real end of the closing tag rather than
            # assuming it is spelled exactly "</table>"; otherwise a
            # variant such as "</table >" is never replaced and this
            # loop does not terminate.
            if tableStop > tableStart:
                closeEnd = tableText.find(">", tableStop)
            else:
                closeEnd = -1
            if closeEnd == -1:
                print("Found unmatched table tag!?!?!?!")
                print("Not creating table object!")
                print("Read self.tmpTableText to diagnose html.")
                raise SyntaxError("unmatched <table> tag in table html")
            thisSubTable = tableText[tableStart:closeEnd + 1]
            tableText = (tableText[:tableStart] + tableLabel +
                         tableText[closeEnd + 1:])
            subTableList.append([tableLabel, thisSubTable])
        # Add newlines to help clean up table formatting.
        for tag in ("</td>", "</th>", "</table>", "<tr>", "</tr>"):
            tableText = tableText.replace(tag, tag + "\n")
        # Convert the text into rows of cells.
        for row in tableText.replace("<tr", "<MARK><tr").split("<MARK>"):
            cells = row.replace("<th", "<MARK><th")
            cells = cells.replace("<td", "<MARK><td")
            self.tableObject.append(cells.split("<MARK>"))
        # Put the sub-tables back in place of their markers.
        for row in self.tableObject:
            for cIndex in range(len(row)):
                for tableLabel, subTableText in subTableList:
                    if tableLabel in row[cIndex]:
                        row[cIndex] = row[cIndex].replace(tableLabel,
                                                          subTableText)

    def __findRowIndex__(self, matchText):
        """
        Return the table row index of the first entry of
        ``self.rowNames`` whose column-1 text matches matchText
        (case-insensitive keyword match), or -1 if no row matches.
        """
        wanted = matchText.lower()
        for entry in self.rowNames:
            if self.__compareKeyWords__(wanted, entry[1].lower()):
                return entry[0]
        return -1

    def getColumnByText(self, textString='', colNum=1):
        """
        Find the first row whose column 1 matches textString and return
        the cell in column colNum of that row.  An empty cell is
        returned as a single space.  If no row matches, or colNum is
        out of range, return an empty string.
        """
        foundRow = self.__findRowIndex__(textString)
        if foundRow < 0:
            return ""
        try:
            outputData = self.tableObject[foundRow][colNum]
        except IndexError:
            return ""
        if len(outputData) == 0:
            outputData = " "
        return outputData

    def showRows(self):
        """
        Call this method after reading the html to print the row
        index, row label and cell count of every row in the table.
        """
        for row in self.rowNames:
            sys.stdout.write("Row %i, %s, %i\n" % (int(row[0]),
                                                   str(row[1]),
                                                   int(row[2])))
        sys.stdout.flush()

    def getRowList(self):
        """
        Return the list describing the rows of the table.  Each entry
        is a three-element list [row index, column-1 text, cell count];
        rows with fewer than three cells are listed as
        [row index, "\\n", -1].
        """
        return self.rowNames

    def showCols(self):
        """
        Call this method after reading the html to print the column
        index and the cell text of the first table row.
        """
        for col in self.colNames:
            sys.stdout.write("Col %i, %s\n" % (int(col[0]), str(col[1])))
        sys.stdout.flush()

    def getColumnByCoord(self, RowNum, ColNum):
        """
        Given a row number and column number return that element of
        the table.  If the coords do not exist return empty string.
        """
        try:
            return self.tableObject[RowNum][ColNum]
        except IndexError:
            return ""

    def insertTextAtCoord(self, RowNum, ColNum, Text):
        """
        Given a row number and column number insert the argument text
        over what currently exists.  If the RowNum and ColNum is out
        of bounds do nothing.
        """
        try:
            self.tableObject[RowNum][ColNum] = Text
        except IndexError:
            # Documented contract: out-of-range coordinates are ignored.
            pass

    def insertTextGivenText(self, matchText, colNum, Text):
        """
        Find the first row whose column 1 matches matchText and replace
        the cell in column colNum with Text.  Nothing is done when Text
        is empty or no row matches.

        Raises IndexError, after printing diagnostics, when a row
        matches but colNum is out of range for that row; nothing is
        inserted in that case.
        """
        if Text == "":
            return
        foundRow = self.__findRowIndex__(matchText)
        if foundRow < 0:
            return
        try:
            self.tableObject[foundRow][colNum] = Text
        except IndexError:
            print("Exception Encountered")
            print("String to match with : " + str(matchText))
            print("Destination Table Column Count : " +
                  str(len(self.tableObject[foundRow])))
            print("Row: " + str(foundRow) + " Col:" + str(colNum))
            print("Text that should be inserted : " + str(Text))
            print("****************************************")
            print("*Ignoring error not inserting anything!*")
            print("****************************************")
            raise

    def __buildMiddleOfPage__(self):
        """
        This method should not be called explicitly.  It rebuilds
        ``self.middleOfPage`` as a one-element list holding the table
        head, every cell of the table object and the table foot joined
        into a single string.
        """
        tableRows = str().join([str().join(row) for row in self.tableObject])
        self.middleOfPage = [self.tableHead + tableRows + self.tableFoot]

    def buildTableHTML(self, formattingTxt=""):
        """
        Return a single string that begins with ``<table ...>`` (with
        formattingTxt placed inside the tag), contains the rebuilt
        table section and ends with ``</table>``.  As a side effect
        ``self.middleOfPage`` is left empty.
        """
        self.__buildMiddleOfPage__()
        htmlTable = self.middleOfPage
        self.middleOfPage = list()
        return ("<table %s>" % (formattingTxt) + str(htmlTable[0]) +
                "</table>")

    def writeTableHTML(self, filename="table.html", formattingTxt=""):
        """
        Write just the html produced by buildTableHTML to a file.
        """
        outputText = self.buildTableHTML(formattingTxt)
        with open(filename, 'w') as fp:
            fp.write(outputText)

    def __stripHTMLTags__(self, stringIN):
        """
        Return stringIN with every html tag removed.

        A tag is a "<" immediately followed by a letter, "/", "!" or
        "?" and runs up to the next ">".  A "<" followed by anything
        else (as in "a < b") and any ">" outside a tag are kept as
        ordinary text, as is an unterminated tag at the end of the
        string.
        """
        pieces = list()
        pos = 0
        while True:
            start = stringIN.find("<", pos)
            if start == -1:
                break
            nextChar = stringIN[start + 1:start + 2]
            if not (nextChar.isalpha() or nextChar in ("/", "!", "?")):
                # Not a tag opener: keep the "<" and carry on after it.
                pieces.append(stringIN[pos:start + 1])
                pos = start + 1
                continue
            stop = stringIN.find(">", start)
            if stop == -1:
                break
            pieces.append(stringIN[pos:start])
            pos = stop + 1
        pieces.append(stringIN[pos:])
        return str().join(pieces)

    def __stripRowNumber__(self, stringA):
        """
        Strip the row number from a string of the form
        ``#?? Word Words More Words``.  Everything from the first "#"
        up to and including the next space is removed; if no space
        follows, everything from the "#" onwards is removed.  A string
        without "#" is returned unchanged.  Ideally this method should
        only be called by self.__compareKeyWords__()
        """
        if stringA.find("#") == -1:
            return stringA
        startTXT, middleTXT = stringA.split("#", 1)
        # partition() yields "" when nothing follows the number, where
        # split(" ", 1)[1] would raise IndexError.
        middleTXT = middleTXT.partition(" ")[2]
        return startTXT + middleTXT

    def __compareKeyWords__(self, stringA="", stringB="", exact=False):
        """
        Break stringA into keywords minus html tags and row number,
        then check that every keyword occurs (case-insensitively, as a
        substring) in some word of stringB.

        If exact is True the two strings must also have the same
        number of words, so "Big blue bird" will not match
        "Big blue pretty bird".  With the default (False) it does
        match, since all the words of the first string are contained
        in the second.

        A string that is empty or blank once tags and row number are
        removed never matches anything.
        """
        keyWordList = self.__stripHTMLTags__(
            self.__stripRowNumber__(stringA)).lower().split()
        wordList = self.__stripHTMLTags__(
            self.__stripRowNumber__(stringB)).lower().split()
        if not keyWordList or not wordList:
            return False
        for key in keyWordList:
            if not any(key in word for word in wordList):
                return False
        if exact:
            return len(keyWordList) == len(wordList)
        return True

    def writeHTML(self, filename):
        """
        Write the page, with the manipulated table rebuilt into it, to
        the file filename.
        """
        self.__buildMiddleOfPage__()
        outputData = list()
        outputData.extend(self.topOfPage)
        outputData.extend(self.middleOfPage)
        outputData.extend(self.endOfPage)
        with open(filename, 'w') as fp:
            fp.writelines(outputData)

#End CLASS scrapePage