1 """
2 followup web page scraping utilities
3 """
4
5 __author__ = 'Cristina Valeria Torres <cristina.torres@ligo.org>'
6
7
8
9 import os
10 import string
11 import sys
12 import time
13
15 """
This class is responsible for taking in our expected html
formatted file and allowing us to manipulate the central table of
interest while keeping the rest of the html available for later
writing to a disk.
20 """
22 self.filename=''
23 self.fileRead=False
24 self.originalText=list()
25 self.startKey="<h3>Follow-up tests</h3>\n"
26 self.endKey="<h3>Parameter estimation</h3>\n"
27 self.saveLines=list()
28 self.tableHead=str()
29 self.tableFoot=str()
30 self.topOfPage=list()
31 self.middleOfPage=list()
32 self.endOfPage=list()
33 self.tableObject=list()
34 self.rowNames=list()
35 self.colNames=list()
36 self.removeKey=["<table","<tbody"]
37 ignoreKeysMatch=list()
38 for match in self.removeKey:
39 ignoreKeysMatch.append("/"+match.strip("<"))
40 self.removeKey.extend(ignoreKeysMatch)
41 self.tmpTableText=str()
42
43
def setContextKeys(self, newStartKey="", newEndKey=""):
    """
    Replace the context keys that bracket the table of interest.

    Each key has to reproduce one complete line of the source html
    file, exactly as it appears there.  The text between the two
    keys is what gets turned into the table[row][col] structure,
    while the surrounding html is kept for writing back out.  Keys
    that are only partial line matches, or that contain
    non-printing characters, will almost certainly make the html
    parser fail.

    An argument left as the empty string (the default) leaves the
    corresponding stored key unchanged.
    """
    for attributeName, replacement in (("startKey", newStartKey),
                                       ("endKey", newEndKey)):
        # Only an explicit, non-empty key overrides the stored one.
        if replacement != "":
            setattr(self, attributeName, replacement)
61
62
def readfile(self, filename):
    """
    Read an html file and split it around the table of interest.

    The file text is stored line by line in self.originalText and
    then cut into three sections:

      * everything up to and including self.startKey is appended to
        self.topOfPage,
      * the text between the two keys is appended to
        self.middleOfPage,
      * self.endKey and everything after it is appended to
        self.endOfPage.

    The middle section is parsed by self.__createTableObject__().
    Afterwards self.rowNames receives one
    [rowNumber, label, cellCount] entry per table row (label "\\n"
    and cellCount -1 for rows with two or fewer fragments) and
    self.colNames receives one [colNumber, text] entry per fragment
    of table row 1.

    When a key cannot be found a warning is printed.  Without the
    start key the top section is empty; without the end key the end
    section is empty, so no text is stored twice.
    """
    fp = open(filename)
    try:
        self.originalText = fp.readlines()
    finally:
        # Release the handle even if reading fails part way through.
        fp.close()
    allHTMLPage = str().join(self.originalText)

    # str.split raises ValueError on the unpack when the key is absent
    # (and for an empty key); both mean the key was not found.
    try:
        pageHead, pageRemainder = allHTMLPage.split(self.startKey, 1)
        pageHead = pageHead + self.startKey
        foundStartKey = True
    except ValueError:
        pageHead = ""
        pageRemainder = allHTMLPage
        foundStartKey = False
    try:
        pageMiddle, pageRemainder = pageRemainder.split(self.endKey, 1)
        pageRemainder = self.endKey + pageRemainder
        foundEndKey = True
    except ValueError:
        # Everything left belongs to the middle; nothing follows it.
        pageMiddle = pageRemainder
        pageRemainder = ""
        foundEndKey = False

    self.topOfPage.append(pageHead)
    self.middleOfPage.append(pageMiddle)
    self.endOfPage.append(pageRemainder)

    if not foundStartKey:
        print("Problem finding start key.")
    if not foundEndKey:
        print("Problem finding end key.")

    self.__createTableObject__()

    for rowNumber, row in enumerate(self.tableObject):
        if len(row) > 2:
            # row[0] is the opening <tr> fragment, row[1] the first cell.
            self.rowNames.append([rowNumber, row[1], len(row)])
        else:
            self.rowNames.append([rowNumber, "\n", -1])

    # Row 0 is the text ahead of the first <tr>, so row 1 is the
    # first real table row; it may be missing for an empty table.
    if len(self.tableObject) > 1:
        for colNumber, colText in enumerate(self.tableObject[1]):
            self.colNames.append([colNumber, colText])
127
128
def __createTableObject__(self, inputHTML=None):
    """
    Parse a chunk of html into the self.tableObject row/cell lists.

    inputHTML is a single string of html; when it is None the joined
    contents of self.middleOfPage are used instead.  Newlines are
    removed before parsing.

    Results are stored on the instance:

      * self.tableHead  - the text ahead of the first "<tr",
      * self.tableFoot  - the text after the last "</tr>",
      * self.tmpTableText - the row text before parsing, kept for
        diagnosing malformed html,
      * self.tableObject - one list per row is appended.  The first
        appended entry holds the (empty) text ahead of the first
        "<tr"; in every other row index 0 is the opening "<tr"
        fragment and the following entries are the "<th"/"<td" cells.

    Tables nested inside a cell are temporarily swapped for
    placeholder labels so that their own rows and cells are not
    split, and are restored verbatim afterwards.

    Raises SyntaxError when a nested "<table" has no closing tag.
    """
    if inputHTML is None:
        inputHTML = str().join(self.middleOfPage)

    tableText = inputHTML.replace("\n", "")

    # Split off the text ahead of the first row (same result as
    # str.partition, but available on every interpreter version).
    rowStart = tableText.find("<tr")
    if rowStart == -1:
        tableHeadString = tableText
        tableText = ""
    else:
        tableHeadString = tableText[:rowStart]
        tableText = tableText[rowStart:]

    # Split off the text following the last row.
    rowStop = tableText.rfind("</tr>")
    if rowStop == -1:
        tableFootString = tableText
        tableText = ""
    else:
        rowStop = rowStop + len("</tr>")
        tableFootString = tableText[rowStop:]
        tableText = tableText[:rowStop]

    self.tableHead = tableHeadString
    self.tableFoot = tableFootString

    subTableList = []
    self.tmpTableText = tableText
    while "<table" in tableText:
        tableLabel = "<MARKIT%i>" % (len(subTableList))
        tableStart = tableText.find("<table")
        tableStop = tableText.find("</table")
        closeStop = -1
        if tableStop > tableStart:
            # Take the closing tag exactly as written (it may be
            # "</table >"), otherwise the text is never consumed and
            # this loop cannot terminate.
            closeStop = tableText.find(">", tableStop)
        if closeStop == -1:
            print("Found unmatched table tag!?!?!?!")
            print("Not creating table object!")
            print("Read self.tmpTableText to diagnose html.")
            raise SyntaxError("unmatched <table> tag in html")
        thisSubTable = tableText[tableStart:closeStop + 1]
        tableText = (tableText[:tableStart] + tableLabel +
                     tableText[closeStop + 1:])
        subTableList.append([tableLabel, thisSubTable])

    # Put every closing tag on its own line for readable output.
    for closingTag in ("</td>", "</th>", "</table>", "<tr>", "</tr>"):
        tableText = tableText.replace(closingTag, closingTag + "\n")

    for row in tableText.replace("<tr", "<MARK><tr").split("<MARK>"):
        cells = row.replace("<th", "<MARK><th").replace("<td", "<MARK><td")
        self.tableObject.append(cells.split("<MARK>"))

    # Swap the placeholders back for the nested tables they stand for.
    for row in self.tableObject:
        for cIndex in range(len(row)):
            for tableLabel, subTableText in subTableList:
                if tableLabel in row[cIndex]:
                    row[cIndex] = row[cIndex].replace(tableLabel,
                                                      subTableText)
191
192
193
def getColumnByText(self, textString='', colNum=1):
    """
    Look up a table cell by the label of its row.

    The rows listed in self.rowNames are searched in order and the
    first one whose label matches textString (compared in lower
    case through self.__compareKeyWords__) is selected.  The cell at
    index colNum of that row in self.tableObject is returned; an
    empty cell is returned as a single space.  The empty string is
    returned when no row matches or when colNum is out of range.
    """
    matchedRow = -1
    for rowEntry in self.rowNames:
        if self.__compareKeyWords__(textString.lower(), rowEntry[1].lower()):
            matchedRow = rowEntry[0]
            break
    if matchedRow > -1:
        try:
            cellText = self.tableObject[matchedRow][colNum]
            if len(cellText) == 0:
                # Hand back a blank rather than nothing for empty cells.
                cellText = " "
            return cellText
        except IndexError:
            return ""
    return ""
218
219
def showRows(self):
    """
    Print the row index of the table to standard output.

    Intended to be called after the html has been read.  One line
    of the form "Row <number>, <label>, <count>" is written (and
    flushed) for every entry of self.rowNames.
    """
    rowTemplate = "Row %i, %s, %i\n"
    for entry in self.rowNames:
        line = rowTemplate % (int(entry[0]), str(entry[1]), int(entry[2]))
        sys.stdout.write(line)
        sys.stdout.flush()
230
231
def getRowList(self):
    """
    Return the row index of the table for this htmlPage() instance.

    The value handed back is self.rowNames itself, not a copy: a
    list of small lists, one per table row, such as
    [[0, labelA, countA], [1, labelB, countB], ...].
    """
    rowIndex = self.rowNames
    return rowIndex
239
def showCols(self):
    """
    Print the column index of the table to standard output.

    Intended to be called after the html has been read.  One line
    of the form "Col <number>, <text>" is written (and flushed) for
    every entry of self.colNames.
    """
    colTemplate = "Col %i, %s\n"
    for entry in self.colNames:
        line = colTemplate % (int(entry[0]), str(entry[1]))
        sys.stdout.write(line)
        sys.stdout.flush()
249
def getColumnByCoord(self, RowNum, ColNum):
    """
    Return the table element stored at the given row and column.

    RowNum and ColNum index directly into self.tableObject, so the
    usual list indexing rules apply (negative values count from the
    end).  If the coordinates do not exist the empty string is
    returned instead of raising IndexError.
    """
    try:
        return self.tableObject[RowNum][ColNum]
    except IndexError:
        # Documented contract: unknown coordinates yield "".
        return ""
256
257
def insertTextAtCoord(self, RowNum, ColNum, Text):
    """
    Overwrite the table element at the given row and column.

    RowNum and ColNum index directly into self.tableObject and Text
    replaces whatever is currently stored there.  If the
    coordinates are out of bounds the table is left untouched.
    """
    try:
        self.tableObject[RowNum][ColNum] = Text
    except IndexError:
        # Documented contract: out-of-bounds coordinates are ignored.
        pass
265
266
def insertTextGivenText(self, matchText, colNum, Text):
    """
    Insert Text into the row whose label matches matchText.

    The rows listed in self.rowNames are searched in order and the
    first one whose label matches matchText (compared in lower case
    through self.__compareKeyWords__) is selected.  Text then
    replaces the element at index colNum of that row in
    self.tableObject.

    Nothing is done when Text is the empty string or when no row
    matches.  When colNum is out of bounds a diagnostic is printed
    and the insertion is skipped; no exception is propagated.
    """
    if Text == "":
        return
    foundRow = -1
    for rowEntry in self.rowNames:
        if self.__compareKeyWords__(matchText.lower(), rowEntry[1].lower()):
            foundRow = rowEntry[0]
            break
    if foundRow < 0:
        return
    try:
        self.tableObject[foundRow][colNum] = Text
    except IndexError:
        # Report the failed insertion, then carry on as announced in
        # the message below instead of re-raising.
        if 0 <= foundRow < len(self.tableObject):
            columnCount = len(self.tableObject[foundRow])
        else:
            columnCount = 0
        print("Exception Encountered")
        print("String to match with : " + str(matchText))
        print("Destination Table Column Count : " + str(columnCount))
        print("Row: " + str(foundRow) + " Col:" + str(colNum))
        print("Text that should be inserted : " + str(Text))
        print("****************************************")
        print("*Ignoring error not inserting anything!*")
        print("****************************************")
296
297
299 """
This method should not be called explicitly. It will rebuild
the table object variable into a chunk of html for writing to
the disk.
303 """
304 tmpMiddle=list()
305 tmpMiddle.append(self.tableHead)
306 tmpMiddle.append(str().join([str().join(x) for x in self.tableObject]))
307 tmpMiddle.append(self.tableFoot)
308 self.middleOfPage=list([str().join(tmpMiddle)])
309
310
def buildTableHTML(self, formattingTxt=""):
    """
    Build one html string for the table, wrapped in table tags.

    The table is first rebuilt through
    self.__buildMiddleOfPage__(); the rebuilt text is then taken
    from self.middleOfPage, which is reset to an empty list as a
    side effect.  The returned string starts with
    "<table formattingTxt>" and ends with "</table>".
    """
    self.__buildMiddleOfPage__()
    rebuiltSections = self.middleOfPage
    self.middleOfPage = []
    pieces = [
        "<table %s>" % (formattingTxt),
        str(rebuiltSections[0]),
        "</table>",
    ]
    return "".join(pieces)
324
325
326
def writeTableHTML(self, filename="table.html", formattingTxt=""):
    """
    Write just the html of the table to a file.

    The html is produced by self.buildTableHTML(formattingTxt) and
    written to filename, replacing any existing content.  The text
    is built before the file is opened, so a failure while building
    does not leave a truncated file behind, and the file is closed
    even if writing fails.
    """
    outputText = self.buildTableHTML(formattingTxt)
    fp = open(filename, 'w')
    try:
        fp.write(outputText)
    finally:
        fp.close()
336
337
339 """
340 Take input string and remove all tags inside of < >
341 delimiters.
342 """
343 leftD="<"
344 rightD=">"
345 input=stringIN
346 result=''
347 maxloop=0
348 ignoreKeys=["td","tr","em","br","h1","h2","h3","hr"]
349 ignoreKeysMatch=list()
350 for match in ignoreKeys:
351 ignoreKeysMatch.append("/"+match.strip("<"))
352 ignoreKeys.extend(ignoreKeysMatch)
353 foundKeys=0
354 output=list()
355 while ((input.__contains__("<") and input.__contains__(">")) and (maxloop < 100)):
356 maxloop=maxloop+1
357 tag=input.__getslice__(input.find("<"),input.find(">")+1)
358 foundKeys=0
359 for key in ignoreKeys:
360 if tag.lower().__contains__(key):
361 foundKeys=foundKeys+1
362 if (not(tag.__contains__(" ")) and foundKeys == 0):
363 output.append(input.split(tag,1)[0])
364 input=input.split(tag,1)[1]
365 if (not(tag.__contains__(" ")) and foundKeys > 0):
366 output.append(input.split(tag,1)[0])
367 input=input.split(tag,1)[1]
368 output.append(input)
369 result=str().join(output)
370 return result
371
372
def __stripRowNumber__(self, stringA):
    """
    Strip the leading row number token from a table row label.

    The input is assumed to look like "#?? Word Words More Words",
    where "#??" is the only number.  Everything from the first "#"
    up to and including the first space after it is removed, and
    the text on either side is joined back together.

    The string is returned unchanged when it contains no "#" or
    when no space follows the "#" token (for example a bare "#"
    column heading), since there is then no number to separate from
    the words.  Ideally this method should only be called by
    self.__compareKeyWords__().
    """
    delimiter = "#"
    if stringA.find(delimiter) == -1:
        return stringA
    startTXT, middleTXT = stringA.split(delimiter, 1)
    numberAndWords = middleTXT.split(" ", 1)
    if len(numberAndWords) < 2:
        # No space after the number token: nothing to strip safely.
        return stringA
    return startTXT + numberAndWords[1]
387
def __compareKeyWords__(self, stringA="", stringB="", exact=False):
    """
    Check that the keywords of stringA all occur in stringB.

    Both strings first have their row number and html tags removed
    (self.__stripRowNumber__ then self.__stripHTMLTags__) and are
    lower-cased and split on whitespace.  A keyword from stringA
    counts as found when it is contained in at least one word of
    stringB.

    With exact left False, True is returned when every keyword is
    found, so "Big blue bird" matches "Big blue pretty bird".

    With exact set True the two strings must additionally have the
    same number of words and every word of stringB must contain a
    keyword, so "Big blue bird" does not match
    "Big blue pretty bird".

    False is returned when either string has no words left after
    cleaning.
    """
    cleanA = self.__stripHTMLTags__(self.__stripRowNumber__(stringA))
    cleanB = self.__stripHTMLTags__(self.__stripRowNumber__(stringB))
    keyWordList = cleanA.lower().split()
    targetWords = cleanB.lower().split()

    # Blank or empty input can never be a meaningful match.
    if len(keyWordList) == 0 or len(targetWords) == 0:
        return False

    # Every keyword has to turn up in some word of stringB; counting
    # the words of stringB instead would let a repeated word stand
    # in for a keyword that is missing.
    for key in keyWordList:
        keyFound = False
        for word in targetWords:
            if key in word:
                keyFound = True
                break
        if not keyFound:
            return False

    if not exact:
        return True

    if len(keyWordList) != len(targetWords):
        return False
    for word in targetWords:
        wordCovered = False
        for key in keyWordList:
            if key in word:
                wordCovered = True
                break
        if not wordCovered:
            return False
    return True
428
429
def writeHTML(self, filename):
    """
    Write the manipulated html page to the file filename.

    The table is rebuilt through self.__buildMiddleOfPage__() and
    the page is written as self.topOfPage, followed by
    self.middleOfPage, followed by self.endOfPage.  The page is
    assembled before the file is opened, so a failure while
    rebuilding does not leave a truncated file behind, and the file
    is closed even if writing fails.
    """
    self.__buildMiddleOfPage__()
    outputData = []
    outputData.extend(self.topOfPage)
    outputData.extend(self.middleOfPage)
    outputData.extend(self.endOfPage)
    fp = open(filename, 'w')
    try:
        fp.writelines(outputData)
    finally:
        fp.close()
442
443
444
445