Added preprocessing to html_extract.py
2 files changed, 25 lines added, 0 lines removed
Changes
--- server/html_extract.py c813fb6a9d7ddd6376bf6581771b4ff8698ac579
+++ server/html_extract.py 28c1df983f8f37032e263d357ccfd2d2bc8cbcbe
@@ -5,6 +5,12 @@
+
+#Remove the garbage so htmllib can read it
+def preprocess_page(html):
+ start = html.find("<body>")
+ end = html.find("</body>")
+ return html[start:end+7]
@@ -88,6 +94,8 @@
+ #print l.text,l.density
+
@@ -126,4 +134,21 @@
+ html = preprocess_page(html)
+
+if __name__ == "__main__":
+ response = urllib2.urlopen("http://www.google.com/racing")
+ html = response.read()
+ html = preprocess_page(html)
+ # Derive from formatter.AbstractWriter to store paragraphs.
+ writer = LineWriter()
+ # Default formatter sends commands to our writer.
+ format = formatter.AbstractFormatter(writer)
+ # Derive from htmllib.HTMLParser to track parsed bytes.
+ parser = TrackingParser(writer, format)
+ # Give the parser the raw HTML data.
+ parser.feed(html)
+ parser.close()
+ # Filter the paragraphs stored and output them.
+ print repr(writer.output())