Rensselaer Center for Open Source Software

Added preprocessing to html_extract.py

2 files changed, 25 lines added, 0 lines removed

Changes

--- server/html_extract.py c813fb6a9d7ddd6376bf6581771b4ff8698ac579
+++ server/html_extract.py 28c1df983f8f37032e263d357ccfd2d2bc8cbcbe
@@ -5,6 +5,12 @@
+
# Drop everything outside the <body> element so htmllib only has to parse the page body.
def preprocess_page(html):
  """Return the <body>...</body> portion of an HTML document.

  The opening tag is matched case-insensitively and may carry attributes
  (e.g. ``<body class="home">``); the closing tag is the first ``</body>``
  that follows it. Both tags are kept in the returned string.

  If the opening tag is missing the result starts at the beginning of the
  document, and if the closing tag is missing it runs to the end, so a
  malformed page degrades to "return what we have" instead of an arbitrary
  slice built from str.find()'s -1 sentinel.

  Args:
    html: The page markup as a string.

  Returns:
    The substring spanning the body element, or a best-effort superset of
    it when one or both body tags cannot be found.
  """
  # Imported locally so the function does not rely on the module's import block.
  import re

  opening = re.search(r"<body\b[^>]*>", html, re.IGNORECASE)
  if opening:
    start, search_from = opening.start(), opening.end()
  else:
    start = search_from = 0

  # Only look for the closing tag after the opening one, so a stray
  # "</body>" earlier in the document (e.g. inside a script) is ignored.
  closing = re.compile(r"</body\s*>", re.IGNORECASE).search(html, search_from)
  end = closing.end() if closing else len(html)
  return html[start:end]
@@ -88,6 +94,8 @@
+      #print l.text,l.density
+      
@@ -126,4 +134,21 @@
+  html = preprocess_page(html)
+
if __name__ == "__main__":
  # Manual smoke test: fetch one page, run it through the extraction
  # pipeline, and dump the result.
  response = urllib2.urlopen("http://www.google.com/racing")
  try:
    html = response.read()
  finally:
    # Release the connection even if the read fails.
    response.close()
  html = preprocess_page(html)
  # Derive from formatter.AbstractWriter to store paragraphs.
  writer = LineWriter()
  # Default formatter sends commands to our writer.
  # (Named fmt so the builtin format() is not shadowed.)
  fmt = formatter.AbstractFormatter(writer)
  # Derive from htmllib.HTMLParser to track parsed bytes.
  parser = TrackingParser(writer, fmt)
  # Give the parser the raw HTML data.
  parser.feed(html)
  parser.close()
  # Filter the paragraphs stored and output them.
  # Parenthesized form prints the same text under Python 2 and is also
  # valid Python 3 syntax.
  print(repr(writer.output()))
Michael
DaBuzz • 59 weeks ago