Python HTML Parser
html.parser is a module for parsing HTML.
Contents
Usage
from html.parser import HTMLParser


def _parse_style(value):
    """Split an inline ``style`` string into a ``{property: value}`` dict.

    Declarations are separated by ``;`` and each one is split on its FIRST
    ``:`` only, so values that themselves contain a colon (for example
    ``url(http://...)``) stay intact. Empty declarations and declarations
    without a property name or without a colon are skipped.

    NOTE: this is a simple splitter, not a CSS parser; a ``;`` inside a
    quoted string or ``url(...)`` will still be treated as a separator.
    """
    styles = {}
    # A valueless attribute (``<div style>``) has value None: treat as empty.
    for declaration in (value or "").split(";"):
        name, separator, style_value = declaration.partition(":")
        name = name.strip()
        if not separator or not name:
            continue
        styles[name] = style_value.strip()
    return styles


def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Args:
        attrs: iterable of ``(name, value)`` pairs, as handed to
            ``HTMLParser.handle_starttag``. ``value`` may be ``None`` for
            attributes written without a value.

    Returns:
        A dict mapping each attribute name to its value. The ``style``
        attribute is decoded into a nested ``{property: value}`` dict
        (always a dict, possibly empty). If a name occurs more than once,
        the last occurrence wins.
    """
    clean = {}
    for key, value in attrs:
        if key == "style":
            clean[key] = _parse_style(value)
        else:
            clean[key] = value
    return clean


class MyHTMLParser(HTMLParser):
    """Example parser that prints every text node found inside a ``<div>``.

    Attributes:
        in_div: True while the parser is inside at least one open ``<div>``.
    """

    def __init__(self):
        super().__init__()
        self.in_div = False
        # Number of currently open <div> elements. A bare boolean would be
        # cleared by the first </div> even when an outer <div> is still open.
        self._div_depth = 0

    def handle_starttag(self, tag, _attrs):
        # Normalised attributes; unused by this minimal example but shown
        # here as the place where clean_attrs() is meant to be applied.
        attrs = clean_attrs(_attrs)  # noqa: F841
        if tag == "div":
            self._div_depth += 1
            self.in_div = True

    def handle_endtag(self, tag):
        if tag == "div":
            # Clamp at zero so a stray </div> cannot drive the depth negative.
            self._div_depth = max(0, self._div_depth - 1)
            self.in_div = self._div_depth > 0

    def handle_data(self, data):
        if self.in_div:
            print(data)


def parse(string):
    """Feed *string* to a fresh ``MyHTMLParser``, printing text inside divs.

    The parser is always closed, even if feeding raises, so that any
    buffered input is processed and released.
    """
    parser = MyHTMLParser()
    try:
        parser.feed(string)
    finally:
        parser.close()
See also
Python html.parser module documentation