Python HTML Parser

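html.parser is a Python standard-library module for parsing HTML. Subclass its HTMLParser class and override the handler methods to react to tags and text as they are encountered.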


Usage

from html.parser import HTMLParser

def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Attributes are passed to the parser as a sequence of (name, value) pairs.
    The ``style`` attribute is encoded as a single string of
    ``property: value;`` declarations and is expanded into a nested
    dictionary.

    Args:
        attrs: Iterable of (name, value) pairs. A value may be ``None`` when
            the attribute was written without a value.

    Returns:
        A dict mapping each attribute name to its value. The ``"style"``
        key, when present, maps to a dict of property name to property
        value.
    """
    clean = {}
    for key, value in attrs:
        if key != "style":
            clean[key] = value
            continue

        # Create the nested dict before writing into it; a repeated style
        # attribute merges into the same dict.
        styles = clean.setdefault(key, {})
        # A valueless style attribute arrives as None; treat it as empty.
        for declaration in (value or "").split(";"):
            # Split on the first colon only, so values that themselves
            # contain colons (e.g. url(http://...)) stay intact.
            style_key, separator, style_value = declaration.partition(":")
            if not separator:
                # Blank or malformed declaration with no "property: value".
                continue
            styles[style_key.strip()] = style_value.strip()
    return clean
            

class MyHTMLParser(HTMLParser):
    """Print the text found inside ``<div>`` elements.

    Attributes:
        in_div: True while at least one ``<div>`` is currently open.
    """

    def __init__(self):
        super().__init__()
        self.in_div = False
        # Count of currently open <div> tags. A single boolean would be
        # cleared by the first </div>, losing text that follows a nested
        # div but is still inside the outer one.
        self._div_depth = 0

    def handle_starttag(self, tag, _attrs):
        """Record entry into a ``<div>``; ``_attrs`` is a list of pairs."""
        # The cleaned attributes are available here for tag-specific logic.
        attrs = clean_attrs(_attrs)
        if tag == "div":
            self._div_depth += 1
            self.in_div = True

    def handle_endtag(self, tag):
        """Record exit from a ``<div>``, tolerating stray closing tags."""
        if tag == "div":
            # Never go below zero on an unmatched </div>.
            self._div_depth = max(self._div_depth - 1, 0)
            self.in_div = self._div_depth > 0

    def handle_data(self, data):
        """Print text content when it appears inside a ``<div>``."""
        if self.in_div:
            print(data)

def parse(string):
    """Run ``string`` through a fresh ``MyHTMLParser``.

    The parser is always closed afterwards, even if feeding raises.
    """
    html_parser = MyHTMLParser()
    try:
        html_parser.feed(string)
    finally:
        # Closing is done unconditionally so the parser is finalized
        # whether or not feed() succeeded.
        html_parser.close()


See also

Python html.parser module documentation


CategoryRicottone

Python/Html/Parser (last edited 2023-10-13 20:41:17 by DominicRicottone)