Differences between revisions 4 and 5
Revision 4 as of 2023-03-01 15:28:37
Size: 1577
Comment:
Revision 5 as of 2023-03-01 15:30:45
Size: 1576
Comment:
Deletions are marked like this. Additions are marked like this.
Line 66: Line 66:
== See also === == See also ==

Python HTMLParser

html.parser is a module for parsing HTML. The parser is html.parser.HTMLParser.


Usage

from html.parser import HTMLParser

def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Attributes are handed to ``handle_starttag`` as a sequence of
    ``(name, value)`` pairs. Every attribute is copied into the result
    unchanged, except ``style``: its string value is split into
    individual declarations and stored as a nested ``{property: value}``
    dictionary.

    Args:
        attrs: Iterable of ``(name, value)`` pairs. ``value`` may be
            ``None`` for an attribute written without a value.

    Returns:
        A dict mapping attribute names to their values. The ``style``
        entry, when it has a string value, maps to a dict of the parsed
        declarations (empty if there are none). If a name repeats, the
        last occurrence wins.
    """
    clean = {}
    for key, value in attrs:
        if key == "style" and value is not None:
            styles = {}
            # NOTE: a plain split on ";" does not understand quoted
            # strings or url(...) bodies that themselves contain ";".
            for declaration in value.split(";"):
                # Partition on the first ":" only, so values that contain
                # colons (e.g. "url(http://...)") stay intact.
                name, separator, style_value = declaration.partition(":")
                name = name.strip()
                if not separator or not name:
                    # Empty or malformed declaration; skip it rather than
                    # fail on the whole attribute list.
                    continue
                styles[name] = style_value.strip()
            clean[key] = styles
        else:
            clean[key] = value
    return clean
            

class MyHTMLParser(HTMLParser):
    """Example parser that prints the text found inside ``<div>`` elements.

    ``in_div`` is True while at least one ``<div>`` is open. Nesting is
    tracked with a depth counter so that closing an inner ``<div>`` does
    not stop the text of the enclosing ``<div>`` from being printed.
    """

    def __init__(self):
        super().__init__()
        self.in_div = False
        # Number of currently open <div> elements; in_div mirrors "> 0".
        self._div_depth = 0

    def handle_starttag(self, tag, _attrs):
        """Record entry into a ``<div>``; attributes are cleaned first."""
        # The cleaned attributes are not used further in this example;
        # they are available here for tag-specific handling.
        attrs = clean_attrs(_attrs)
        if tag == "div":
            self._div_depth += 1
            self.in_div = True

    def handle_endtag(self, tag):
        """Record exit from a ``<div>``; stray closing tags are tolerated."""
        if tag == "div":
            # Clamp at zero so an unmatched </div> cannot drive the
            # depth negative and desynchronize later elements.
            self._div_depth = max(self._div_depth - 1, 0)
            self.in_div = self._div_depth > 0

    def handle_data(self, data):
        """Print text content while inside at least one ``<div>``."""
        if self.in_div:
            print(data)

def parse(string):
    """Run a fresh ``MyHTMLParser`` over *string*.

    The parser is always closed afterwards, even if feeding the input
    raises. Nothing is returned.
    """
    html_parser = MyHTMLParser()
    try:
        html_parser.feed(string)
    finally:
        # close() is called whether or not feed() succeeded.
        html_parser.close()


See also

Python html.parser module documentation


CategoryRicottone

Python/Html/Parser (last edited 2023-10-13 20:41:17 by DominicRicottone)