Differences between revisions 1 and 8 (spanning 7 versions)
Revision 1 as of 2022-05-02 15:37:26
Size: 1350
Comment:
Revision 8 as of 2023-10-13 20:41:17
Size: 1531
Comment:
Deletions are marked like this. Additions are marked like this.
Line 2: Line 2:

'''`html.parser`''' is a module for parsing HTML.
Line 60: Line 62:
----



== See also ==

[[https://docs.python.org/3/library/html.parser.html|Python html.parser module documentation]]

Python Html Parser

html.parser is a module for parsing HTML.


Usage

from html.parser import HTMLParser

def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Attributes are passed to the parser as a sequence of ``(name, value)``
    pairs. Every attribute is copied into the result unchanged, except
    ``style``: its value is a string of ``;``-separated ``property: value``
    declarations, which is expanded into a nested dictionary.

    A ``style`` attribute that is present always maps to a dictionary, even
    when it is empty or has no value (``None``). Declarations without a
    ``:`` or without a property name are skipped.

    Note: declarations are split naively on ``;``, so a semicolon inside a
    quoted string or ``url(...)`` is not supported.
    """
    clean = {}
    for key, value in attrs:
        if key != "style":
            clean[key] = value
            continue
        # Create the nested dict before filling it; setdefault also merges
        # declarations if the tag carries more than one style attribute.
        styles = clean.setdefault(key, {})
        # A valueless attribute (e.g. <div style>) is reported as None.
        for declaration in (value or "").split(";"):
            # Split on the first colon only, so values that contain colons
            # themselves (e.g. "url(http://...)") stay intact.
            name, separator, setting = declaration.partition(":")
            name = name.strip()
            if not separator or not name:
                continue
            styles[name] = setting.strip()
    return clean
            

class MyHTMLParser(HTMLParser):
    """Example parser that prints the text found inside ``<div>`` elements.

    ``in_div`` is True while the parser is inside at least one open
    ``<div>``. Nesting depth is tracked so that closing an inner ``<div>``
    does not end the enclosing one.
    """

    def __init__(self):
        super().__init__()
        # Number of currently open <div> elements.
        self._div_depth = 0
        self.in_div = False

    def handle_starttag(self, tag, _attrs):
        """Record entry into a ``<div>``; ``_attrs`` is a list of pairs."""
        # Shown for demonstration: attributes as a dict, ready for use here.
        attrs = clean_attrs(_attrs)
        if tag == "div":
            self._div_depth += 1
            self.in_div = True

    def handle_endtag(self, tag):
        """Record leaving a ``<div>``."""
        if tag == "div":
            # Clamp at zero so a stray </div> cannot drive the depth negative.
            self._div_depth = max(self._div_depth - 1, 0)
            self.in_div = self._div_depth > 0

    def handle_data(self, data):
        """Print text data that appears inside a ``<div>``."""
        if self.in_div:
            print(data)

def parse(string):
    """Run ``string`` through a fresh ``MyHTMLParser`` and then close it."""
    html_parser = MyHTMLParser()
    try:
        html_parser.feed(string)
    finally:
        # Always close the parser, even when feed() raises.
        html_parser.close()


See also

Python html.parser module documentation


CategoryRicottone

Python/Html/Parser (last edited 2023-10-13 20:41:17 by DominicRicottone)