Differences between revisions 4 and 5

Python HTMLParser

html.parser is a standard-library module for parsing HTML. The parser class is html.parser.HTMLParser; it is used by subclassing it and overriding its handler methods.

Contents

Python HTMLParser
1. Usage
2. See also

Usage

from html.parser import HTMLParser

def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Attributes are passed to the parser as a sequence of ``(name, value)``
    pairs. Every attribute is copied into the result as-is, except
    ``style``: its value is a string of ``property: value`` declarations
    separated by ``;`` and is expanded into a nested dictionary stored
    under the ``"style"`` key.

    A valueless attribute (value ``None``) is kept as ``None``; a valueless
    ``style`` attribute yields an empty dictionary. Style declarations that
    have no ``:`` or an empty property name are skipped.

    Args:
        attrs: Iterable of ``(name, value)`` pairs.

    Returns:
        A dict mapping attribute names to values, with ``"style"`` (when
        present) mapping to a dict of property name to property value.
    """
    clean = {}
    for key, value in attrs:
        if key != "style":
            clean[key] = value
            continue

        # Create the nested dict before writing into it.
        styles = clean.setdefault("style", {})
        # ``value`` is None for a bare attribute such as ``<div style>``.
        for declaration in (value or "").split(";"):
            # partition() splits on the first colon only, so values that
            # themselves contain colons (e.g. URLs) stay intact.
            name, separator, setting = declaration.partition(":")
            name = name.strip()
            if not separator or not name:
                continue
            styles[name] = setting.strip()
    return clean
            

class MyHTMLParser(HTMLParser):
    """Example parser that prints the text found inside ``<div>`` elements.

    Attributes:
        in_div: True while the parser is positioned inside at least one
            open ``<div>`` element.
    """

    def __init__(self):
        super().__init__()
        # Count of currently open <div> elements. A plain boolean would be
        # cleared by the first </div>, losing text that follows a nested
        # div but is still inside the outer one.
        self._div_depth = 0
        self.in_div = False

    def handle_starttag(self, tag, _attrs):
        """Record entry into a ``<div>``; ``_attrs`` is a list of pairs."""
        # The cleaned attributes are not used further in this example; the
        # call shows how raw attribute pairs are converted to a dictionary.
        attrs = clean_attrs(_attrs)
        if tag == "div":
            self._div_depth += 1
            self.in_div = True

    def handle_endtag(self, tag):
        """Record exit from a ``<div>``."""
        if tag == "div":
            # Clamp at zero so a stray </div> cannot push the depth negative.
            self._div_depth = max(self._div_depth - 1, 0)
            self.in_div = self._div_depth > 0

    def handle_data(self, data):
        """Print text data that occurs inside a ``<div>``."""
        if self.in_div:
            print(data)

def parse(string):
    """Feed ``string`` to a fresh ``MyHTMLParser`` and then close it.

    The parser is closed even if feeding raises.
    """
    html_parser = MyHTMLParser()
    try:
        html_parser.feed(string)
    finally:
        # Runs whether or not feed() raised.
        html_parser.close()

-  ⇤ ← Revision 4 as of 2023-03-01 15:28:37 → 
  Size: 1577
  Editor: DominicRicottone
  Comment:
+   ← Revision 5 as of 2023-03-01 15:30:45 → ⇥
  Size: 1576
  Editor: DominicRicottone
  Comment:
-Deletions are marked like this.
+Additions are marked like this.
 Line 66:
-== See also ===
+== See also ==

Diff for "Python/Html/Parser"

Python HTMLParser

Usage

See also