Size: 1350
Comment:
|
Size: 1572
Comment:
|
Deletions are marked like this. | Additions are marked like this. |
Line 1: | Line 1: |
= Python Html Parser = | ## page was renamed from Python/HtmlParser = Python HTMLParser = |
Line 3: | Line 4: |
'''`html.parser`''' is a module for parsing HTML. | |
Line 60: | Line 62: |
---- == See also == [[https://docs.python.org/3/library/html.parser.html|Python html.parser module documentation]] |
Python HTMLParser
html.parser is a module for parsing HTML.
Contents
Usage
from html.parser import HTMLParser


def clean_attrs(attrs):
    """Clean raw attributes into a dictionary.

    Attributes are passed to the parser as a sequence of ``(name, value)``
    pairs.  The ``style`` attribute, which is encoded as a single string of
    ``property: value`` declarations separated by ``;``, is expanded into a
    nested dictionary; every other attribute is stored as-is.

    Args:
        attrs: Iterable of ``(name, value)`` pairs as handed to
            ``HTMLParser.handle_starttag``.  ``value`` may be ``None`` for
            attributes written without a value (e.g. ``<div hidden>``).

    Returns:
        A dict mapping attribute names to their values.  The ``"style"``
        entry, when present, is itself a dict of property name -> value.
    """
    clean = {}
    for key, value in attrs:
        if key != "style":
            clean[key] = value
            continue

        # Create the nested dict up front; merging into an existing one keeps
        # earlier declarations if the tag repeats the style attribute.
        styles = clean.setdefault(key, {})
        # A valueless ``style`` attribute is reported as None by the parser.
        for declaration in (value or "").split(";"):
            if not declaration.strip():
                continue
            # Split on the first colon only, so values that themselves
            # contain colons (e.g. ``url(http://...)``) stay intact.
            style_key, separator, style_value = declaration.partition(":")
            if not separator:
                # Malformed declaration without a colon; nothing to record.
                continue
            styles[style_key.strip()] = style_value.strip()
    return clean


class MyHTMLParser(HTMLParser):
    """Example parser that prints the text found inside ``<div>`` elements."""

    def __init__(self):
        super().__init__()
        # True while the most recently seen div tag was an opening tag.
        self.in_div = False

    def handle_starttag(self, tag, _attrs):
        """Record entry into a ``<div>``; attributes are cleaned for use."""
        # Cleaned attributes are computed here to show how clean_attrs is
        # meant to be used; this example does not inspect them further.
        attrs = clean_attrs(_attrs)
        if tag == "div":
            self.in_div = True

    def handle_endtag(self, tag):
        """Record leaving a ``<div>``."""
        if tag == "div":
            self.in_div = False

    def handle_data(self, data):
        """Print text data while inside a ``<div>``."""
        if self.in_div:
            print(data)


def parse(string):
    """Feed *string* to a fresh ``MyHTMLParser``, always closing the parser."""
    parser = MyHTMLParser()
    try:
        parser.feed(string)
    finally:
        parser.close()
See also
Python html.parser module documentation