|
1 | 1 | import logging |
2 | 2 | import warnings |
3 | 3 |
|
4 | | -from lxml.html import fromstring |
5 | | - |
6 | 4 | from extruct.jsonld import JsonLdExtractor |
7 | 5 | from extruct.rdfa import RDFaExtractor |
8 | 6 | from extruct.w3cmicrodata import MicrodataExtractor |
9 | 7 | from extruct.opengraph import OpenGraphExtractor |
10 | 8 | from extruct.microformat import MicroformatExtractor |
11 | | -from extruct.xmldom import XmlDomHTMLParser |
12 | 9 | from extruct.uniform import _umicrodata_microformat, _uopengraph |
13 | | - |
| 10 | +from extruct.utils import parse_xmldom_html |
14 | 11 |
|
15 | 12 | logger = logging.getLogger(__name__) |
16 | 13 | SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa'] |
@@ -52,8 +49,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8", |
52 | 49 | if errors not in ['log', 'ignore', 'strict']: |
53 | 50 | raise ValueError('Invalid error command, valid values are either "log"' |
54 | 51 | ', "ignore" or "strict"') |
55 | | - domparser = XmlDomHTMLParser(encoding=encoding) |
56 | | - tree = fromstring(htmlstring, parser=domparser) |
| 52 | + tree = parse_xmldom_html(htmlstring, encoding=encoding) |
57 | 53 | processors = [] |
58 | 54 | if 'microdata' in syntaxes: |
59 | 55 | processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree)) |
|
0 commit comments