Skip to content

Commit 4a7a538

Browse files
authored
Merge pull request #89 from scrapinghub/parse-cleanup
cleanup: extract repeated html loading code to functions
2 parents 7224d03 + bd484e9 commit 4a7a538

File tree

8 files changed: 37 additions and 25 deletions.

extruct/_extruct.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,13 @@
11
import logging
22
import warnings
33

4-
from lxml.html import fromstring
5-
64
from extruct.jsonld import JsonLdExtractor
75
from extruct.rdfa import RDFaExtractor
86
from extruct.w3cmicrodata import MicrodataExtractor
97
from extruct.opengraph import OpenGraphExtractor
108
from extruct.microformat import MicroformatExtractor
11-
from extruct.xmldom import XmlDomHTMLParser
129
from extruct.uniform import _umicrodata_microformat, _uopengraph
13-
10+
from extruct.utils import parse_xmldom_html
1411

1512
logger = logging.getLogger(__name__)
1613
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']
@@ -52,8 +49,7 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
5249
if errors not in ['log', 'ignore', 'strict']:
5350
raise ValueError('Invalid error command, valid values are either "log"'
5451
', "ignore" or "strict"')
55-
domparser = XmlDomHTMLParser(encoding=encoding)
56-
tree = fromstring(htmlstring, parser=domparser)
52+
tree = parse_xmldom_html(htmlstring, encoding=encoding)
5753
processors = []
5854
if 'microdata' in syntaxes:
5955
processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))

extruct/jsonld.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@
77
import re
88

99
import lxml.etree
10-
import lxml.html
1110

11+
from extruct.utils import parse_html
1212

1313
HTML_OR_JS_COMMENTLINE = re.compile('^\s*(//.*|<!--.*-->)')
1414

@@ -17,9 +17,8 @@ class JsonLdExtractor(object):
1717
_xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]')
1818

1919
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
20-
parser = lxml.html.HTMLParser(encoding=encoding)
21-
lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
22-
return self.extract_items(lxmldoc, base_url=base_url)
20+
tree = parse_html(htmlstring, encoding=encoding)
21+
return self.extract_items(tree, base_url=base_url)
2322

2423
def extract_items(self, document, base_url=None):
2524
return [item for items in map(self._extract_items,

extruct/opengraph.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
2-
import lxml.html
2+
3+
from extruct.utils import parse_html
34

45

56
_PREFIX_PATTERN = re.compile(r'\s*(\w+):\s*([^\s]+)')
@@ -17,9 +18,8 @@ class OpenGraphExtractor(object):
1718
"""OpenGraph extractor following extruct API."""
1819

1920
def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
20-
parser = lxml.html.HTMLParser(encoding=encoding)
21-
doc = lxml.html.fromstring(htmlstring, parser=parser)
22-
return list(self.extract_items(doc, base_url=base_url))
21+
tree = parse_html(htmlstring, encoding=encoding)
22+
return list(self.extract_items(tree, base_url=base_url))
2323

2424
def extract_items(self, document, base_url=None):
2525
# OpenGraph defines a web page as a single rich object.

extruct/rdfa.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,15 +6,15 @@
66
"""
77
import json
88
import logging
9+
910
rdflib_logger = logging.getLogger('rdflib')
1011
rdflib_logger.setLevel(logging.ERROR)
1112

12-
from lxml.html import fromstring
1313
from rdflib import Graph, logger as rdflib_logger
1414
from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger
1515
from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context
1616

17-
from extruct.xmldom import XmlDomHTMLParser
17+
from extruct.utils import parse_xmldom_html
1818

1919

2020
# silence rdflib/PyRdfa INFO logs
@@ -31,9 +31,7 @@ class RDFaExtractor(object):
3131

3232
def extract(self, htmlstring, base_url=None, encoding="UTF-8",
3333
expanded=True):
34-
35-
domparser = XmlDomHTMLParser(encoding=encoding)
36-
tree = fromstring(htmlstring, parser=domparser)
34+
tree = parse_xmldom_html(htmlstring, encoding=encoding)
3735
return self.extract_items(tree, base_url=base_url, expanded=expanded)
3836

3937
def extract_items(self, document, base_url=None, expanded=True):

extruct/tool.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
import argparse
22
import json
33
import requests
4+
45
import extruct
56
from extruct import SYNTAXES
67

7-
def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False,
8+
9+
def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False,
810
schema_context='http://schema.org', errors='strict'):
911
resp = requests.get(url, timeout=30)
1012
result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}

extruct/utils.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
# -*- coding: utf-8 -*-
2+
import lxml.html
3+
4+
from extruct.xmldom import XmlDomHTMLParser
5+
6+
7+
def parse_html(html, encoding):
8+
""" Parse HTML using lxml.html.HTMLParser, return a tree """
9+
parser = lxml.html.HTMLParser(encoding=encoding)
10+
return lxml.html.fromstring(html, parser=parser)
11+
12+
13+
def parse_xmldom_html(html, encoding):
14+
""" Parse HTML using XmlDomHTMLParser, return a tree """
15+
parser = XmlDomHTMLParser(encoding=encoding)
16+
return lxml.html.fromstring(html, parser=parser)

extruct/w3cmicrodata.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,10 @@
1616
from urllib.parse import urljoin
1717

1818
import lxml.etree
19-
import lxml.html
2019
from w3lib.html import strip_html5_whitespace
2120

21+
from extruct.utils import parse_html
22+
2223

2324
class LxmlMicrodataExtractor(object):
2425
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
@@ -42,9 +43,8 @@ def get_docid(self, node):
4243
return int(self._xp_item_docid(node))
4344

4445
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
45-
parser = lxml.html.HTMLParser(encoding=encoding)
46-
lxmldoc = lxml.html.fromstring(htmlstring, parser=parser)
47-
return self.extract_items(lxmldoc, base_url)
46+
tree = parse_html(htmlstring, encoding=encoding)
47+
return self.extract_items(tree, base_url)
4848

4949
def extract_items(self, document, base_url):
5050
items_seen = set()

extruct/xmldom.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,8 @@
33
from xml.dom import Node
44
from xml.dom.minidom import Attr, NamedNodeMap
55

6-
from lxml.etree import ElementBase, _ElementStringResult, _ElementUnicodeResult, XPath, tostring
6+
from lxml.etree import (ElementBase, _ElementStringResult,
7+
_ElementUnicodeResult, XPath, tostring)
78
from lxml.html import HTMLParser, HtmlElementClassLookup
89

910

0 commit comments

Comments
 (0)