From ca64127d5779fd638c5485cfbf4c23d530437dde Mon Sep 17 00:00:00 2001 From: Jesir Vargas Date: Fri, 11 Sep 2015 10:40:41 -0400 Subject: [PATCH 1/4] add test for getDocFromUrl() to make sure it actually runs; also, add previously undefined exception 'e' --- boilerpy/extractors.py | 6 ++-- tests/unittests.py | 77 +++++++++++++++++++++++++----------------- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py index 462c2f0..230cc1b 100644 --- a/boilerpy/extractors.py +++ b/boilerpy/extractors.py @@ -50,7 +50,7 @@ def getDocFromFile(self,filename): return self.getDoc(self.readFromFile(filename)) def getDocFromUrl(self,url): - return self.getDoc(self.readFromUrl(filename)) + return self.getDoc(self.readFromUrl(url)) def getDoc(self,text): doc=self.parseDoc(text) @@ -85,13 +85,13 @@ def parseDoc(self,inputStr): bpParser=parser.BoilerpipeHTMLParser() try: bpParser.feed(inputStr) - except: + except Exception as exc: #in case of error, try again, first removing script tag content bpParser=parser.BoilerpipeHTMLParser() inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?','',inputStr,0,re.DOTALL) try: bpParser.feed(inputStr) - except: + except Exception as e: print "Error parsing HTML : "+str(e) return None doc=bpParser.toTextDocument() diff --git a/tests/unittests.py b/tests/unittests.py index ac0a97e..1716a36 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -1,5 +1,8 @@ import unittest import sys + +import mock + from boilerpy.document import TextDocument,TextBlock from boilerpy.filters import * from boilerpy.extractors import Extractor @@ -18,7 +21,7 @@ def runOneTest(): class TestFilters(unittest.TestCase): defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. 
".split(' ') - + def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None): textBlocks=[] for idx,words in enumerate(wordsArr): @@ -45,26 +48,26 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None else: block.addLabel(label) except TypeError,IndexError: pass - + textBlocks.append(block) - + return TextDocument(textBlocks) - + def verifyContent(self,filtr,doc,contentArr,show=False): isContentBefore=[block.isContent() for block in doc.getTextBlocks()] isChanged=filtr.process(doc) isContent=[block.isContent() for block in doc.getTextBlocks()] self.assertEqual(isContent,contentArr) self.assertEqual(isChanged,isContent!=isContentBefore) - + def test_markEveryhingContent(self): doc=self.makedoc([5,100,80],None,[False,True,False]) self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True]) - + def test_inverted(self): doc=self.makedoc([5,100,80],None,[False,True,False]) self.verifyContent(InvertedFilter(),doc,[True,False,True]) - + def test_boilerplateBlock(self): #keeps if isContent doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False]) @@ -76,7 +79,7 @@ def test_boilerplateBlock(self): self.assertEqual(doc.getTextBlocks(),finalBlocks) self.assertEqual(isContent,[True,True]) self.assertEqual(isChanged,True) - + def test_minWords(self): #rejects if #words

THIS IS CONTENT

" doc=self.extractor.parseDoc(s) self.assertEqual(doc.getTitle(),titleText) - + def test_body(self): bodyText="THIS IS CONTENT" s="

NOT IN BODY

"+bodyText+"

" doc=self.extractor.parseDoc(s) textArr=[block.getText() for block in doc.getTextBlocks()] self.assertEqual(textArr,[bodyText]) - + def test_inline(self): template="

*

*

**
" content=['AA','BB','CC','DD'] doc=self.makedoc(template,content) - + blocks=doc.getTextBlocks() textArr=[block.getText() for block in blocks] numWords=[block.getNumWords() for block in blocks] self.assertEqual(textArr,[content[0],content[1],content[2]+content[3]]) - + def test_ignorable(self): template="

*

" content=self.makecontent([10,12]) doc=self.makedoc(template,content) - + blocks=doc.getTextBlocks() textArr=[block.getText() for block in blocks] self.assertEqual(textArr,[content[0]]) @@ -372,36 +375,36 @@ def test_textDensity(self): template="

*

*

" content=self.makecontent([80,"one, !!! two"]) doc=self.makedoc(template,content) - + blocks=doc.getTextBlocks() numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] - + #exact values are unknown, approximate value range to check self.assertEqual(blocks[0].getNumWords(),80) self.assertRange(blocks[0].numWordsInWrappedLines,60,80) self.assertRange(blocks[0].numWrappedLines,4,7) self.assertRange(blocks[0].getTextDensity(),8,16) - + self.assertEqual(numArr[1],[2,2,1,2]) - + def test_blockIdxs(self): template="

*

*

*

*

" content=self.makecontent([11,12,13,14]) doc=self.makedoc(template,content) - + blocks=doc.getTextBlocks() idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) - + def test_tagLevel(self): template="

*

*
" content=self.makecontent([5,6]) doc=self.makedoc(template,content) - + blocks=doc.getTextBlocks() levelArr=[block.getTagLevel() for block in blocks] self.assertEqual(levelArr,[5,3]) - + def test_merge(self): block1=TextBlock("AA BB CC ",set([0]),3,3,3,1,0) block2=TextBlock("DD EE FF GG HH II JJ .",set([1]),6,0,6,2,1) @@ -417,4 +420,16 @@ def test_merge(self): self.assertEqual(block1.getOffsetBlocksStart(),0) self.assertEqual(block1.getOffsetBlocksEnd(),1) + + def test_getDocFromUrl(self): + """getDocFromUrl() should run (was dying because of undefined 'filename')""" + url = "http://www.example.com/" + fake_readFromUrl = mock.Mock(return_value=u"

Example

") + tmp_filter = MarkEverythingContentFilter() + + with mock.patch.object(self.extractor, "readFromUrl", fake_readFromUrl): + with mock.patch.object(self.extractor, "filter", tmp_filter): + self.assertIsInstance(self.extractor.getDocFromUrl(url), TextDocument) + + runTests() From 9b9dcda1e3213ebad0b16cda029ae8a940781b49 Mon Sep 17 00:00:00 2001 From: Jesir Vargas Date: Fri, 6 Apr 2018 08:40:54 -0400 Subject: [PATCH 2/4] quick port to py3 using futurize, with minor fixes and style cleanups --- boilerpy/__init__.py | 3 +-- boilerpy/document.py | 11 ++++++----- boilerpy/extractors.py | 10 ++++++---- boilerpy/filters.py | 6 +++--- boilerpy/parser.py | 17 ++++++++--------- tests/unittests.py | 11 +++++------ 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/boilerpy/__init__.py b/boilerpy/__init__.py index a796300..96ca1bd 100644 --- a/boilerpy/__init__.py +++ b/boilerpy/__init__.py @@ -16,5 +16,4 @@ # * See the License for the specific language governing permissions and # * limitations under the License. # - -import extractors,filters,parser,document \ No newline at end of file +from . import extractors, filters, parser, document diff --git a/boilerpy/document.py b/boilerpy/document.py index 8c6852d..c8e8df5 100644 --- a/boilerpy/document.py +++ b/boilerpy/document.py @@ -17,7 +17,8 @@ # * limitations under the License. # # package: de.l3s.boilerpipe.document -import copy,sys +import copy +import sys # # * Some pre-defined labels which can be used in conjunction with @@ -150,8 +151,8 @@ def initDensities(self): if self.numWordsInWrappedLines == 0: self.numWordsInWrappedLines = self.numWords self.numWrappedLines = 1 - self.textDensity = self.numWordsInWrappedLines / float(self.numWrappedLines) - self.linkDensity = 0 if self.numWords==0 else self.numWordsInAnchorText / float(self.numWords) + self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines + self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords def isContent(self): """ generated source for method isContent """ @@ -294,7 +295,7 @@ def setTagLevel(self, tagLevel): self.tagLevel = tagLevel TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1) -TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxint) +TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize) @@ -325,7 +326,7 @@ def __init__(self, doc, contentOnly): # def avgNumWords(self): """ generated source for method avgNumWords """ - return self.numWords / float(self.numBlocks) + return self.numWords / self.numBlocks # # * Returns the overall number of words in all blocks. diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py index 230cc1b..aa72175 100644 --- a/boilerpy/extractors.py +++ b/boilerpy/extractors.py @@ -27,10 +27,12 @@ from xml.sax import parseString, SAXException -import HTMLParser +import html.parser from . import filters from . 
import parser -import urllib2 +import urllib.request +import urllib.error +import urllib.parse import re class Extractor(object): @@ -67,7 +69,7 @@ def readFromFile(self,filename): return text def readFromUrl(self,url): - f=urllib2.urlopen(url) + f = urllib.request.urlopen(url) text=f.read() encoding=self.getUrlEncoding(f) f.close() @@ -92,7 +94,7 @@ def parseDoc(self,inputStr): try: bpParser.feed(inputStr) except Exception as e: - print "Error parsing HTML : "+str(e) + print("Error parsing HTML : " + str(e)) return None doc=bpParser.toTextDocument() return doc diff --git a/boilerpy/filters.py b/boilerpy/filters.py index c2885bb..43d04e3 100644 --- a/boilerpy/filters.py +++ b/boilerpy/filters.py @@ -59,7 +59,7 @@ import re from . import document -from document import DefaultLabels +from .document import DefaultLabels # Boilerpipe abstract interface @@ -72,11 +72,11 @@ def subtractBlocks(self,blockArr,blocksToRemove): if len(blocksToRemove)==0: return blockArr newBlockArr=[] removeIter=iter(blocksToRemove) - curBlockToRemove=removeIter.next() + curBlockToRemove = next(removeIter) for idx,block in enumerate(blockArr): if block==curBlockToRemove: try: - curBlockToRemove=removeIter.next() + curBlockToRemove = next(removeIter) except StopIteration: #add the rest newBlockArr.extend(blockArr[idx+1:]) diff --git a/boilerpy/parser.py b/boilerpy/parser.py index 5f07449..5e90c43 100644 --- a/boilerpy/parser.py +++ b/boilerpy/parser.py @@ -17,10 +17,10 @@ # * limitations under the License. # -from HTMLParser import HTMLParser +from html.parser import HTMLParser from xml.sax import ContentHandler from . import document -from document import DefaultLabels +from .document import DefaultLabels import re @@ -146,7 +146,7 @@ def start(self, contentHandler, tagName, attrs): sizeAttr = attrs.getValue("size") size=None if sizeAttr != None: - match=PAT_FONT_SIZE.match(sizeAttr) + match = self.PAT_FONT_SIZE.match(sizeAttr) if match!=None: rel=match.group(0) val=match.group(1) @@ -293,13 +293,13 @@ def changesTagLevel(self): def getAncestorLabels(self): """ generated source for method getAncestorLabels """ labelSet = set() - for labels in labelStack: + for labels in self.labelStack: if labels == None:continue labelSet.update(labels) return labelSet -class CommonTagActions: +class CommonTagActions(object): TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() TA_ANCHOR_TEXT=AnchorTextTagAction() TA_BODY=BodyTagAction() @@ -374,7 +374,7 @@ def addTo(self, textBlock): if self.condition(textBlock): self.addLabelsTo(textBlock) -class SpecialTokens: +class SpecialTokens(object): ANCHOR_TEXT_START = u'\ue00astart' ANCHOR_TEXT_END = u'\ue00aend' @@ -397,9 +397,8 @@ class BoilerpipeBaseParser(object): EVENT_CHARACTERS=2 EVENT_WHITESPACE=3 #all word characters except underscore -- i.e. 
not (not word or underscore) - PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]",re.UNICODE) -# PAT_WORD = re.compile(r"\ue00a?[\w]+",re.UNICODE) - PAT_WORD = re.compile(ur"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+",re.UNICODE) + PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE) + PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE) """ generated source for class BoilerpipeHTMLContentHandler """ # diff --git a/tests/unittests.py b/tests/unittests.py index 1716a36..96e367b 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -1,7 +1,6 @@ import unittest import sys - -import mock +from unittest import mock from boilerpy.document import TextDocument,TextBlock from boilerpy.filters import * @@ -33,12 +32,12 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None numWords=text.count(' ') try: numAnchorWords=numAnchorWordsArr[idx] - except TypeError,IndexError: + except (TypeError, IndexError): numAnchorWords=0 block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) try: block.setIsContent(isContentArr[idx]) - except TypeError,IndexError: + except (TypeError, IndexError): pass try: label=labelArr[idx] @@ -46,7 +45,7 @@ def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None elif type(label)==list: for l in label: block.addLabel(l) else: block.addLabel(label) - except TypeError,IndexError: + except (TypeError, IndexError): pass textBlocks.append(block) @@ -414,7 +413,7 @@ def test_merge(self): self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") self.assertEqual(block1.getNumWords(),9) self.assertEqual(block1.getNumWordsInAnchorText(),3) - self.assertAlmostEqual(block1.getLinkDensity(),1.0/3.0) + self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0) self.assertEqual(block1.getTextDensity(),3) self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) self.assertEqual(block1.getOffsetBlocksStart(),0) From 93fa23e7ef395b0124d4a81f1bafa99bc573adf8 Mon Sep 17 00:00:00 2001 From: Jesir Vargas Date: Fri, 6 Apr 2018 08:47:12 -0400 Subject: [PATCH 3/4] replace tabs with spaces across the board --- .gitattributes | 10 +- README.txt | 14 +- boilerpy/__init__.py | 2 +- boilerpy/document.py | 554 ++++++++-------- boilerpy/extractors.py | 192 +++--- boilerpy/filters.py | 1382 ++++++++++++++++++++-------------------- boilerpy/parser.py | 1042 +++++++++++++++--------------- setup.py | 34 +- tests/unittests.py | 828 ++++++++++++------------ 9 files changed, 2029 insertions(+), 2029 deletions(-) diff --git a/.gitattributes b/.gitattributes index 412eeda..2431c40 100644 --- a/.gitattributes +++ b/.gitattributes @@ -10,13 +10,13 @@ *.dbproj merge=union # Standard to msysgit -*.doc diff=astextplain -*.DOC diff=astextplain +*.doc diff=astextplain +*.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain -*.PDF diff=astextplain -*.rtf diff=astextplain -*.RTF diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/README.txt b/README.txt index bf0b259..078fa4b 100644 --- a/README.txt +++ b/README.txt @@ -20,19 +20,19 @@ Installation BoilerPy was packaged with distutils. 
It can be installed from the command-line with the following line: - ``>python setup.py install`` + ``>python setup.py install`` Usage --------------------------------------- - ``import boilerpy`` + ``import boilerpy`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromUrl('http://www.example.com/')`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContentFromFile('site/example.html')`` - ``htmlText='

Example

'`` - ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContent(htmlText)`` + ``htmlText='

Example

'`` + ``boilerpy.extractors.ARTICLE_EXTRACTOR.getContent(htmlText)`` @@ -83,4 +83,4 @@ A full-text extractor which is tuned towards extracting sentences from news arti Version --------------------------------------- -1.0 - Created 14 Feb 2013 \ No newline at end of file +1.0 - Created 14 Feb 2013 diff --git a/boilerpy/__init__.py b/boilerpy/__init__.py index 96ca1bd..6d36c52 100644 --- a/boilerpy/__init__.py +++ b/boilerpy/__init__.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/boilerpy/document.py b/boilerpy/document.py index c8e8df5..9f953ff 100644 --- a/boilerpy/document.py +++ b/boilerpy/document.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -27,14 +27,14 @@ # * @author Christian Kohlschtter # class DefaultLabels(object): - """ generated source for class DefaultLabels """ - TITLE = "de.l3s.boilerpipe/TITLE" - ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA" - INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT" - MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT" - STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT" - HR = "de.l3s.boilerpipe/HR" - MARKUP_PREFIX = "<" + """ generated source for class DefaultLabels """ + TITLE = "de.l3s.boilerpipe/TITLE" + ARTICLE_METADATA = "de.l3s.boilerpipe/ARTICLE_METADATA" + INDICATES_END_OF_TEXT = "de.l3s.boilerpipe/INDICATES_END_OF_TEXT" + MIGHT_BE_CONTENT = "de.l3s.boilerpipe/MIGHT_BE_CONTENT" + STRICTLY_NOT_CONTENT = "de.l3s.boilerpipe/STRICTLY_NOT_CONTENT" + HR = "de.l3s.boilerpipe/HR" + MARKUP_PREFIX = "<" # # * A text document, consisting of one or more {@link TextBlock}s. @@ -42,77 +42,77 @@ class DefaultLabels(object): # * @author Christian Kohlschtter # class TextDocument(object): - # * Creates a new {@link TextDocument} with given {@link TextBlock}s and - # * given title. - # * - # * @param title - # * The "main" title for this text document. - # * @param textBlocks - # * The text blocks of this document. - def __init__(self, textBlocks, title=None): - self.title = title - self.textBlocks = textBlocks - - # * Returns the {@link TextBlock}s of this document. - # * - # * @return A list of {@link TextBlock}s, in sequential order of appearance. - # - def getTextBlocks(self): - """ generated source for method getTextBlocks """ - return self.textBlocks - - def setTextBlocks(self,textBlocks): self.textBlocks=textBlocks - - # - # * Returns the "main" title for this document, or null if no - # * such title has ben set. - # * - # * @return The "main" title. - def getTitle(self): - """ generated source for method getTitle """ - return self.title - - # - # * Updates the "main" title for this document. - # * - # * @param title - def setTitle(self, title): - """ generated source for method setTitle """ - self.title = title - - # - # * Returns the {@link TextDocument}'s content. - # * - # * @return The content text. 
- def getContent(self): - """ generated source for method getContent """ - return self.getText(True, False) - - # - # * Returns the {@link TextDocument}'s content, non-content or both - # * - # * @param includeContent Whether to include TextBlocks marked as "content". - # * @param includeNonContent Whether to include TextBlocks marked as "non-content". - # * @return The text. - def getText(self, includeContent, includeNonContent): - sb = "" - for block in self.getTextBlocks(): - if block.isContent(): - if not includeContent: - continue - else: - if not includeNonContent: - continue - sb+=block.getText()+'\n' - return sb - - # * Returns detailed debugging information about the contained {@link TextBlock}s. - # * @return Debug information. - def debugString(self): - sb = "" - for tb in self.getTextBlocks(): - sb+=str(tb)+"\n" - return sb + # * Creates a new {@link TextDocument} with given {@link TextBlock}s and + # * given title. + # * + # * @param title + # * The "main" title for this text document. + # * @param textBlocks + # * The text blocks of this document. + def __init__(self, textBlocks, title=None): + self.title = title + self.textBlocks = textBlocks + + # * Returns the {@link TextBlock}s of this document. + # * + # * @return A list of {@link TextBlock}s, in sequential order of appearance. + # + def getTextBlocks(self): + """ generated source for method getTextBlocks """ + return self.textBlocks + + def setTextBlocks(self,textBlocks): self.textBlocks=textBlocks + + # + # * Returns the "main" title for this document, or null if no + # * such title has ben set. + # * + # * @return The "main" title. + def getTitle(self): + """ generated source for method getTitle """ + return self.title + + # + # * Updates the "main" title for this document. + # * + # * @param title + def setTitle(self, title): + """ generated source for method setTitle """ + self.title = title + + # + # * Returns the {@link TextDocument}'s content. + # * + # * @return The content text. + def getContent(self): + """ generated source for method getContent """ + return self.getText(True, False) + + # + # * Returns the {@link TextDocument}'s content, non-content or both + # * + # * @param includeContent Whether to include TextBlocks marked as "content". + # * @param includeNonContent Whether to include TextBlocks marked as "non-content". + # * @return The text. + def getText(self, includeContent, includeNonContent): + sb = "" + for block in self.getTextBlocks(): + if block.isContent(): + if not includeContent: + continue + else: + if not includeNonContent: + continue + sb+=block.getText()+'\n' + return sb + + # * Returns detailed debugging information about the contained {@link TextBlock}s. + # * @return Debug information. 
+ def debugString(self): + sb = "" + for tb in self.getTextBlocks(): + sb+=str(tb)+"\n" + return sb @@ -128,171 +128,171 @@ def debugString(self): # class TextBlock(object): - """ generated source for class TextBlock """ - - def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0): - self._isContent = False - self.labels = set() - self.numFullTextWords = 0 - self.tagLevel = 0 - - self.text = text - self.containedTextElements = containedTextElements - self.numWords = numWords - self.numWordsInAnchorText = numWordsInAnchorText - self.numWordsInWrappedLines = numWordsInWrappedLines - self.numWrappedLines = numWrappedLines - self.offsetBlocksStart = offsetBlocks - self.offsetBlocksEnd = offsetBlocks - self.initDensities() - - def initDensities(self): - """ generated source for method initDensities """ - if self.numWordsInWrappedLines == 0: - self.numWordsInWrappedLines = self.numWords - self.numWrappedLines = 1 - self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines - self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords - - def isContent(self): - """ generated source for method isContent """ - return self._isContent - - def setIsContent(self, isContent): - """ generated source for method setIsContent """ - if isContent != self._isContent: - self._isContent = isContent - return True - else: - return False - - def getText(self): - """ generated source for method getText """ - return self.text - - def getNumWords(self): - """ generated source for method getNumWords """ - return self.numWords - - def getNumWordsInAnchorText(self): - """ generated source for method getNumWordsInAnchorText """ - return self.numWordsInAnchorText - - def getTextDensity(self): - """ generated source for method getTextDensity """ - return self.textDensity - - def getLinkDensity(self): - """ generated source for method getLinkDensity """ - return self.linkDensity - - def mergeNext(self, nextTextBlock): - """ generated source for method mergeNext """ - if self.text==None: self.text="" - self.text+='\n'+nextTextBlock.text - self.numWords += nextTextBlock.numWords - self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText - self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines - self.numWrappedLines += nextTextBlock.numWrappedLines - self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart) - self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd) - self.initDensities() - self._isContent |= nextTextBlock.isContent() - self.containedTextElements|=nextTextBlock.containedTextElements - self.numFullTextWords += nextTextBlock.numFullTextWords - self.labels|=nextTextBlock.labels - self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel) - - def getOffsetBlocksStart(self): - """ generated source for method getOffsetBlocksStart """ - return self.offsetBlocksStart - - def getOffsetBlocksEnd(self): - """ generated source for method getOffsetBlocksEnd """ - return self.offsetBlocksEnd - - def __repr__(self): - """ generated source for method toString """ - return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText()) - - # - # * Adds an arbitrary String label to 
this {@link TextBlock}. - # * - # * @param label The label - # - def addLabel(self, label): - """ generated source for method addLabel """ - self.labels.add(label) - - # - # * Checks whether this TextBlock has the given label. - # * - # * @param label The label - # * @return true if this block is marked by the given label. - # - def hasLabel(self, label): - """ generated source for method hasLabel """ - return label in self.labels - - def removeLabel(self, label): - """ generated source for method removeLabel """ - try: - self.labels.remove(label) - return True - except KeyError: - return False - - # - # * Returns the labels associated to this TextBlock, or null if no such labels - # * exist. - # * - # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} - # * whenever possible. - # * - # * @return Returns the set of labels, or null if no labels was added yet. - # - def getLabels(self): - """ generated source for method getLabels """ - return self.labels - - # - # * Adds a set of labels to this {@link TextBlock}. - # * null-references are silently ignored. - # * - # * @param labels The labels to be added. - # - def addLabels(self, *labels): - """ generated source for method addLabels """ - if len(labels)==0 or labels[0] == None: return - if self.labels == None: self.labels = set() - elif len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0]) - else: self.labels|=set(labels) - - - # - # * Returns the containedTextElements BitSet, or null. - # * @return - # - def getContainedTextElements(self): - """ generated source for method getContainedTextElements """ - return self.containedTextElements - - def clone(self): - try: - clone = copy.copy(self) - except copy.error: - raise copy.error - if self.labels != None: clone.labels = self.labels.copy() - if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy() - return clone - - def getTagLevel(self): - """ generated source for method getTagLevel """ - return self.tagLevel - - def setTagLevel(self, tagLevel): - """ generated source for method setTagLevel """ - self.tagLevel = tagLevel + """ generated source for class TextBlock """ + + def __init__(self, text, containedTextElements=set(), numWords=0, numWordsInAnchorText=0, numWordsInWrappedLines=0, numWrappedLines=0, offsetBlocks=0): + self._isContent = False + self.labels = set() + self.numFullTextWords = 0 + self.tagLevel = 0 + + self.text = text + self.containedTextElements = containedTextElements + self.numWords = numWords + self.numWordsInAnchorText = numWordsInAnchorText + self.numWordsInWrappedLines = numWordsInWrappedLines + self.numWrappedLines = numWrappedLines + self.offsetBlocksStart = offsetBlocks + self.offsetBlocksEnd = offsetBlocks + self.initDensities() + + def initDensities(self): + """ generated source for method initDensities """ + if self.numWordsInWrappedLines == 0: + self.numWordsInWrappedLines = self.numWords + self.numWrappedLines = 1 + self.textDensity = self.numWordsInWrappedLines / self.numWrappedLines + self.linkDensity = 0 if self.numWords == 0 else self.numWordsInAnchorText / self.numWords + + def isContent(self): + """ generated source for method isContent """ + return self._isContent + + def setIsContent(self, isContent): + """ generated source for method setIsContent """ + if isContent != self._isContent: + self._isContent = isContent + return True + else: + return False + + def getText(self): + """ generated source for method 
getText """ + return self.text + + def getNumWords(self): + """ generated source for method getNumWords """ + return self.numWords + + def getNumWordsInAnchorText(self): + """ generated source for method getNumWordsInAnchorText """ + return self.numWordsInAnchorText + + def getTextDensity(self): + """ generated source for method getTextDensity """ + return self.textDensity + + def getLinkDensity(self): + """ generated source for method getLinkDensity """ + return self.linkDensity + + def mergeNext(self, nextTextBlock): + """ generated source for method mergeNext """ + if self.text==None: self.text="" + self.text+='\n'+nextTextBlock.text + self.numWords += nextTextBlock.numWords + self.numWordsInAnchorText += nextTextBlock.numWordsInAnchorText + self.numWordsInWrappedLines += nextTextBlock.numWordsInWrappedLines + self.numWrappedLines += nextTextBlock.numWrappedLines + self.offsetBlocksStart = min(self.offsetBlocksStart, nextTextBlock.offsetBlocksStart) + self.offsetBlocksEnd = max(self.offsetBlocksEnd, nextTextBlock.offsetBlocksEnd) + self.initDensities() + self._isContent |= nextTextBlock.isContent() + self.containedTextElements|=nextTextBlock.containedTextElements + self.numFullTextWords += nextTextBlock.numFullTextWords + self.labels|=nextTextBlock.labels + self.tagLevel = min(self.tagLevel, nextTextBlock.tagLevel) + + def getOffsetBlocksStart(self): + """ generated source for method getOffsetBlocksStart """ + return self.offsetBlocksStart + + def getOffsetBlocksEnd(self): + """ generated source for method getOffsetBlocksEnd """ + return self.offsetBlocksEnd + + def __repr__(self): + """ generated source for method toString """ + return "[" + str(self.offsetBlocksStart) + "-" + str(self.offsetBlocksEnd) + ";tl=" + str(self.tagLevel) + "; nw=" + str(self.numWords) + ";nwl=" + str(self.numWrappedLines) + ";ld=" + str(self.linkDensity) + "]\t" + ("CONTENT" if self.isContent else "boilerplate") + "," + str(self.labels) + "\n" + str(self.getText()) + + # + # * Adds an arbitrary String label to this {@link TextBlock}. + # * + # * @param label The label + # + def addLabel(self, label): + """ generated source for method addLabel """ + self.labels.add(label) + + # + # * Checks whether this TextBlock has the given label. + # * + # * @param label The label + # * @return true if this block is marked by the given label. + # + def hasLabel(self, label): + """ generated source for method hasLabel """ + return label in self.labels + + def removeLabel(self, label): + """ generated source for method removeLabel """ + try: + self.labels.remove(label) + return True + except KeyError: + return False + + # + # * Returns the labels associated to this TextBlock, or null if no such labels + # * exist. + # * + # * to the data structure. However it is recommended to use the label-specific methods in {@link TextBlock} + # * whenever possible. + # * + # * @return Returns the set of labels, or null if no labels was added yet. + # + def getLabels(self): + """ generated source for method getLabels """ + return self.labels + + # + # * Adds a set of labels to this {@link TextBlock}. + # * null-references are silently ignored. + # * + # * @param labels The labels to be added. 
+ # + def addLabels(self, *labels): + """ generated source for method addLabels """ + if len(labels)==0 or labels[0] == None: return + if self.labels == None: self.labels = set() + elif len(labels)==1 and (type(labels[0])==set or type(labels[0])==list): self.labels|=set(labels[0]) + else: self.labels|=set(labels) + + + # + # * Returns the containedTextElements BitSet, or null. + # * @return + # + def getContainedTextElements(self): + """ generated source for method getContainedTextElements """ + return self.containedTextElements + + def clone(self): + try: + clone = copy.copy(self) + except copy.error: + raise copy.error + if self.labels != None: clone.labels = self.labels.copy() + if self.containedTextElements != None: clone.containedTextElements = self.containedTextElements.copy() + return clone + + def getTagLevel(self): + """ generated source for method getTagLevel """ + return self.tagLevel + + def setTagLevel(self, tagLevel): + """ generated source for method setTagLevel """ + self.tagLevel = tagLevel TextBlock.EMPTY_START = TextBlock("", set(), 0, 0, 0, 0, -1) TextBlock.EMPTY_END = TextBlock("", set(), 0, 0, 0, 0, sys.maxsize) @@ -304,35 +304,35 @@ def setTagLevel(self, tagLevel): # * @author Christian Kohlschuetter # class TextDocumentStatistics(object): - # - # * Computes statistics on a given {@link TextDocument}. - # * - # * @param doc The {@link TextDocument}. - # * @param contentOnly if true then o - # - def __init__(self, doc, contentOnly): - self.numWords=0 - self.numBlocks=0 - for tb in doc.getTextBlocks(): - if contentOnly and not tb.isContent(): continue - self.numWords += tb.getNumWords() - self.numBlocks += 1 - - - # * Returns the average number of words at block-level (= overall number of words divided by - # * the number of blocks). - # * - # * @return Average - # - def avgNumWords(self): - """ generated source for method avgNumWords """ - return self.numWords / self.numBlocks - - # - # * Returns the overall number of words in all blocks. - # * - # * @return Sum - # - def getNumWords(self): - """ generated source for method getNumWords """ - return self.numWords + # + # * Computes statistics on a given {@link TextDocument}. + # * + # * @param doc The {@link TextDocument}. + # * @param contentOnly if true then o + # + def __init__(self, doc, contentOnly): + self.numWords=0 + self.numBlocks=0 + for tb in doc.getTextBlocks(): + if contentOnly and not tb.isContent(): continue + self.numWords += tb.getNumWords() + self.numBlocks += 1 + + + # * Returns the average number of words at block-level (= overall number of words divided by + # * the number of blocks). + # * + # * @return Average + # + def avgNumWords(self): + """ generated source for method avgNumWords """ + return self.numWords / self.numBlocks + + # + # * Returns the overall number of words in all blocks. + # * + # * @return Sum + # + def getNumWords(self): + """ generated source for method getNumWords """ + return self.numWords diff --git a/boilerpy/extractors.py b/boilerpy/extractors.py index aa72175..d3e95e7 100644 --- a/boilerpy/extractors.py +++ b/boilerpy/extractors.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. 
You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -36,68 +36,68 @@ import re class Extractor(object): - def __init__(self,filtr): - self.filter=filtr - - def getContent(self, text): - return self.getDoc(text).getContent() - - def getContentFromUrl(self, url): - return self.getDocFromUrl(url).getContent() - - def getContentFromFile(self, filename): - return self.getDocFromFile(filename).getContent() - - def getDocFromFile(self,filename): - return self.getDoc(self.readFromFile(filename)) - - def getDocFromUrl(self,url): - return self.getDoc(self.readFromUrl(url)) - - def getDoc(self,text): - doc=self.parseDoc(text) - self.filter.process(doc) - return doc - - def readFromFile(self,filename): - f=open(filename,'r') - text=f.read() - f.close() - try: - text=text.decode('utf8') - except UnicodeDecodeError: pass - return text - - def readFromUrl(self,url): - f = urllib.request.urlopen(url) - text=f.read() - encoding=self.getUrlEncoding(f) - f.close() - try: - text=text.decode(encoding) - except UnicodeDecodeError: pass - return text - - def getUrlEncoding(self,f): - try: - return f.headers['content-type'].split('charset=')[1].split(';')[0] - except: return 'utf8' - - def parseDoc(self,inputStr): - bpParser=parser.BoilerpipeHTMLParser() - try: - bpParser.feed(inputStr) - except Exception as exc: - #in case of error, try again, first removing script tag content - bpParser=parser.BoilerpipeHTMLParser() - inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?','',inputStr,0,re.DOTALL) - try: - bpParser.feed(inputStr) - except Exception as e: - print("Error parsing HTML : " + str(e)) - return None - doc=bpParser.toTextDocument() - return doc + def __init__(self,filtr): + self.filter=filtr + + def getContent(self, text): + return self.getDoc(text).getContent() + + def getContentFromUrl(self, url): + return self.getDocFromUrl(url).getContent() + + def getContentFromFile(self, filename): + return self.getDocFromFile(filename).getContent() + + def getDocFromFile(self,filename): + return self.getDoc(self.readFromFile(filename)) + + def getDocFromUrl(self,url): + return self.getDoc(self.readFromUrl(url)) + + def getDoc(self,text): + doc=self.parseDoc(text) + self.filter.process(doc) + return doc + + def readFromFile(self,filename): + f=open(filename,'r') + text=f.read() + f.close() + try: + text=text.decode('utf8') + except UnicodeDecodeError: pass + return text + + def readFromUrl(self,url): + f = urllib.request.urlopen(url) + text=f.read() + encoding=self.getUrlEncoding(f) + f.close() + try: + text=text.decode(encoding) + except UnicodeDecodeError: pass + return text + + def getUrlEncoding(self,f): + try: + return f.headers['content-type'].split('charset=')[1].split(';')[0] + except: return 'utf8' + + def parseDoc(self,inputStr): + bpParser=parser.BoilerpipeHTMLParser() + try: + bpParser.feed(inputStr) + except Exception as exc: + #in case of error, try again, first removing script tag content + bpParser=parser.BoilerpipeHTMLParser() + inputStr=re.sub(r'<(?:script|SCRIPT)[^>]*>.*?','',inputStr,0,re.DOTALL) + try: + bpParser.feed(inputStr) + except Exception as e: + print("Error parsing HTML : " + str(e)) + return None + doc=bpParser.toTextDocument() + return doc @@ -105,28 +105,28 @@ def parseDoc(self,inputStr): # * A full-text extractor which is tuned towards news articles. 
In this scenario # * it achieves higher accuracy than {@link DefaultExtractor}. articleFilterChain=filters.FilterChain([ - filters.TerminatingBlocksFinder(), - filters.DocumentTitleMatchClassifier(None,True), - filters.NumWordsRulesClassifier(), - filters.IgnoreBlocksAfterContentFilter(), - filters.BlockProximityFusion(1,False,False), - filters.BoilerplateBlockFilter(), - filters.BlockProximityFusion(1,True,False), - filters.KeepLargestBlockFilter(), - filters.ExpandTitleToContentFilter() + filters.TerminatingBlocksFinder(), + filters.DocumentTitleMatchClassifier(None,True), + filters.NumWordsRulesClassifier(), + filters.IgnoreBlocksAfterContentFilter(), + filters.BlockProximityFusion(1,False,False), + filters.BoilerplateBlockFilter(), + filters.BlockProximityFusion(1,True,False), + filters.KeepLargestBlockFilter(), + filters.ExpandTitleToContentFilter() ]) -# * Works very well for most types of Article-like HTML. +# * Works very well for most types of Article-like HTML. ARTICLE_EXTRACTOR = Extractor(articleFilterChain) # class DefaultExtractor -# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. +# * Usually worse than {@link ArticleExtractor}, but simpler/no heuristics. # * A quite generic full-text extractor. defaultFilterChain=filters.FilterChain([ - filters.SimpleBlockFusionProcessor(), - filters.BlockProximityFusion(1,False,False), - filters.DensityRulesClassifier() + filters.SimpleBlockFusionProcessor(), + filters.BlockProximityFusion(1,False,False), + filters.DensityRulesClassifier() ]) DEFAULT_EXTRACTOR = Extractor(defaultFilterChain) @@ -137,19 +137,19 @@ def parseDoc(self,inputStr): # * For news articles, it may perform better than the {@link DefaultExtractor}, # * but usually worse than {@link ArticleExtractor}. largestContentFilterChain=filters.FilterChain([ - filters.NumWordsRulesClassifier(), - filters.BlockProximityFusion(1,False,False), - filters.KeepLargestBlockFilter() + filters.NumWordsRulesClassifier(), + filters.BlockProximityFusion(1,False,False), + filters.KeepLargestBlockFilter() ]) -# * Like {@link DefaultExtractor}, but keeps the largest text block only. +# * Like {@link DefaultExtractor}, but keeps the largest text block only. LARGEST_CONTENT_EXTRACTOR = Extractor(largestContentFilterChain) # class CanolaExtractor -# * Trained on krdwrd Canola (different definition of "boilerplate"). You may -# * give it a try. +# * Trained on krdwrd Canola (different definition of "boilerplate"). You may +# * give it a try. CANOLA_EXTRACTOR = Extractor(filters.CanolaFilter()) @@ -157,9 +157,9 @@ def parseDoc(self,inputStr): # class KeepEverythingExtractor # * Marks everything as content. -# * Dummy Extractor; should return the input text. Use this to double-check -# * that your problem is within a particular {@link BoilerpipeExtractor}, or -# * somewhere else. +# * Dummy Extractor; should return the input text. Use this to double-check +# * that your problem is within a particular {@link BoilerpipeExtractor}, or +# * somewhere else. KEEP_EVERYTHING_EXTRACTOR = Extractor(filters.MarkEverythingContentFilter()) @@ -176,9 +176,9 @@ def parseDoc(self,inputStr): # class ArticleSentencesExtractor # * A full-text extractor which is tuned towards extracting sentences from news articles. 
ARTICLE_SENTENCES_EXTRACTOR=Extractor(filters.FilterChain([ - articleFilterChain, - filters.SplitParagraphBlocksFilter(), - filters.MinClauseWordsFilter() + articleFilterChain, + filters.SplitParagraphBlocksFilter(), + filters.MinClauseWordsFilter() ])) @@ -186,10 +186,10 @@ def parseDoc(self,inputStr): # * For news articles, it may perform better than the {@link DefaultExtractor}, # * but usually worse than {@link ArticleExtractor}. class KeepEverythingWithMinKWordsFilter(filters.FilterChain): - def __init__(self, kMin): - filterArr = [ - filters.SimpleBlockFusionProcessor(), - filters.MarkEverythingContentFilter(), - filters.MinWordsFilter(kMin) - ] - super(KeepEverythingWithMinKWordsFilter, self).__init__(filters) + def __init__(self, kMin): + filterArr = [ + filters.SimpleBlockFusionProcessor(), + filters.MarkEverythingContentFilter(), + filters.MinWordsFilter(kMin) + ] + super(KeepEverythingWithMinKWordsFilter, self).__init__(filters) diff --git a/boilerpy/filters.py b/boilerpy/filters.py index 43d04e3..a9714ed 100644 --- a/boilerpy/filters.py +++ b/boilerpy/filters.py @@ -9,7 +9,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -64,37 +64,37 @@ # Boilerpipe abstract interface class BoilerpipeFilter(object): - def process(self, doc): pass - - def subtractBlocks(self,blockArr,blocksToRemove): - #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove) - #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr - if len(blocksToRemove)==0: return blockArr - newBlockArr=[] - removeIter=iter(blocksToRemove) - curBlockToRemove = next(removeIter) - for idx,block in enumerate(blockArr): - if block==curBlockToRemove: - try: - curBlockToRemove = next(removeIter) - except StopIteration: - #add the rest - newBlockArr.extend(blockArr[idx+1:]) - break - else: newBlockArr.append(block) - return newBlockArr + def process(self, doc): pass + + def subtractBlocks(self,blockArr,blocksToRemove): + #inefficient but in place: for block in blocksToRemove: blockArr.remove(blocksToRemove) + #efficiently subtracts second array from first assuming blocksToRemove shows up in the same order as blocArr + if len(blocksToRemove)==0: return blockArr + newBlockArr=[] + removeIter=iter(blocksToRemove) + curBlockToRemove = next(removeIter) + for idx,block in enumerate(blockArr): + if block==curBlockToRemove: + try: + curBlockToRemove = next(removeIter) + except StopIteration: + #add the rest + newBlockArr.extend(blockArr[idx+1:]) + break + else: newBlockArr.append(block) + return newBlockArr # chain together multiple filters in sequence class FilterChain(BoilerpipeFilter): - def __init__(self,filterArr): - super(FilterChain, self).__init__() - self.filterArr=filterArr - - def process(self,doc): - isUpdated=False - for filtr in self.filterArr: - isUpdated|=filtr.process(doc) - return isUpdated + def __init__(self,filterArr): + super(FilterChain, self).__init__() + self.filterArr=filterArr + + def process(self,doc): + isUpdated=False + for filtr in self.filterArr: + isUpdated|=filtr.process(doc) + return isUpdated #----------------------------------------------------------------------- @@ -109,14 +109,14 @@ def 
process(self,doc): # * @author Christian Kohlschtter # class MarkEverythingContentFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - changes = False - for tb in doc.getTextBlocks(): - if not tb.isContent(): - tb.setIsContent(True) - changes = True - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if not tb.isContent(): + tb.setIsContent(True) + changes = True + return changes # @@ -126,12 +126,12 @@ def process(self, doc): # class InvertedFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - tbs = doc.getTextBlocks() - if len(tbs)==0: return False - for tb in tbs: tb.setIsContent(not tb.isContent()) - return True + def process(self, doc): + """ generated source for method process """ + tbs = doc.getTextBlocks() + if len(tbs)==0: return False + for tb in tbs: tb.setIsContent(not tb.isContent()) + return True # @@ -140,14 +140,14 @@ def process(self, doc): # * @author Christian Kohlschtter # class BoilerplateBlockFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - newBlocks=[tb for tb in textBlocks if tb.isContent()] - hasChanges = len(newBlocks)= self.minWords: return True - return n >= self.minWords + def __init__(self, minWords=5, acceptClausesWithoutDelimiter=False): + super(MinClauseWordsFilter, self).__init__() + self.minWords = minWords + self.acceptClausesWithoutDelimiter = acceptClausesWithoutDelimiter + + PAT_CLAUSE_DELIMITER = re.compile(r"\b[\,\.\:\;\!\?]+(?:\s+|\Z)",re.UNICODE) + PAT_WHITESPACE = re.compile("\s+") + + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if not tb.isContent(): continue + hasClause = False + possibleClauseArr=self.PAT_CLAUSE_DELIMITER.split(tb.getText()) + for possibleClause in possibleClauseArr[:-1]: + hasClause = self.isClauseAccepted(possibleClause) + if hasClause: break + + # since clauses should *always end* with a delimiter, we normally + # don't consider text without one + if self.acceptClausesWithoutDelimiter: + hasClause |= self.isClauseAccepted(possibleClauseArr[-1]) + if not hasClause: + tb.setIsContent(False) + changes = True + # System.err.println("IS NOT CONTENT: " + text); + return changes + + def isClauseAccepted(self, text): + """ generated source for method isClause """ + n = 1 + for match in self.PAT_WHITESPACE.finditer(text): + n += 1 + if n >= self.minWords: return True + return n >= self.minWords # @@ -230,56 +230,56 @@ def isClauseAccepted(self, text): # * @see MinClauseWordsFilter # class SplitParagraphBlocksFilter(BoilerpipeFilter): - def process(self, doc): - changes = False - blocks = doc.getTextBlocks() - blocksNew = [] - for tb in blocks: - text = tb.getText(); - paragraphs = re.split(r"[\n\r]+",text) - if len(paragraphs)<2: - blocksNew.append(tb) - continue - isContent = tb.isContent() - labels = tb.getLabels() - for p in paragraphs: - tbP=document.TextBlock(p) - tbP.setIsContent(isContent) - tbP.addLabels(labels) - blocksNew.append(tbP) - changes = True - - if changes: doc.setTextBlocks(blocksNew) - return changes - + def process(self, doc): + changes = False + blocks = doc.getTextBlocks() + blocksNew = [] + for tb in blocks: + text = tb.getText(); + paragraphs = re.split(r"[\n\r]+",text) + if len(paragraphs)<2: + blocksNew.append(tb) + continue + isContent = tb.isContent() + labels 
= tb.getLabels() + for p in paragraphs: + tbP=document.TextBlock(p) + tbP.setIsContent(isContent) + tbP.addLabels(labels) + blocksNew.append(tbP) + changes = True + + if changes: doc.setTextBlocks(blocksNew) + return changes + class SurroundingToContentFilter(BoilerpipeFilter): - # this is now default when no arguments are passed - #INSTANCE_TEXT = SurroundingToContentFilter(TextBlockCondition()) - - #ctor - condition is an function for an additional condition to determine if it can be made content - def __init__(self, condition=lambda tb:tb.getLinkDensity()==0 and tb.getNumWords()>6): - super(SurroundingToContentFilter, self).__init__() - self.cond=condition - - def process(self, doc): - """ generated source for method process """ - tbs = doc.getTextBlocks() - n=len(tbs) - hasChanges=False - i=1 - while i6): + super(SurroundingToContentFilter, self).__init__() + self.cond=condition + + def process(self, doc): + """ generated source for method process """ + tbs = doc.getTextBlocks() + n=len(tbs) + hasChanges=False + i=1 + while i0: - newBlocks=self.subtractBlocks(textBlocks,blocksToRemove) - doc.setTextBlocks(newBlocks) - changes=True - - return changes + """ generated source for class BlockProximityFusion """ + #MAX_DISTANCE_1 = BlockProximityFusion(1, False, False) + #MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion(1, False, True) + #MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion(1, True, False) + #MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion(1, True, True) + + # + # * Creates a new {@link BlockProximityFusion} instance. + # * + # * @param maxBlocksDistance The maximum distance in blocks. + # * @param contentOnly + # + def __init__(self, maxBlocksDistance=1, contentOnly=False, sameTagLevelOnly=False): + """ generated source for method __init__ """ + super(BlockProximityFusion, self).__init__() + self.maxBlocksDistance = maxBlocksDistance + self.contentOnly = contentOnly + self.sameTagLevelOnly = sameTagLevelOnly + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + changes = False + + if self.contentOnly: + startIdx=None + for idx,block in enumerate(textBlocks): + if block.isContent(): + startIdx=idx + break + if startIdx == None: return False + else: + startIdx=0 + + prevBlock=textBlocks[startIdx] + blocksToRemove=[] + for block in textBlocks[startIdx+1:]: + if not block.isContent(): + prevBlock = block + continue + diffBlocks = block.getOffsetBlocksStart() - prevBlock.getOffsetBlocksEnd() - 1; + if diffBlocks <= self.maxBlocksDistance: + ok=True + if self.contentOnly: + if not prevBlock.isContent() or not block.isContent(): + ok = False + if self.sameTagLevelOnly and prevBlock.getTagLevel() != block.getTagLevel(): + ok = False + if ok: + prevBlock.mergeNext(block) + #remove current block + blocksToRemove.append(block) + changes = True + else: + prevBlock = block + else: + prevBlock = block + + if len(blocksToRemove)>0: + newBlocks=self.subtractBlocks(textBlocks,blocksToRemove) + doc.setTextBlocks(newBlocks) + changes=True + + return changes @@ -521,49 +521,49 @@ def process(self, doc): # * @author Christian Kohlschtter # class KeepLargestBlockFilter(BoilerpipeFilter): - """ generated source for class KeepLargestBlockFilter """ - #INSTANCE = KeepLargestBlockFilter(False) - #INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter(True) - - def __init__(self, expandToSameLevelText=False): - """ generated source for method __init__ """ - 
super(KeepLargestBlockFilter, self).__init__() - self.expandToSameLevelText = expandToSameLevelText - - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - - try: - contentBlockIter=(tb for tb in textBlocks if tb.isContent()) - largestBlock=max(contentBlockIter,key=lambda tb:tb.getNumWords()) - except ValueError: - #no content blocks exist / largest block not found - largestBlock=None - - for tb in textBlocks: - if tb == largestBlock: - tb.setIsContent(True) - else: - tb.setIsContent(False) - tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) - - if self.expandToSameLevelText and largestBlock!=None: - level = largestBlock.getTagLevel() - largestBlockIdx=textBlocks.index(largestBlock) - - for tb in textBlocks[largestBlockIdx::-1]: - tl=tb.getTagLevel() - if tl < level: break - elif tl == level: tb.setIsContent(True) - - for tb in textBlocks[largestBlockIdx:]: - tl=tb.getTagLevel() - if tl < level: break - elif tl == level: tb.setIsContent(True) - - return True + """ generated source for class KeepLargestBlockFilter """ + #INSTANCE = KeepLargestBlockFilter(False) + #INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter(True) + + def __init__(self, expandToSameLevelText=False): + """ generated source for method __init__ """ + super(KeepLargestBlockFilter, self).__init__() + self.expandToSameLevelText = expandToSameLevelText + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + + try: + contentBlockIter=(tb for tb in textBlocks if tb.isContent()) + largestBlock=max(contentBlockIter,key=lambda tb:tb.getNumWords()) + except ValueError: + #no content blocks exist / largest block not found + largestBlock=None + + for tb in textBlocks: + if tb == largestBlock: + tb.setIsContent(True) + else: + tb.setIsContent(False) + tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) + + if self.expandToSameLevelText and largestBlock!=None: + level = largestBlock.getTagLevel() + largestBlockIdx=textBlocks.index(largestBlock) + + for tb in textBlocks[largestBlockIdx::-1]: + tl=tb.getTagLevel() + if tl < level: break + elif tl == level: tb.setIsContent(True) + + for tb in textBlocks[largestBlockIdx:]: + tl=tb.getTagLevel() + if tl < level: break + elif tl == level: tb.setIsContent(True) + + return True # * Marks all {@link TextBlock}s "content" which are between the headline and the part that @@ -574,44 +574,44 @@ def process(self, doc): # * @author Christian Kohlschtter # class ExpandTitleToContentFilter(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - i = 0 - titleIdx = -1 - contentStart = -1 - for tb in doc.getTextBlocks(): - if contentStart == -1 and tb.hasLabel(DefaultLabels.TITLE): - titleIdx = i - if contentStart == -1 and tb.isContent(): - contentStart = i - i += 1 - - if contentStart <= titleIdx or titleIdx == -1: return False - - changes = False - for tb in doc.getTextBlocks()[titleIdx:contentStart]: - if tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT): - changes |= tb.setIsContent(True) - return changes + def process(self, doc): + """ generated source for method process """ + i = 0 + titleIdx = -1 + contentStart = -1 + for tb in doc.getTextBlocks(): + if contentStart == -1 and tb.hasLabel(DefaultLabels.TITLE): + titleIdx = i + if contentStart == -1 and tb.isContent(): + contentStart = i + i += 1 + + if contentStart <= titleIdx or titleIdx == -1: return False + + changes = False + for tb in 
doc.getTextBlocks()[titleIdx:contentStart]: + if tb.hasLabel(DefaultLabels.MIGHT_BE_CONTENT): + changes |= tb.setIsContent(True) + return changes class ArticleMetadataFilter(BoilerpipeFilter): - #checks for date/time/author blocks - PATTERNS_SHORT = [re.compile(r"^[0-9 \,\./]*\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\b[0-9 \,\:apm\./]*(?:[CPSDMGET]{2,3})?$"), re.compile("^[Bb]y ")]; - - def process(self, doc): - """ generated source for method process """ - changed = False - for tb in doc.getTextBlocks(): - if tb.getNumWords() > 10: continue - for p in self.PATTERNS_SHORT: - text = tb.getText() - if p.search(text): - changed = True - tb.setIsContent(True) - tb.addLabel(DefaultLabels.ARTICLE_METADATA) - break - return changed + #checks for date/time/author blocks + PATTERNS_SHORT = [re.compile(r"^[0-9 \,\./]*\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?\b[0-9 \,\:apm\./]*(?:[CPSDMGET]{2,3})?$"), re.compile("^[Bb]y ")]; + + def process(self, doc): + """ generated source for method process """ + changed = False + for tb in doc.getTextBlocks(): + if tb.getNumWords() > 10: continue + for p in self.PATTERNS_SHORT: + text = tb.getText() + if p.search(text): + changed = True + tb.setIsContent(True) + tb.addLabel(DefaultLabels.ARTICLE_METADATA) + break + return changed # @@ -620,36 +620,36 @@ def process(self, doc): # * @author Christian Kohlschtter # class AddPrecedingLabelsFilter(BoilerpipeFilter): - #INSTANCE = AddPrecedingLabelsFilter("") - #INSTANCE_PRE = AddPrecedingLabelsFilter("^") - - # - # * Creates a new {@link AddPrecedingLabelsFilter} instance. - # * - # * @param maxBlocksDistance The maximum distance in blocks. - # * @param contentOnly - # - def __init__(self, labelPrefix=""): - """ generated source for method __init__ """ - super(AddPrecedingLabelsFilter, self).__init__() - self.labelPrefix = labelPrefix - - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - changes = False - blockBelow = None - - for block in textBlocks[::-1]: - if blockBelow != None: - labels=block.getLabels() - if labels != None and len(labels)>0: - for l in labels: blockBelow.addLabel(self.labelPrefix + l) - changes = True - blockBelow = block - - return changes + #INSTANCE = AddPrecedingLabelsFilter("") + #INSTANCE_PRE = AddPrecedingLabelsFilter("^") + + # + # * Creates a new {@link AddPrecedingLabelsFilter} instance. + # * + # * @param maxBlocksDistance The maximum distance in blocks. 
+ # * @param contentOnly + # + def __init__(self, labelPrefix=""): + """ generated source for method __init__ """ + super(AddPrecedingLabelsFilter, self).__init__() + self.labelPrefix = labelPrefix + + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + changes = False + blockBelow = None + + for block in textBlocks[::-1]: + if blockBelow != None: + labels=block.getLabels() + if labels != None and len(labels)>0: + for l in labels: blockBelow.addLabel(self.labelPrefix + l) + changes = True + blockBelow = block + + return changes # @@ -661,67 +661,67 @@ def process(self, doc): # class DocumentTitleMatchClassifier(BoilerpipeFilter): - """ generated source for class DocumentTitleMatchClassifier """ - - def __init__(self, title, useDocTitle=False): - """ generated source for method __init__ """ - super(DocumentTitleMatchClassifier, self).__init__() - self.useDocTitle=useDocTitle - if useDocTitle: self.potentialTitles=None - else: self.potentialTitles=self.findPotentialTitles(title) - - def findPotentialTitles(self,title): - if title == None: return None - title = title.strip() - if len(title)==0: - return None - else: - potentialTitles = set() - potentialTitles.add(title) - p = self.getLongestPart(title, "[ ]*[\||:][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||:\(\)][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||:\(\)\-][ ]*") - if p != None: potentialTitles.add(p) - p = self.getLongestPart(title, "[ ]*[\||,|:\(\)\-][ ]*") - if p != None: potentialTitles.add(p) - return potentialTitles - - def getPotentialTitles(self): - """ generated source for method getPotentialTitles """ - return self.potentialTitles - - def getLongestPart(self, title, pattern): - """ generated source for method getLongestPart """ - parts = re.split(pattern,title) - if len(parts)==1: return None - - longestNumWords = 0 - longestPart = "" - for p in parts: - if ".com" in p: continue - numWords=self.getNumWords(p) - if numWords > longestNumWords or len(p)>len(longestPart): - longestNumWords = numWords - longestPart = p - if len(longestPart)==0: return None - else: return longestPart.strip() - - def getNumWords(self,text): - return len(re.findall("\w+",text,re.UNICODE)) - - def process(self, doc): - """ generated source for method process """ - if self.useDocTitle: self.potentialTitles=self.findPotentialTitles(doc.getTitle()) - if self.potentialTitles == None: return False - changes = False - for tb in doc.getTextBlocks(): - text=tb.getText().strip().lower() - if any(candidate.lower()==text for candidate in self.potentialTitles): - tb.addLabel(DefaultLabels.TITLE) - changes = True - return changes + """ generated source for class DocumentTitleMatchClassifier """ + + def __init__(self, title, useDocTitle=False): + """ generated source for method __init__ """ + super(DocumentTitleMatchClassifier, self).__init__() + self.useDocTitle=useDocTitle + if useDocTitle: self.potentialTitles=None + else: self.potentialTitles=self.findPotentialTitles(title) + + def findPotentialTitles(self,title): + if title == None: return None + title = title.strip() + if len(title)==0: + return None + else: + potentialTitles = set() + potentialTitles.add(title) + p = self.getLongestPart(title, "[ ]*[\||:][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ ]*[\||:\(\)][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ 
]*[\||:\(\)\-][ ]*") + if p != None: potentialTitles.add(p) + p = self.getLongestPart(title, "[ ]*[\||,|:\(\)\-][ ]*") + if p != None: potentialTitles.add(p) + return potentialTitles + + def getPotentialTitles(self): + """ generated source for method getPotentialTitles """ + return self.potentialTitles + + def getLongestPart(self, title, pattern): + """ generated source for method getLongestPart """ + parts = re.split(pattern,title) + if len(parts)==1: return None + + longestNumWords = 0 + longestPart = "" + for p in parts: + if ".com" in p: continue + numWords=self.getNumWords(p) + if numWords > longestNumWords or len(p)>len(longestPart): + longestNumWords = numWords + longestPart = p + if len(longestPart)==0: return None + else: return longestPart.strip() + + def getNumWords(self,text): + return len(re.findall("\w+",text,re.UNICODE)) + + def process(self, doc): + """ generated source for method process """ + if self.useDocTitle: self.potentialTitles=self.findPotentialTitles(doc.getTitle()) + if self.potentialTitles == None: return False + changes = False + for tb in doc.getTextBlocks(): + text=tb.getText().strip().lower() + if any(candidate.lower()==text for candidate in self.potentialTitles): + tb.addLabel(DefaultLabels.TITLE) + changes = True + return changes @@ -743,9 +743,9 @@ def process(self, doc): # * @author Christian Kohlschtter # class HeuristicFilterBase(BoilerpipeFilter): - def getNumFullTextWords(self, tb, minTextDensity=9): - if tb.getTextDensity() >= minTextDensity: return tb.getNumWords() - else: return 0 + def getNumFullTextWords(self, tb, minTextDensity=9): + if tb.getTextDensity() >= minTextDensity: return tb.getNumWords() + else: return 0 # # * Keeps only those content blocks which contain at least k full-text words @@ -754,17 +754,17 @@ def getNumFullTextWords(self, tb, minTextDensity=9): # * @author Christian Kohlschtter # class MinFulltextWordsFilter(HeuristicFilterBase): - def __init__(self, minWords=30): - self.minWords = minWords + def __init__(self, minWords=30): + self.minWords = minWords - def process(self, doc): - """ generated source for method process """ - changes = False - for tb in doc.getTextBlocks(): - if tb.isContent() and self.getNumFullTextWords(tb) < self.minWords: - tb.setIsContent(False) - changes = True - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + for tb in doc.getTextBlocks(): + if tb.isContent() and self.getNumFullTextWords(tb) < self.minWords: + tb.setIsContent(False) + changes = True + return changes # @@ -785,21 +785,21 @@ def process(self, doc): # class KeepLargestFulltextBlockFilter(HeuristicFilterBase): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - if len(textBlocks) < 2: return False - contentBlocks=[block for block in textBlocks if block.isContent()] - if len(contentBlocks)==0: return False - largestBlock=max(contentBlocks,key=self.getNumFullTextWords) - - for tb in textBlocks: - if tb == largestBlock: - tb.setIsContent(True) - else: - tb.setIsContent(False) - tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) - return True + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + if len(textBlocks) < 2: return False + contentBlocks=[block for block in textBlocks if block.isContent()] + if len(contentBlocks)==0: return False + largestBlock=max(contentBlocks,key=self.getNumFullTextWords) + + for tb in textBlocks: + if tb == largestBlock: + tb.setIsContent(True) + 
else: + tb.setIsContent(False) + tb.addLabel(DefaultLabels.MIGHT_BE_CONTENT) + return True # # * Marks all blocks as "non-content" that occur after blocks that have been @@ -811,28 +811,28 @@ def process(self, doc): # * @see TerminatingBlocksFinder # class IgnoreBlocksAfterContentFilter(HeuristicFilterBase): - """ generated source for class IgnoreBlocksAfterContentFilter """ - #DEFAULT_INSTANCE = IgnoreBlocksAfterContentFilter(60) - #INSTANCE_200 = IgnoreBlocksAfterContentFilter(200) - - def __init__(self, minNumWords=60): - self.minNumWords = minNumWords - - def process(self, doc): - """ generated source for method process """ - changes = False - numWords = 0 - foundEndOfText = False - for block in doc.getTextBlocks(): - if block.isContent(): - numWords += self.getNumFullTextWords(block) - if block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT) and numWords >= self.minNumWords: - foundEndOfText = True - if foundEndOfText: - changes = True - block.setIsContent(False) - - return changes + """ generated source for class IgnoreBlocksAfterContentFilter """ + #DEFAULT_INSTANCE = IgnoreBlocksAfterContentFilter(60) + #INSTANCE_200 = IgnoreBlocksAfterContentFilter(200) + + def __init__(self, minNumWords=60): + self.minNumWords = minNumWords + + def process(self, doc): + """ generated source for method process """ + changes = False + numWords = 0 + foundEndOfText = False + for block in doc.getTextBlocks(): + if block.isContent(): + numWords += self.getNumFullTextWords(block) + if block.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT) and numWords >= self.minNumWords: + foundEndOfText = True + if foundEndOfText: + changes = True + block.setIsContent(False) + + return changes # # * Marks all blocks as "non-content" that occur after blocks that have been # * marked {@link DefaultLabels#INDICATES_END_OF_TEXT}, and after any content block. 
@@ -843,22 +843,22 @@ def process(self, doc): # class IgnoreBlocksAfterContentFromEndFilter(HeuristicFilterBase): - def process(self, doc): - """ generated source for method process """ - changes = False - words = 0 - blocks = doc.getTextBlocks() - if len(blocks)==0: return False - for tb in blocks[::-1]: - if tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT): - tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT) - tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT) - tb.setIsContent(False) - changes = True - elif tb.isContent(): - words += tb.getNumWords() - if words > 200: break - return changes + def process(self, doc): + """ generated source for method process """ + changes = False + words = 0 + blocks = doc.getTextBlocks() + if len(blocks)==0: return False + for tb in blocks[::-1]: + if tb.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT): + tb.addLabel(DefaultLabels.STRICTLY_NOT_CONTENT) + tb.removeLabel(DefaultLabels.MIGHT_BE_CONTENT) + tb.setIsContent(False) + changes = True + elif tb.isContent(): + words += tb.getNumWords() + if words > 200: break + return changes # @@ -871,46 +871,46 @@ def process(self, doc): # class TerminatingBlocksFinder(BoilerpipeFilter): - # public static long timeSpent = 0; - def process(self, doc): - """ generated source for method process """ - changes = False - - for tb in doc.getTextBlocks(): - if tb.getNumWords() >=15: continue - text=tb.getText().strip() - if len(text)<8: continue - textLC = text.lower() - - startmatches=(" reuters","please rate this","post a comment") - inmatches=("what you think...","add your comment","add comment","reader views","have your say","reader comments","rtta artikeln") - eqmatch="thanks for your comments - this feedback is now closed" - - if textLC.startswith("comments") or self.startsWithNumber(textLC, " comments", " users responded in") or any(textLC.startswith(matchStr) for matchStr in startmatches) or any(matchStr in textLC for matchStr in inmatches) or textLC == eqmatch: - tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT) - changes = True - # timeSpent += System.currentTimeMillis() - t; - return changes - - # - # * Checks whether the given text t starts with a sequence of digits, - # * followed by one of the given strings. - # * - # * @param t - # * The text to examine - # * @param len - # * The length of the text to examine - # * @param str - # * Any strings that may follow the digits. 
- # * @return true if at least one combination matches - # - def startsWithNumber(self, text, *matchStrArr): - """ generated source for method startsWithNumber """ - numberMatch=re.search('\D',text) - if numberMatch==None: pos=len(text) - else: pos=numberMatch.start() - if pos==0: return False - else: return any(text.startswith(matchStr,pos) for matchStr in matchStrArr) + # public static long timeSpent = 0; + def process(self, doc): + """ generated source for method process """ + changes = False + + for tb in doc.getTextBlocks(): + if tb.getNumWords() >=15: continue + text=tb.getText().strip() + if len(text)<8: continue + textLC = text.lower() + + startmatches=(" reuters","please rate this","post a comment") + inmatches=("what you think...","add your comment","add comment","reader views","have your say","reader comments","rtta artikeln") + eqmatch="thanks for your comments - this feedback is now closed" + + if textLC.startswith("comments") or self.startsWithNumber(textLC, " comments", " users responded in") or any(textLC.startswith(matchStr) for matchStr in startmatches) or any(matchStr in textLC for matchStr in inmatches) or textLC == eqmatch: + tb.addLabel(DefaultLabels.INDICATES_END_OF_TEXT) + changes = True + # timeSpent += System.currentTimeMillis() - t; + return changes + + # + # * Checks whether the given text t starts with a sequence of digits, + # * followed by one of the given strings. + # * + # * @param t + # * The text to examine + # * @param len + # * The length of the text to examine + # * @param str + # * Any strings that may follow the digits. + # * @return true if at least one combination matches + # + def startsWithNumber(self, text, *matchStrArr): + """ generated source for method startsWithNumber """ + numberMatch=re.search('\D',text) + if numberMatch==None: pos=len(text) + else: pos=numberMatch.start() + if pos==0: return False + else: return any(text.startswith(matchStr,pos) for matchStr in matchStrArr) # @@ -923,46 +923,46 @@ def startsWithNumber(self, text, *matchStrArr): # class NumWordsRulesClassifier(BoilerpipeFilter): - def process(self, doc): - """ generated source for method process """ - textBlocks = doc.getTextBlocks() - hasChanges = False - - n=len(textBlocks) - for i,currentBlock in enumerate(textBlocks): - if i>0: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+10: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+10: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+10: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+1krdwrd 0: prevBlock=textBlocks[i-1] - else: prevBlock=document.TextBlock.EMPTY_START - if i+1 0 and next.getNumWords() > 11 - cond2=curr.getNumWords() > 19 - cond3=next.getNumWords() > 6 and next.getLinkDensity() == 0 and prev.getLinkDensity() == 0 and (curr.getNumWords() > 6 or prev.getNumWords() > 7 or next.getNumWords() > 19) - isContent = cond1 or cond2 or cond3 - return curr.setIsContent(isContent) + def process(self, doc): + """ generated source for method process """ + textBlocks = doc.getTextBlocks() + hasChanges = False + + n=len(textBlocks) + for i,currentBlock in enumerate(textBlocks): + if i>0: prevBlock=textBlocks[i-1] + else: prevBlock=document.TextBlock.EMPTY_START + if i+1 0 and next.getNumWords() > 11 + cond2=curr.getNumWords() > 19 + cond3=next.getNumWords() > 6 and next.getLinkDensity() == 0 and prev.getLinkDensity() == 0 and (curr.getNumWords() > 6 or prev.getNumWords() > 7 or 
next.getNumWords() > 19) + isContent = cond1 or cond2 or cond3 + return curr.setIsContent(isContent) diff --git a/boilerpy/parser.py b/boilerpy/parser.py index 5e90c43..fc0a835 100644 --- a/boilerpy/parser.py +++ b/boilerpy/parser.py @@ -8,7 +8,7 @@ # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * -# * http://www.apache.org/licenses/LICENSE-2.0 +# * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, @@ -31,25 +31,25 @@ class TagAction(object): - def start(self, contentHandler, tagName, attrs): return False - def end(self, contentHandler, tagName): return False - def changesTagLevel(self): return False + def start(self, contentHandler, tagName, attrs): return False + def end(self, contentHandler, tagName): return False + def changesTagLevel(self): return False # # * Marks this tag as "ignorable", i.e. all its inner content is silently skipped. # class IgnorableElementTagAction(TagAction): - """ generated source for class TA_IGNORABLE_ELEMENT """ - def start(self, contentHandler, tagName, attrs): - contentHandler.inIgnorableElement += 1 - return True + """ generated source for class TA_IGNORABLE_ELEMENT """ + def start(self, contentHandler, tagName, attrs): + contentHandler.inIgnorableElement += 1 + return True - def end(self, contentHandler, tagName): - contentHandler.inIgnorableElement -= 1 - return True + def end(self, contentHandler, tagName): + contentHandler.inIgnorableElement -= 1 + return True - def changesTagLevel(self): - return True + def changesTagLevel(self): + return True # # * Marks this tag as "anchor" (this should usually only be set for the <A> tag). @@ -59,286 +59,286 @@ def changesTagLevel(self): # * If boilerpipe encounters such nestings, a SAXException is thrown. # class AnchorTextTagAction(TagAction): - """ generated source for class TA_ANCHOR_TEXT """ - def start(self, contentHandler, tagName, attrs): - contentHandler.inAnchor += 1 - if contentHandler.inAnchor > 1: - # as nested A elements are not allowed per specification, we - # are probably reaching this branch due to a bug in the XML - # parser - print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...") - self.end(contentHandler, tagName) - if contentHandler.inIgnorableElement == 0: - contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START) - return False - - def end(self, contentHandler, tagName): - contentHandler.inAnchor -= 1 - if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0: - contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END) - return False - - def changesTagLevel(self): - return True + """ generated source for class TA_ANCHOR_TEXT """ + def start(self, contentHandler, tagName, attrs): + contentHandler.inAnchor += 1 + if contentHandler.inAnchor > 1: + # as nested A elements are not allowed per specification, we + # are probably reaching this branch due to a bug in the XML + # parser + print("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. 
Trying to recover somehow...") + self.end(contentHandler, tagName) + if contentHandler.inIgnorableElement == 0: + contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_START) + return False + + def end(self, contentHandler, tagName): + contentHandler.inAnchor -= 1 + if contentHandler.inAnchor == 0 and contentHandler.inIgnorableElement == 0: + contentHandler.addToken(SpecialTokens.ANCHOR_TEXT_END) + return False + + def changesTagLevel(self): + return True # # * Marks this tag the body element (this should usually only be set for the <BODY> tag). # class BodyTagAction(TagAction): - """ generated source for class TA_BODY """ - def start(self, contentHandler, tagName, attrs): - contentHandler.flushBlock() - contentHandler.inBody += 1 - return False + """ generated source for class TA_BODY """ + def start(self, contentHandler, tagName, attrs): + contentHandler.flushBlock() + contentHandler.inBody += 1 + return False - def end(self, contentHandler, tagName): - contentHandler.flushBlock() - contentHandler.inBody -= 1 - return False + def end(self, contentHandler, tagName): + contentHandler.flushBlock() + contentHandler.inBody -= 1 + return False - def changesTagLevel(self): - return True + def changesTagLevel(self): + return True # # * Marks this tag a simple "inline" element, which generates whitespace, but no new block. # class InlineWhitespaceTagAction(TagAction): - """ generated source for class TA_INLINE_WHITESPACE """ - def start(self, contentHandler, tagName, attrs): - contentHandler.addWhitespaceIfNecessary() - return False + """ generated source for class TA_INLINE_WHITESPACE """ + def start(self, contentHandler, tagName, attrs): + contentHandler.addWhitespaceIfNecessary() + return False - def end(self, contentHandler, tagName): - contentHandler.addWhitespaceIfNecessary() - return False + def end(self, contentHandler, tagName): + contentHandler.addWhitespaceIfNecessary() + return False - def changesTagLevel(self): return False + def changesTagLevel(self): return False # # * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block. # class InlineTagAction(TagAction): - """ generated source for class TA_INLINE_NO_WHITESPACE """ - def start(self, contentHandler, tagName, attrs): return False - def end(self, contentHandler, tagName): return False - def changesTagLevel(self): return False + """ generated source for class TA_INLINE_NO_WHITESPACE """ + def start(self, contentHandler, tagName, attrs): return False + def end(self, contentHandler, tagName): return False + def changesTagLevel(self): return False # # * Explicitly marks this tag a simple "block-level" element, which always generates whitespace # class BlockTagAction(TagAction): - """ generated source for class TA_BLOCK_LEVEL """ - def start(self, contentHandler, tagName, attrs): return True - def end(self, contentHandler, tagName): return True - def changesTagLevel(self): return True + """ generated source for class TA_BLOCK_LEVEL """ + def start(self, contentHandler, tagName, attrs): return True + def end(self, contentHandler, tagName): return True + def changesTagLevel(self): return True # # * Special TagAction for the <FONT> tag, which keeps track of the # * absolute and relative font size. 
# class FontTagAction(TagAction): - """ generated source for class TA_FONT """ - #WARNING: POSSIBLE BUG -- used to be [0-9] without + - PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)") - - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - sizeAttr = attrs.getValue("size") - size=None - if sizeAttr != None: - match = self.PAT_FONT_SIZE.match(sizeAttr) - if match!=None: - rel=match.group(0) - val=match.group(1) - if len(rel)==0: - # absolute - size = val - else: - # relative - #last non-none element from stack, default 3 - lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None) - prevSize=next(lastNonNone,3) - if rel[0] == '+': size = prevSize + val - else: size = prevSize - val - contentHandler.fontSizeStack.append(size) - return False - - def end(self, contentHandler, tagName): - contentHandler.fontSizeStack.pop() - return False - - def changesTagLevel(self): return False + """ generated source for class TA_FONT """ + #WARNING: POSSIBLE BUG -- used to be [0-9] without + + PAT_FONT_SIZE = re.compile("([\+\-]?)([0-9]+)") + + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + sizeAttr = attrs.getValue("size") + size=None + if sizeAttr != None: + match = self.PAT_FONT_SIZE.match(sizeAttr) + if match!=None: + rel=match.group(0) + val=match.group(1) + if len(rel)==0: + # absolute + size = val + else: + # relative + #last non-none element from stack, default 3 + lastNonNone=(s for s in contentHandler.fontSizeStack[::-1] if s!=None) + prevSize=next(lastNonNone,3) + if rel[0] == '+': size = prevSize + val + else: size = prevSize - val + contentHandler.fontSizeStack.append(size) + return False + + def end(self, contentHandler, tagName): + contentHandler.fontSizeStack.pop() + return False + + def changesTagLevel(self): return False # # * {@link CommonTagActions} for inline elements, which triggers some {@link LabelAction} on the generated # * {@link TextBlock}. # class InlineTagLabelAction(TagAction): - """ generated source for class InlineTagLabelAction """ + """ generated source for class InlineTagLabelAction """ - def __init__(self, action): - """ generated source for method __init__ """ - super(InlineTagLabelAction, self).__init__() - self.action = action + def __init__(self, action): + """ generated source for method __init__ """ + super(InlineTagLabelAction, self).__init__() + self.action = action - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - contentHandler.addWhitespaceIfNecessary() - contentHandler.addLabelAction(self.action) - return False + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + contentHandler.addWhitespaceIfNecessary() + contentHandler.addLabelAction(self.action) + return False - def end(self, contentHandler, tagName): - """ generated source for method end """ - contentHandler.addWhitespaceIfNecessary() - return False + def end(self, contentHandler, tagName): + """ generated source for method end """ + contentHandler.addWhitespaceIfNecessary() + return False - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return False + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return False # # * {@link CommonTagActions} for block-level elements, which triggers some {@link LabelAction} on the generated # * {@link TextBlock}. 
# class BlockTagLabelAction(TagAction): - """ generated source for class BlockTagLabelAction """ + """ generated source for class BlockTagLabelAction """ - def __init__(self, action): - """ generated source for method __init__ """ - super(BlockTagLabelAction, self).__init__() - self.action = action + def __init__(self, action): + """ generated source for method __init__ """ + super(BlockTagLabelAction, self).__init__() + self.action = action - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - contentHandler.addLabelAction(self.action) - return True + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + contentHandler.addLabelAction(self.action) + return True - def end(self, contentHandler, tagName): - """ generated source for method end """ - return True + def end(self, contentHandler, tagName): + """ generated source for method end """ + return True - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return True + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return True class Chained(TagAction): - def __init__(self, tagAction1, tagAction2): - """ generated source for method __init__ """ - super(Chained, self).__init__() - self.tagAction1 = tagAction1 - self.tagAction2 = tagAction2 + def __init__(self, tagAction1, tagAction2): + """ generated source for method __init__ """ + super(Chained, self).__init__() + self.tagAction1 = tagAction1 + self.tagAction2 = tagAction2 - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs) + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + return self.tagAction1.start(contentHandler, tagName, attrs) | self.tagAction2.start(contentHandler, tagName, attrs) - def end(self, contentHandler, tagName): - """ generated source for method end """ - return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName) + def end(self, contentHandler, tagName): + """ generated source for method end """ + return self.tagAction1.end(contentHandler, tagName) | self.tagAction2.end(contentHandler, tagName) - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel() + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return self.tagAction1.changesTagLevel() or self.tagAction2.changesTagLevel() class MarkupTagAction(TagAction): - """ generated source for class MarkupTagAction """ - - def __init__(self, isBlockLevel): - """ generated source for method __init__ """ - super(MarkupTagAction, self).__init__() - self.isBlockLevel = isBlockLevel - self.labelStack = [] - - PAT_NUM = re.compile("[0-9]+") - - def start(self, contentHandler, tagName, attrs): - """ generated source for method start """ - labels = [] - labels.append(DefaultLabels.MARKUP_PREFIX + tagName) - classVal = attrs.getValue("class") - if classVal != None and len(classVal)>0: - classVal = self.PAT_NUM.sub("#",classVal).strip() - vals = classVal.split(r"[ ]+") - labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')) - if len(vals)>1: - for s in vals: - labels.append(DefaultLabels.MARKUP_PREFIX + "." 
+ s) - id = attrs.get("id") - if id != None and len(id)<0: - id = self.PAT_NUM.sub("#",id) - labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id) - ancestors = self.getAncestorLabels() - labelsWithAncestors = [] - for l in labels: - for an in ancestors: - labelsWithAncestors.append(an) - labelsWithAncestors.append(an + " " + l) - labelsWithAncestors.append(l) - contentHandler.addLabelAction(LabelAction(labelsWithAncestors)) - self.labelStack.append(labels) - return self.isBlockLevel - - def end(self, contentHandler, tagName): - """ generated source for method end """ - self.labelStack.pop() - return self.isBlockLevel - - def changesTagLevel(self): - """ generated source for method changesTagLevel """ - return self.isBlockLevel - - def getAncestorLabels(self): - """ generated source for method getAncestorLabels """ - labelSet = set() - for labels in self.labelStack: - if labels == None:continue - labelSet.update(labels) - return labelSet + """ generated source for class MarkupTagAction """ + + def __init__(self, isBlockLevel): + """ generated source for method __init__ """ + super(MarkupTagAction, self).__init__() + self.isBlockLevel = isBlockLevel + self.labelStack = [] + + PAT_NUM = re.compile("[0-9]+") + + def start(self, contentHandler, tagName, attrs): + """ generated source for method start """ + labels = [] + labels.append(DefaultLabels.MARKUP_PREFIX + tagName) + classVal = attrs.getValue("class") + if classVal != None and len(classVal)>0: + classVal = self.PAT_NUM.sub("#",classVal).strip() + vals = classVal.split(r"[ ]+") + labels.append(DefaultLabels.MARKUP_PREFIX + "." + classVal.replace(' ', '.')) + if len(vals)>1: + for s in vals: + labels.append(DefaultLabels.MARKUP_PREFIX + "." + s) + id = attrs.get("id") + if id != None and len(id)<0: + id = self.PAT_NUM.sub("#",id) + labels.append(DefaultLabels.MARKUP_PREFIX + "#" + id) + ancestors = self.getAncestorLabels() + labelsWithAncestors = [] + for l in labels: + for an in ancestors: + labelsWithAncestors.append(an) + labelsWithAncestors.append(an + " " + l) + labelsWithAncestors.append(l) + contentHandler.addLabelAction(LabelAction(labelsWithAncestors)) + self.labelStack.append(labels) + return self.isBlockLevel + + def end(self, contentHandler, tagName): + """ generated source for method end """ + self.labelStack.pop() + return self.isBlockLevel + + def changesTagLevel(self): + """ generated source for method changesTagLevel """ + return self.isBlockLevel + + def getAncestorLabels(self): + """ generated source for method getAncestorLabels """ + labelSet = set() + for labels in self.labelStack: + if labels == None:continue + labelSet.update(labels) + return labelSet class CommonTagActions(object): - TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() - TA_ANCHOR_TEXT=AnchorTextTagAction() - TA_BODY=BodyTagAction() - TA_INLINE_WHITESPACE=InlineWhitespaceTagAction() - TA_INLINE_NO_WHITESPACE=InlineTagAction() - TA_BLOCK_LEVEL=BlockTagAction() - TA_FONT=FontTagAction() + TA_IGNORABLE_ELEMENT=IgnorableElementTagAction() + TA_ANCHOR_TEXT=AnchorTextTagAction() + TA_BODY=BodyTagAction() + TA_INLINE_WHITESPACE=InlineWhitespaceTagAction() + TA_INLINE_NO_WHITESPACE=InlineTagAction() + TA_BLOCK_LEVEL=BlockTagAction() + TA_FONT=FontTagAction() defaultTagActionMap={ - "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "APPLET" : 
CommonTagActions.TA_IGNORABLE_ELEMENT, - #Note: link removed because it can be self-closing in HTML5 - #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT, - "A" : CommonTagActions.TA_ANCHOR_TEXT, - "BODY" : CommonTagActions.TA_BODY, - "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # New in 1.1 (especially to improve extraction quality from Wikipedia etc., - "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # New in 1.2 - "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE, - "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE, - "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, - # could also use TA_FONT - # added in 1.1.1 - "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT + "STYLE" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "SCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "OPTION" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "OBJECT" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "EMBED" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "APPLET" : CommonTagActions.TA_IGNORABLE_ELEMENT, + #Note: link removed because it can be self-closing in HTML5 + #"LINK" : CommonTagActions.TA_IGNORABLE_ELEMENT, + "A" : CommonTagActions.TA_ANCHOR_TEXT, + "BODY" : CommonTagActions.TA_BODY, + "STRIKE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "U" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "B" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "I" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "EM" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "STRONG" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "SPAN" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # New in 1.1 (especially to improve extraction quality from Wikipedia etc., + "SUP" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # New in 1.2 + "CODE" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "TT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "SUB" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "VAR" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + "ABBR" : CommonTagActions.TA_INLINE_WHITESPACE, + "ACRONYM" : CommonTagActions.TA_INLINE_WHITESPACE, + "FONT" : CommonTagActions.TA_INLINE_NO_WHITESPACE, + # could also use TA_FONT + # added in 1.1.1 + "NOSCRIPT" : CommonTagActions.TA_IGNORABLE_ELEMENT } @@ -353,30 +353,30 @@ class CommonTagActions(object): # * @author Christian Kohlschtter # class LabelAction(object): - def __init__(self, *labels): - self.labels = labels + def __init__(self, *labels): + self.labels = labels - def addTo(self, textBlock): - self.addLabelsTo(textBlock) + def addTo(self, textBlock): + self.addLabelsTo(textBlock) - def addLabelsTo(self, textBlock): - textBlock.addLabels(self.labels) + def addLabelsTo(self, textBlock): + textBlock.addLabels(self.labels) - def __str__(self): - return str(self.labels) + def __str__(self): + return str(self.labels) class ConditionalLabelAction(LabelAction): - def __init__(self, condition, *labels): - super(ConditionalLabelAction, self).__init__(*labels) - self.condition = condition + def __init__(self, condition, *labels): + super(ConditionalLabelAction, self).__init__(*labels) + self.condition = condition 
- def addTo(self, textBlock): - if self.condition(textBlock): self.addLabelsTo(textBlock) + def addTo(self, textBlock): + if self.condition(textBlock): self.addLabelsTo(textBlock) class SpecialTokens(object): - ANCHOR_TEXT_START = u'\ue00astart' - ANCHOR_TEXT_END = u'\ue00aend' + ANCHOR_TEXT_START = u'\ue00astart' + ANCHOR_TEXT_END = u'\ue00aend' #---------------------------------------------------------------------------- @@ -392,296 +392,296 @@ class SpecialTokens(object): class BoilerpipeBaseParser(object): - EVENT_START_TAG=0 - EVENT_END_TAG=1 - EVENT_CHARACTERS=2 - EVENT_WHITESPACE=3 - #all word characters except underscore -- i.e. not (not word or underscore) - PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE) - PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE) - - """ generated source for class BoilerpipeHTMLContentHandler """ - # - # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given - # * {@link TagActionMap}. - # * - # * @param tagActions - # * The {@link TagActionMap} to use, e.g. - # * {@link DefaultTagActionMap}. - # - def __init__(self, tagActions=None): - """ generated source for method __init___0 """ - #super(BoilerpipeHTMLContentHandler, self).__init__() - if tagActions==None: self.tagActions=defaultTagActionMap - else: self.tagActions = tagActions - - - self.clearTextBuffer() - self.inBody = 0 - self.inAnchor = 0 - self.inIgnorableElement = 0 - self.textElementIdx = 0 - self.lastStartTag = None - self.lastEndTag = None - self.lastEvent = None - self.offsetBlocks = 0 - self.currentContainedTextElements=set() - self.flush = False - self.inAnchorText = False - - self.title = None - self.tagLevel = 0 - self.blockTagLevel = -1 - self.textBlocks = [] - self.labelStacks = [] - self.fontSizeStack = [] - - # - # * Recycles this instance. - # - def recycle(self): - """ generated source for method recycle """ - self.clearTextBuffer() - self.inBody = 0 - self.inAnchor = 0 - self.inIgnorableElement = 0 - self.textElementIdx = 0 - self.lastStartTag = None - self.lastEndTag = None - self.lastEvent = None - self.offsetBlocks = 0 - self.currentContainedTextElements=set() - self.flush = False - self.inAnchorText = False - self.textBlocks=[] - - #--------- added ------- - self.title = None - self.tagLevel = 0 - self.blockTagLevel = -1 - self.labelStacks = [] - self.fontSizeStack = [] + EVENT_START_TAG=0 + EVENT_END_TAG=1 + EVENT_CHARACTERS=2 + EVENT_WHITESPACE=3 + #all word characters except underscore -- i.e. not (not word or underscore) + PAT_VALID_WORD_CHARACTER = re.compile(r"[^\W_]", re.UNICODE) + PAT_WORD = re.compile(r"\ue00a?[\w\"'\.,\!\@\-\:\;\$\?\(\)/]+", re.UNICODE) + + """ generated source for class BoilerpipeHTMLContentHandler """ + # + # * Constructs a {@link BoilerpipeHTMLContentHandler} using the given + # * {@link TagActionMap}. + # * + # * @param tagActions + # * The {@link TagActionMap} to use, e.g. + # * {@link DefaultTagActionMap}. 
+ # + def __init__(self, tagActions=None): + """ generated source for method __init___0 """ + #super(BoilerpipeHTMLContentHandler, self).__init__() + if tagActions==None: self.tagActions=defaultTagActionMap + else: self.tagActions = tagActions + + + self.clearTextBuffer() + self.inBody = 0 + self.inAnchor = 0 + self.inIgnorableElement = 0 + self.textElementIdx = 0 + self.lastStartTag = None + self.lastEndTag = None + self.lastEvent = None + self.offsetBlocks = 0 + self.currentContainedTextElements=set() + self.flush = False + self.inAnchorText = False + + self.title = None + self.tagLevel = 0 + self.blockTagLevel = -1 + self.textBlocks = [] + self.labelStacks = [] + self.fontSizeStack = [] + + # + # * Recycles this instance. + # + def recycle(self): + """ generated source for method recycle """ + self.clearTextBuffer() + self.inBody = 0 + self.inAnchor = 0 + self.inIgnorableElement = 0 + self.textElementIdx = 0 + self.lastStartTag = None + self.lastEndTag = None + self.lastEvent = None + self.offsetBlocks = 0 + self.currentContainedTextElements=set() + self.flush = False + self.inAnchorText = False + self.textBlocks=[] + + #--------- added ------- + self.title = None + self.tagLevel = 0 + self.blockTagLevel = -1 + self.labelStacks = [] + self.fontSizeStack = [] #------------------------------- SAX Parser methods ---------------------------------------- - # @Override - def endDocument(self): - """ generated source for method endDocument """ - self.flushBlock() - - # @Override - def startDocument(self): pass - - # @Override - def startElement(self, name,attrs): - self.labelStacks.append([]) - - tagAction = self.tagActions.get(name.strip().upper()) - - if tagAction != None: - self.flush |= tagAction.start(self, name, attrs) - if tagAction.changesTagLevel(): self.tagLevel += 1 - else: - self.tagLevel += 1 - self.flush = True - self.lastEvent = self.EVENT_START_TAG - self.lastStartTag = name - - # @Override - def endElement(self, name): - tagAction = self.tagActions.get(name.strip().upper()) - - - if tagAction != None: - self.flush |= tagAction.end(self, name) - if tagAction.changesTagLevel(): self.tagLevel -= 1 - else: - self.flush = True - self.tagLevel -= 1 - - if self.flush: self.flushBlock() - self.lastEvent = self.EVENT_END_TAG - self.lastEndTag = name - self.labelStacks.pop() - - # @Override - def characters(self, content): - self.textElementIdx += 1 - if self.flush: - self.flushBlock() - self.flush = False - if self.inIgnorableElement != 0: return - - if len(content) == 0: return - - strippedContent=content.strip() - - if len(strippedContent) == 0: - self.addWhitespaceIfNecessary() - self.lastEvent = self.EVENT_WHITESPACE - return - - startWhitespace=content[0].isspace() - if startWhitespace: self.addWhitespaceIfNecessary() - - if self.blockTagLevel == -1: - self.blockTagLevel = self.tagLevel - self.textBuffer+=strippedContent - self.tokenBuffer+=strippedContent - - endWhitespace=content[-1].isspace() - if endWhitespace: self.addWhitespaceIfNecessary() - - self.lastEvent = self.EVENT_CHARACTERS - self.currentContainedTextElements.add(self.textElementIdx) - - # @Override - def ignorableWhitespace(self, whitespace): - self.addWhitespaceIfNecessary() + # @Override + def endDocument(self): + """ generated source for method endDocument """ + self.flushBlock() + + # @Override + def startDocument(self): pass + + # @Override + def startElement(self, name,attrs): + self.labelStacks.append([]) + + tagAction = self.tagActions.get(name.strip().upper()) + + if tagAction != None: + self.flush |= 
tagAction.start(self, name, attrs) + if tagAction.changesTagLevel(): self.tagLevel += 1 + else: + self.tagLevel += 1 + self.flush = True + self.lastEvent = self.EVENT_START_TAG + self.lastStartTag = name + + # @Override + def endElement(self, name): + tagAction = self.tagActions.get(name.strip().upper()) + + + if tagAction != None: + self.flush |= tagAction.end(self, name) + if tagAction.changesTagLevel(): self.tagLevel -= 1 + else: + self.flush = True + self.tagLevel -= 1 + + if self.flush: self.flushBlock() + self.lastEvent = self.EVENT_END_TAG + self.lastEndTag = name + self.labelStacks.pop() + + # @Override + def characters(self, content): + self.textElementIdx += 1 + if self.flush: + self.flushBlock() + self.flush = False + if self.inIgnorableElement != 0: return + + if len(content) == 0: return + + strippedContent=content.strip() + + if len(strippedContent) == 0: + self.addWhitespaceIfNecessary() + self.lastEvent = self.EVENT_WHITESPACE + return + + startWhitespace=content[0].isspace() + if startWhitespace: self.addWhitespaceIfNecessary() + + if self.blockTagLevel == -1: + self.blockTagLevel = self.tagLevel + self.textBuffer+=strippedContent + self.tokenBuffer+=strippedContent + + endWhitespace=content[-1].isspace() + if endWhitespace: self.addWhitespaceIfNecessary() + + self.lastEvent = self.EVENT_CHARACTERS + self.currentContainedTextElements.add(self.textElementIdx) + + # @Override + def ignorableWhitespace(self, whitespace): + self.addWhitespaceIfNecessary() #------------------------------- utility methods ---------------------------------------- - def flushBlock(self): - """ generated source for method flushBlock """ - if self.inBody == 0: - if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip()) - self.clearTextBuffer() - return - if len(self.tokenBuffer.strip())==0: - self.clearTextBuffer() - return - - tokens = self.tokenize(self.tokenBuffer) - numWords = 0 - numLinkedWords = 0 - numWrappedLines = 0 - currentLineLength = -1 - # don't count the first space - maxLineLength = 80 - numTokens = 0 - numWordsCurrentLine = 0 - - for token in tokens: - if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True - elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False - elif self.isWord(token): - numTokens += 1 - numWords += 1 - numWordsCurrentLine += 1 - if self.inAnchorText: - numLinkedWords += 1 - currentLineLength += len(token) + 1 - if currentLineLength > maxLineLength: - numWrappedLines += 1 - currentLineLength = len(token) - numWordsCurrentLine = 1 - else: - numTokens += 1 - - #if only special tokens (numTokens excludes special tokens) - if numTokens == 0: - self.clearTextBuffer() - return - - if numWrappedLines == 0: - numWordsInWrappedLines = numWords - numWrappedLines = 1 - else: - numWordsInWrappedLines = numWords - numWordsCurrentLine - - tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks) - self.currentContainedTextElements = set() - self.offsetBlocks += 1 - self.clearTextBuffer() - tb.setTagLevel(self.blockTagLevel) - self.addTextBlock(tb) - self.blockTagLevel = -1 - - def addTextBlock(self, tb): - """ generated source for method addTextBlock """ - for fontSize in self.fontSizeStack[::-1]: - if fontSize != None: - tb.addLabel("font-" + str(fontSize)) - break - for labelStack in self.labelStacks: - for labels in labelStack: - labels.addTo(tb) - self.textBlocks.append(tb) - - - def isWord(self, token): - """ 
generated source for method isWord """ - return self.PAT_VALID_WORD_CHARACTER.search(token)!=None - - def tokenize(self,text): - return self.PAT_WORD.findall(text) - - def getTextBlocks(self): - """ generated source for method getTextBlocks """ - return self.textBlocks - - def getTitle(self): - """ generated source for method getTitle """ - return self.title - - def setTitle(self, s): - """ generated source for method setTitle """ - if s == None or len(s)==0: return - self.title = s - - # - # * Returns a {@link TextDocument} containing the extracted {@link TextBlock} - # * s. NOTE: Only call this after parsing. - # * - # * @return The {@link TextDocument} - # - def toTextDocument(self): - """ generated source for method toTextDocument """ - # just to be sure - self.flushBlock() - return document.TextDocument(self.getTextBlocks(), self.getTitle()) - - def addWhitespaceIfNecessary(self): - """ generated source for method addWhitespaceIfNecessary """ - if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace(): - self.textBuffer+=' ' - if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace(): - self.tokenBuffer+=' ' - - def clearTextBuffer(self): - self.textBuffer='' - self.tokenBuffer='' - - def addToken(self,token): - self.addWhitespaceIfNecessary() - self.tokenBuffer+=token - self.addWhitespaceIfNecessary() - - def addLabelAction(self, la): - """ generated source for method addLabelAction """ - if len(self.labelStacks)==0: self.labelStacks.append([]) - self.labelStacks[-1].append(la) + def flushBlock(self): + """ generated source for method flushBlock """ + if self.inBody == 0: + if self.lastStartTag.lower()=="title": self.setTitle(self.textBuffer.strip()) + self.clearTextBuffer() + return + if len(self.tokenBuffer.strip())==0: + self.clearTextBuffer() + return + + tokens = self.tokenize(self.tokenBuffer) + numWords = 0 + numLinkedWords = 0 + numWrappedLines = 0 + currentLineLength = -1 + # don't count the first space + maxLineLength = 80 + numTokens = 0 + numWordsCurrentLine = 0 + + for token in tokens: + if token==SpecialTokens.ANCHOR_TEXT_START: self.inAnchorText = True + elif token==SpecialTokens.ANCHOR_TEXT_END: self.inAnchorText = False + elif self.isWord(token): + numTokens += 1 + numWords += 1 + numWordsCurrentLine += 1 + if self.inAnchorText: + numLinkedWords += 1 + currentLineLength += len(token) + 1 + if currentLineLength > maxLineLength: + numWrappedLines += 1 + currentLineLength = len(token) + numWordsCurrentLine = 1 + else: + numTokens += 1 + + #if only special tokens (numTokens excludes special tokens) + if numTokens == 0: + self.clearTextBuffer() + return + + if numWrappedLines == 0: + numWordsInWrappedLines = numWords + numWrappedLines = 1 + else: + numWordsInWrappedLines = numWords - numWordsCurrentLine + + tb = document.TextBlock(self.textBuffer.strip(), self.currentContainedTextElements, numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, self.offsetBlocks) + self.currentContainedTextElements = set() + self.offsetBlocks += 1 + self.clearTextBuffer() + tb.setTagLevel(self.blockTagLevel) + self.addTextBlock(tb) + self.blockTagLevel = -1 + + def addTextBlock(self, tb): + """ generated source for method addTextBlock """ + for fontSize in self.fontSizeStack[::-1]: + if fontSize != None: + tb.addLabel("font-" + str(fontSize)) + break + for labelStack in self.labelStacks: + for labels in labelStack: + labels.addTo(tb) + self.textBlocks.append(tb) + + + def isWord(self, token): + """ generated source for method isWord """ + return 
self.PAT_VALID_WORD_CHARACTER.search(token)!=None + + def tokenize(self,text): + return self.PAT_WORD.findall(text) + + def getTextBlocks(self): + """ generated source for method getTextBlocks """ + return self.textBlocks + + def getTitle(self): + """ generated source for method getTitle """ + return self.title + + def setTitle(self, s): + """ generated source for method setTitle """ + if s == None or len(s)==0: return + self.title = s + + # + # * Returns a {@link TextDocument} containing the extracted {@link TextBlock} + # * s. NOTE: Only call this after parsing. + # * + # * @return The {@link TextDocument} + # + def toTextDocument(self): + """ generated source for method toTextDocument """ + # just to be sure + self.flushBlock() + return document.TextDocument(self.getTextBlocks(), self.getTitle()) + + def addWhitespaceIfNecessary(self): + """ generated source for method addWhitespaceIfNecessary """ + if len(self.textBuffer)==0 or not self.textBuffer[-1].isspace(): + self.textBuffer+=' ' + if len(self.tokenBuffer)==0 or not self.tokenBuffer[-1].isspace(): + self.tokenBuffer+=' ' + + def clearTextBuffer(self): + self.textBuffer='' + self.tokenBuffer='' + + def addToken(self,token): + self.addWhitespaceIfNecessary() + self.tokenBuffer+=token + self.addWhitespaceIfNecessary() + + def addLabelAction(self, la): + """ generated source for method addLabelAction """ + if len(self.labelStacks)==0: self.labelStacks.append([]) + self.labelStacks[-1].append(la) class BoilerpipeHTMLParser(HTMLParser,BoilerpipeBaseParser): - def __init__(self): - HTMLParser.__init__(self) - BoilerpipeBaseParser.__init__(self) - - def feed(self,data): - self.startDocument() - HTMLParser.feed(self,data) - self.endDocument() - - def handle_starttag(self, tag, attrs): self.startElement(tag,attrs) - def handle_endtag(self, tag): self.endElement(tag) - def handle_data(self, data): self.characters(data) + def __init__(self): + HTMLParser.__init__(self) + BoilerpipeBaseParser.__init__(self) + + def feed(self,data): + self.startDocument() + HTMLParser.feed(self,data) + self.endDocument() + + def handle_starttag(self, tag, attrs): self.startElement(tag,attrs) + def handle_endtag(self, tag): self.endElement(tag) + def handle_data(self, data): self.characters(data) class BoilerpipeSAXContentHandler(ContentHandler,BoilerpipeBaseParser): - def __init__(self): - ContentHandler.__init__(self) - BoilerpipeBaseParser.__init__(self) + def __init__(self): + ContentHandler.__init__(self) + BoilerpipeBaseParser.__init__(self) diff --git a/setup.py b/setup.py index 6088060..aebbf74 100644 --- a/setup.py +++ b/setup.py @@ -6,22 +6,22 @@ # README file and 2) it's easier to type in the README file than to put a raw # string in below ... 
def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() + return open(os.path.join(os.path.dirname(__file__), fname)).read() setup( - name = "boilerpy", - version = "1.0", - author = "Sam Myer", - author_email = "mail@frozencavemanmedia.com", - description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages", - license = "Apache 2.0", - keywords = "boilerpipe fulltext extraction", - url = "https://github.com/sammyer/BoilerPy", - packages=['boilerpy'], - long_description=read('README.txt'), - classifiers=[ - "Development Status :: 4 - Beta", - "Topic :: Utilities", - "License :: OSI Approved :: Apache License", - ] -) \ No newline at end of file + name = "boilerpy", + version = "1.0", + author = "Sam Myer", + author_email = "mail@frozencavemanmedia.com", + description = "Python port of Boilerpipe, Boilerplate Removal and Fulltext Extraction from HTML pages", + license = "Apache 2.0", + keywords = "boilerpipe fulltext extraction", + url = "https://github.com/sammyer/BoilerPy", + packages=['boilerpy'], + long_description=read('README.txt'), + classifiers=[ + "Development Status :: 4 - Beta", + "Topic :: Utilities", + "License :: OSI Approved :: Apache License", + ] +) diff --git a/tests/unittests.py b/tests/unittests.py index 96e367b..f12ab6e 100644 --- a/tests/unittests.py +++ b/tests/unittests.py @@ -7,428 +7,428 @@ from boilerpy.extractors import Extractor def runTests(): - suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters) - unittest.TextTestRunner(verbosity=2).run(suite) - suite = unittest.TestLoader().loadTestsFromTestCase(TestParser) - unittest.TextTestRunner(verbosity=2).run(suite) + suite = unittest.TestLoader().loadTestsFromTestCase(TestFilters) + unittest.TextTestRunner(verbosity=2).run(suite) + suite = unittest.TestLoader().loadTestsFromTestCase(TestParser) + unittest.TextTestRunner(verbosity=2).run(suite) def runOneTest(): - testName='test_anchor' - suite = unittest.TestSuite() - suite.addTest(TestParser(testName)) - unittest.TextTestRunner(verbosity=2).run(suite) + testName='test_anchor' + suite = unittest.TestSuite() + suite.addTest(TestParser(testName)) + unittest.TextTestRunner(verbosity=2).run(suite) class TestFilters(unittest.TestCase): - defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. 
".split(' ') - - def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None): - textBlocks=[] - for idx,words in enumerate(wordsArr): - if type(words)==int: - numWords=words - text=' '.join(self.defaultWords[:numWords]) - else: - text=words - numWords=text.count(' ') - try: - numAnchorWords=numAnchorWordsArr[idx] - except (TypeError, IndexError): - numAnchorWords=0 - block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) - try: - block.setIsContent(isContentArr[idx]) - except (TypeError, IndexError): - pass - try: - label=labelArr[idx] - if label==None: pass - elif type(label)==list: - for l in label: block.addLabel(l) - else: block.addLabel(label) - except (TypeError, IndexError): - pass - - textBlocks.append(block) - - return TextDocument(textBlocks) - - def verifyContent(self,filtr,doc,contentArr,show=False): - isContentBefore=[block.isContent() for block in doc.getTextBlocks()] - isChanged=filtr.process(doc) - isContent=[block.isContent() for block in doc.getTextBlocks()] - self.assertEqual(isContent,contentArr) - self.assertEqual(isChanged,isContent!=isContentBefore) - - def test_markEveryhingContent(self): - doc=self.makedoc([5,100,80],None,[False,True,False]) - self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True]) - - def test_inverted(self): - doc=self.makedoc([5,100,80],None,[False,True,False]) - self.verifyContent(InvertedFilter(),doc,[True,False,True]) - - def test_boilerplateBlock(self): - #keeps if isContent - doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False]) - initBlocks=doc.getTextBlocks() - finalBlocks=[initBlocks[1],initBlocks[3]] - filtr=BoilerplateBlockFilter() - isChanged=filtr.process(doc) - isContent=[block.isContent() for block in doc.getTextBlocks()] - self.assertEqual(doc.getTextBlocks(),finalBlocks) - self.assertEqual(isContent,[True,True]) - self.assertEqual(isChanged,True) - - def test_minWords(self): - #rejects if #words6 - self.verifyContent(SurroundingToContentFilter(defaultCondition),doc,[True,True,True,False,True,False,False,True]) - - def test_labelToBoilerplate(self): - #reject block if it has a particular label - lb_not=DefaultLabels.STRICTLY_NOT_CONTENT - lb_maybe=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[True,True,True,True],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) - self.verifyContent(LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT),doc,[False,True,False,True]) - - def test_labelToContent(self): - #accept block if it has a particular label - lb_not=DefaultLabels.STRICTLY_NOT_CONTENT - lb_maybe=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[False,False,False,False],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) - self.verifyContent(LabelToContentFilter(DefaultLabels.MIGHT_BE_CONTENT),doc,[False,True,True,False]) - - - def test_simpleBlockFusion(self): - #join blocks with the same number of words per line - doc=self.makedoc(["two words","three fucking words","another three words"],None,[False,False,False]) - filtr=SimpleBlockFusionProcessor() - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,0),(1,2)]) - self.assertEqual(isChanged,True) - - def test_contentFusion(self): - #join blocks with low link density - filtr=ContentFusion() - - #merge - doc=self.makedoc([10,10],[0,0],[True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),1) - self.assertEqual(isChanged,True) - - #dont merge if 
tagged not content - doc=self.makedoc([10,10],[0,0],[True,False],[None,DefaultLabels.STRICTLY_NOT_CONTENT]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),2) - self.assertEqual(isChanged,False) - - #dont merge if link density is high - doc=self.makedoc([10,10],[0,8],[True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),2) - self.assertEqual(isChanged,False) - - #multiple pass merging - doc=self.makedoc([10,10,10,10],[0,0,0,0],[True,False,True,False]) - isChanged=filtr.process(doc) - self.assertEqual(len(doc.getTextBlocks()),1) - self.assertEqual(isChanged,True) - - def test_labelFusion(self): - #fuse blocks with identical labels - ONLY LOOKS AT LABELS with markup prefix - - lb1=DefaultLabels.MARKUP_PREFIX+".title" - lb2=DefaultLabels.MARKUP_PREFIX+".menu" - doc=self.makedoc([10,10,10,10,10,10,10],None,None,[None,None,lb1,lb1,lb2,lb2,[lb1,lb2]]) - filtr=LabelFusion() - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,1),(2,3),(4,5),(6,6)]) - self.assertEqual(isChanged,True) - - def test_blockProximity(self): - #fuse blocks close to each other - doc=self.makedoc([10,10,10,10,10,10,10],None,[False,True,True,True,True,True,False]) - filtr=BlockProximityFusion(1,True,False) - isChanged=filtr.process(doc) - blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] - self.assertEqual(blockIdxs,[(0,0),(1,5),(6,6)]) - self.assertEqual(isChanged,True) - - def test_largestBlock(self): - #choose largest block - doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) - self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) - - def test_expandTitleToContent(self): - #marks all between title and content start - lb1=DefaultLabels.MIGHT_BE_CONTENT - doc=self.makedoc([10,10,10,10],None,[False,False,False,True],[lb1,[lb1,DefaultLabels.TITLE],lb1,lb1]) - self.verifyContent(ExpandTitleToContentFilter(),doc,[False,True,True,True]) - - def test_articleMetadata(self): - #marks as content and tags blocks with date/time data - doc=self.makedoc([" May 1, 2009 8:00pm EST","May not be date 1","By Frank Sinatra","By looking at this sentence, you can see there is no author"],None,[False,False,False,False]) - self.verifyContent(ArticleMetadataFilter(),doc,[True,False,True,False]) - labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertIn(DefaultLabels.ARTICLE_METADATA,labels[0]) - - def test_largestBlock(self): - #accept largest block and reject all others - doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) - self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) - - def test_addPrecedingLabels(self): - #add prefix+preceding label to each block - lb1=DefaultLabels.TITLE - lb2=DefaultLabels.MIGHT_BE_CONTENT - prefix="^" - doc=self.makedoc([10,10,10],None,None,[lb1,lb2,None]) - filtr=AddPrecedingLabelsFilter(prefix) - isChanged=filtr.process(doc) - labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertEqual(labels,[set([lb1]),set([prefix+lb1,lb2]),set([prefix+lb2])]) - self.assertEqual(isChanged,True) - - def test_documentTitleMatch(self): - #add title label to blocks matching sections of the title - doc=self.makedoc(["News","This is the real title","Red herring"]) - doc.setTitle("News - This is the real title") - filtr=DocumentTitleMatchClassifier(None,True) - isChanged=filtr.process(doc) - 
labels=[block.getLabels() for block in doc.getTextBlocks()] - self.assertEqual(labels,[set(),set([DefaultLabels.TITLE]),set()]) - self.assertEqual(isChanged,True) - - def test_minFulltextWords(self): - #choose largest block - doc=self.makedoc([10,50],None,[True,True]) - self.verifyContent(MinFulltextWordsFilter(30),doc,[False,True]) - - def test_largestFulltextBlock(self): - #accept largest block that has been marked as content and reject all others - doc=self.makedoc([10,50,80,10],None,[True,True,False,False]) - self.verifyContent(KeepLargestFulltextBlockFilter(),doc,[False,True,False,False]) - - def test_ignoreBlocksAfterContent(self): - #rejects all blocks after(&including) first block with ENDOFTEXT label - #Also: ENDOFTEXT labels are ignored until the total number of words in content blocks reaches a certain number - lb=DefaultLabels.INDICATES_END_OF_TEXT - doc=self.makedoc([10,30,50,80,20],None,[False,True,True,True,True],[lb,None,None,lb,None]) - self.verifyContent(IgnoreBlocksAfterContentFilter(60),doc,[False,True,True,False,False]) - - def test_ignoreBlocksAfterContentFromEnd(self): - #rejects all blocks with ENDOFTEXT label - #works backwards until the total number of words in content blocks reaches 200 and then halts - lb=DefaultLabels.INDICATES_END_OF_TEXT - doc=self.makedoc([80,80,80,80,80],None,[True,True,True,True,True],[lb,None,None,lb,None]) - self.verifyContent(IgnoreBlocksAfterContentFromEndFilter(),doc,[True,True,True,False,True]) - - def test_terminatingBlocks(self): - #add ENDOFTEXT label at detected beginning of comments section - lb=DefaultLabels.INDICATES_END_OF_TEXT - s1="Comments can be the first word of article text. If there are many words in the block, it is not comments" - s2="Thanks for your comments - this feedback is now closed" - doc=self.makedoc(["Comments","Please have your say","48 Comments today",s1,s2]) - filtr=TerminatingBlocksFinder() - isChanged=filtr.process(doc) - hasLabel=[(lb in block.getLabels()) for block in doc.getTextBlocks()] - self.assertEqual(hasLabel,[True,True,True,False,True]) - self.assertEqual(isChanged,True) - - def test_numWordsClassifier(self): - #accepts or rejects block based on machine-trained decision tree rules - #using features from previous, current and next block - filtr=NumWordsRulesClassifier() - - doc=self.makedoc([2,10,10],[0,0,0],[True,True,True]) - isChanged=filtr.process(doc) - #test middle block only - self.assertEqual(doc.getTextBlocks()[1].isContent(),False) - - doc=self.makedoc([10,10,10],[0,0,0],[True,True,True]) - isChanged=filtr.process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),True) - - def test_densityClassifier(self): - #accepts or rejects block based on a different set of machine-trained decision tree rules - #using features from previous, current and next block - doc=self.makedoc([10,10,5],[10,0,0],[True,True,True]) - isChanged=DensityRulesClassifier().process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),False) - - def test_canolaClassifier(self): - #accepts or rejects block based on a different set of machine-trained decision tree rules - #using features from previous, current and next block - doc=self.makedoc([5,10,30],[5,10,0],[True,False,True]) - isChanged=CanolaFilter().process(doc) - self.assertEqual(doc.getTextBlocks()[1].isContent(),True) + defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. 
Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') + + def makedoc(self,wordsArr,numAnchorWordsArr=None,isContentArr=None,labelArr=None): + textBlocks=[] + for idx,words in enumerate(wordsArr): + if type(words)==int: + numWords=words + text=' '.join(self.defaultWords[:numWords]) + else: + text=words + numWords=text.count(' ') + try: + numAnchorWords=numAnchorWordsArr[idx] + except (TypeError, IndexError): + numAnchorWords=0 + block=TextBlock(text,set(),numWords,numAnchorWords,0,0,idx) + try: + block.setIsContent(isContentArr[idx]) + except (TypeError, IndexError): + pass + try: + label=labelArr[idx] + if label==None: pass + elif type(label)==list: + for l in label: block.addLabel(l) + else: block.addLabel(label) + except (TypeError, IndexError): + pass + + textBlocks.append(block) + + return TextDocument(textBlocks) + + def verifyContent(self,filtr,doc,contentArr,show=False): + isContentBefore=[block.isContent() for block in doc.getTextBlocks()] + isChanged=filtr.process(doc) + isContent=[block.isContent() for block in doc.getTextBlocks()] + self.assertEqual(isContent,contentArr) + self.assertEqual(isChanged,isContent!=isContentBefore) + + def test_markEveryhingContent(self): + doc=self.makedoc([5,100,80],None,[False,True,False]) + self.verifyContent(MarkEverythingContentFilter(),doc,[True,True,True]) + + def test_inverted(self): + doc=self.makedoc([5,100,80],None,[False,True,False]) + self.verifyContent(InvertedFilter(),doc,[True,False,True]) + + def test_boilerplateBlock(self): + #keeps if isContent + doc=self.makedoc([5,100,10,50,80],None,[False,True,False,True,False]) + initBlocks=doc.getTextBlocks() + finalBlocks=[initBlocks[1],initBlocks[3]] + filtr=BoilerplateBlockFilter() + isChanged=filtr.process(doc) + isContent=[block.isContent() for block in doc.getTextBlocks()] + self.assertEqual(doc.getTextBlocks(),finalBlocks) + self.assertEqual(isContent,[True,True]) + self.assertEqual(isChanged,True) + + def test_minWords(self): + #rejects if #words6 + self.verifyContent(SurroundingToContentFilter(defaultCondition),doc,[True,True,True,False,True,False,False,True]) + + def test_labelToBoilerplate(self): + #reject block if it has a particular label + lb_not=DefaultLabels.STRICTLY_NOT_CONTENT + lb_maybe=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[True,True,True,True],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) + self.verifyContent(LabelToBoilerplateFilter(DefaultLabels.STRICTLY_NOT_CONTENT),doc,[False,True,False,True]) + + def test_labelToContent(self): + #accept block if it has a particular label + lb_not=DefaultLabels.STRICTLY_NOT_CONTENT + lb_maybe=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[False,False,False,False],[lb_not,lb_maybe,[lb_not,lb_maybe],None]) + self.verifyContent(LabelToContentFilter(DefaultLabels.MIGHT_BE_CONTENT),doc,[False,True,True,False]) + + + def test_simpleBlockFusion(self): + #join blocks with the same number of words per line + doc=self.makedoc(["two words","three fucking words","another three words"],None,[False,False,False]) + filtr=SimpleBlockFusionProcessor() + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block 
in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,0),(1,2)]) + self.assertEqual(isChanged,True) + + def test_contentFusion(self): + #join blocks with low link density + filtr=ContentFusion() + + #merge + doc=self.makedoc([10,10],[0,0],[True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),1) + self.assertEqual(isChanged,True) + + #dont merge if tagged not content + doc=self.makedoc([10,10],[0,0],[True,False],[None,DefaultLabels.STRICTLY_NOT_CONTENT]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),2) + self.assertEqual(isChanged,False) + + #dont merge if link density is high + doc=self.makedoc([10,10],[0,8],[True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),2) + self.assertEqual(isChanged,False) + + #multiple pass merging + doc=self.makedoc([10,10,10,10],[0,0,0,0],[True,False,True,False]) + isChanged=filtr.process(doc) + self.assertEqual(len(doc.getTextBlocks()),1) + self.assertEqual(isChanged,True) + + def test_labelFusion(self): + #fuse blocks with identical labels - ONLY LOOKS AT LABELS with markup prefix + + lb1=DefaultLabels.MARKUP_PREFIX+".title" + lb2=DefaultLabels.MARKUP_PREFIX+".menu" + doc=self.makedoc([10,10,10,10,10,10,10],None,None,[None,None,lb1,lb1,lb2,lb2,[lb1,lb2]]) + filtr=LabelFusion() + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,1),(2,3),(4,5),(6,6)]) + self.assertEqual(isChanged,True) + + def test_blockProximity(self): + #fuse blocks close to each other + doc=self.makedoc([10,10,10,10,10,10,10],None,[False,True,True,True,True,True,False]) + filtr=BlockProximityFusion(1,True,False) + isChanged=filtr.process(doc) + blockIdxs=[(block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()) for block in doc.getTextBlocks()] + self.assertEqual(blockIdxs,[(0,0),(1,5),(6,6)]) + self.assertEqual(isChanged,True) + + def test_largestBlock(self): + #choose largest block + doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) + self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) + + def test_expandTitleToContent(self): + #marks all between title and content start + lb1=DefaultLabels.MIGHT_BE_CONTENT + doc=self.makedoc([10,10,10,10],None,[False,False,False,True],[lb1,[lb1,DefaultLabels.TITLE],lb1,lb1]) + self.verifyContent(ExpandTitleToContentFilter(),doc,[False,True,True,True]) + + def test_articleMetadata(self): + #marks as content and tags blocks with date/time data + doc=self.makedoc([" May 1, 2009 8:00pm EST","May not be date 1","By Frank Sinatra","By looking at this sentence, you can see there is no author"],None,[False,False,False,False]) + self.verifyContent(ArticleMetadataFilter(),doc,[True,False,True,False]) + labels=[block.getLabels() for block in doc.getTextBlocks()] + self.assertIn(DefaultLabels.ARTICLE_METADATA,labels[0]) + + def test_largestBlock(self): + #accept largest block and reject all others + doc=self.makedoc([10,10,50,10],None,[False,True,True,True]) + self.verifyContent(KeepLargestBlockFilter(),doc,[False,False,True,False]) + + def test_addPrecedingLabels(self): + #add prefix+preceding label to each block + lb1=DefaultLabels.TITLE + lb2=DefaultLabels.MIGHT_BE_CONTENT + prefix="^" + doc=self.makedoc([10,10,10],None,None,[lb1,lb2,None]) + filtr=AddPrecedingLabelsFilter(prefix) + isChanged=filtr.process(doc) + labels=[block.getLabels() for block in doc.getTextBlocks()] + 
self.assertEqual(labels,[set([lb1]),set([prefix+lb1,lb2]),set([prefix+lb2])]) + self.assertEqual(isChanged,True) + + def test_documentTitleMatch(self): + #add title label to blocks matching sections of the title + doc=self.makedoc(["News","This is the real title","Red herring"]) + doc.setTitle("News - This is the real title") + filtr=DocumentTitleMatchClassifier(None,True) + isChanged=filtr.process(doc) + labels=[block.getLabels() for block in doc.getTextBlocks()] + self.assertEqual(labels,[set(),set([DefaultLabels.TITLE]),set()]) + self.assertEqual(isChanged,True) + + def test_minFulltextWords(self): + #choose largest block + doc=self.makedoc([10,50],None,[True,True]) + self.verifyContent(MinFulltextWordsFilter(30),doc,[False,True]) + + def test_largestFulltextBlock(self): + #accept largest block that has been marked as content and reject all others + doc=self.makedoc([10,50,80,10],None,[True,True,False,False]) + self.verifyContent(KeepLargestFulltextBlockFilter(),doc,[False,True,False,False]) + + def test_ignoreBlocksAfterContent(self): + #rejects all blocks after(&including) first block with ENDOFTEXT label + #Also: ENDOFTEXT labels are ignored until the total number of words in content blocks reaches a certain number + lb=DefaultLabels.INDICATES_END_OF_TEXT + doc=self.makedoc([10,30,50,80,20],None,[False,True,True,True,True],[lb,None,None,lb,None]) + self.verifyContent(IgnoreBlocksAfterContentFilter(60),doc,[False,True,True,False,False]) + + def test_ignoreBlocksAfterContentFromEnd(self): + #rejects all blocks with ENDOFTEXT label + #works backwards until the total number of words in content blocks reaches 200 and then halts + lb=DefaultLabels.INDICATES_END_OF_TEXT + doc=self.makedoc([80,80,80,80,80],None,[True,True,True,True,True],[lb,None,None,lb,None]) + self.verifyContent(IgnoreBlocksAfterContentFromEndFilter(),doc,[True,True,True,False,True]) + + def test_terminatingBlocks(self): + #add ENDOFTEXT label at detected beginning of comments section + lb=DefaultLabels.INDICATES_END_OF_TEXT + s1="Comments can be the first word of article text. 
If there are many words in the block, it is not comments" + s2="Thanks for your comments - this feedback is now closed" + doc=self.makedoc(["Comments","Please have your say","48 Comments today",s1,s2]) + filtr=TerminatingBlocksFinder() + isChanged=filtr.process(doc) + hasLabel=[(lb in block.getLabels()) for block in doc.getTextBlocks()] + self.assertEqual(hasLabel,[True,True,True,False,True]) + self.assertEqual(isChanged,True) + + def test_numWordsClassifier(self): + #accepts or rejects block based on machine-trained decision tree rules + #using features from previous, current and next block + filtr=NumWordsRulesClassifier() + + doc=self.makedoc([2,10,10],[0,0,0],[True,True,True]) + isChanged=filtr.process(doc) + #test middle block only + self.assertEqual(doc.getTextBlocks()[1].isContent(),False) + + doc=self.makedoc([10,10,10],[0,0,0],[True,True,True]) + isChanged=filtr.process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),True) + + def test_densityClassifier(self): + #accepts or rejects block based on a different set of machine-trained decision tree rules + #using features from previous, current and next block + doc=self.makedoc([10,10,5],[10,0,0],[True,True,True]) + isChanged=DensityRulesClassifier().process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),False) + + def test_canolaClassifier(self): + #accepts or rejects block based on a different set of machine-trained decision tree rules + #using features from previous, current and next block + doc=self.makedoc([5,10,30],[5,10,0],[True,False,True]) + isChanged=CanolaFilter().process(doc) + self.assertEqual(doc.getTextBlocks()[1].isContent(),True) class TestParser(unittest.TestCase): - extractor=Extractor(None) - defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') - - def contentitem(self,s): - if type(s)==int: - return ' '.join(self.defaultWords[:s]) - else: return s - - def makecontent(self,strArr): - return [self.contentitem(s) for s in strArr] - - def makedoc(self,template,contentArr): - templateArr=template.split('*') - s="" - for i,j in zip(templateArr[:-1],contentArr): - s+=i+j - s+=templateArr[-1] - doc=self.extractor.parseDoc(s) - return doc - - def test_blocks(self): - template="

*

*

*

*
" - content=self.makecontent([4,5,6,7]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - numWords=[block.getNumWords() for block in blocks] - self.assertEqual(textArr,content) - self.assertEqual(numWords,[4,5,6,7]) - - def test_anchor(self): - template="

*

*

" - content=self.makecontent([6,"end with space ",3,6]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - densityArr=[block.getLinkDensity() for block in blocks] - numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] - self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) - self.assertEqual(numAnchorWords,[0,3,6]) - self.assertEqual(densityArr,[0.0,0.5,1.0]) - - def test_title(self): - titleText="THIS IS TITLE" - s=""+titleText+"

THIS IS CONTENT

" - doc=self.extractor.parseDoc(s) - self.assertEqual(doc.getTitle(),titleText) - - def test_body(self): - bodyText="THIS IS CONTENT" - s="

NOT IN BODY

"+bodyText+"

" - doc=self.extractor.parseDoc(s) - textArr=[block.getText() for block in doc.getTextBlocks()] - self.assertEqual(textArr,[bodyText]) - - def test_inline(self): - template="

*

*

**
" - content=['AA','BB','CC','DD'] - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - numWords=[block.getNumWords() for block in blocks] - self.assertEqual(textArr,[content[0],content[1],content[2]+content[3]]) - - def test_ignorable(self): - template="

*

" - content=self.makecontent([10,12]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - textArr=[block.getText() for block in blocks] - self.assertEqual(textArr,[content[0]]) - - def assertRange(self,val,minval,maxval): - self.assertTrue(val>=minval and val<=maxval) - - def test_textDensity(self): - template="

*

*

" - content=self.makecontent([80,"one, !!! two"]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] - - #exact values are unknown, approximate value range to check - self.assertEqual(blocks[0].getNumWords(),80) - self.assertRange(blocks[0].numWordsInWrappedLines,60,80) - self.assertRange(blocks[0].numWrappedLines,4,7) - self.assertRange(blocks[0].getTextDensity(),8,16) - - self.assertEqual(numArr[1],[2,2,1,2]) - - def test_blockIdxs(self): - template="

*

*

*

*

" - content=self.makecontent([11,12,13,14]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] - self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) - - def test_tagLevel(self): - template="

*

*
" - content=self.makecontent([5,6]) - doc=self.makedoc(template,content) - - blocks=doc.getTextBlocks() - levelArr=[block.getTagLevel() for block in blocks] - self.assertEqual(levelArr,[5,3]) - - def test_merge(self): - block1=TextBlock("AA BB CC ",set([0]),3,3,3,1,0) - block2=TextBlock("DD EE FF GG HH II JJ .",set([1]),6,0,6,2,1) - block1.addLabels(DefaultLabels.MIGHT_BE_CONTENT) - block2.addLabels(DefaultLabels.ARTICLE_METADATA) - block1.mergeNext(block2) - self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") - self.assertEqual(block1.getNumWords(),9) - self.assertEqual(block1.getNumWordsInAnchorText(),3) - self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0) - self.assertEqual(block1.getTextDensity(),3) - self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) - self.assertEqual(block1.getOffsetBlocksStart(),0) - self.assertEqual(block1.getOffsetBlocksEnd(),1) - - - def test_getDocFromUrl(self): - """getDocFromUrl() should run (was dying because of undefined 'filename')""" - url = "http://www.example.com/" - fake_readFromUrl = mock.Mock(return_value=u"

Example

") - tmp_filter = MarkEverythingContentFilter() - - with mock.patch.object(self.extractor, "readFromUrl", fake_readFromUrl): - with mock.patch.object(self.extractor, "filter", tmp_filter): - self.assertIsInstance(self.extractor.getDocFromUrl(url), TextDocument) + extractor=Extractor(None) + defaultWords="Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec fermentum tincidunt magna, eu pulvinar mauris dapibus pharetra. In varius, nisl a rutrum porta, sem sem semper lacus, et varius urna tellus vel lorem. Nullam urna eros, luctus eget blandit ac, imperdiet feugiat ipsum. Donec laoreet tristique mi a bibendum. Sed pretium bibendum scelerisque. Mauris id pellentesque turpis. Mauris porta adipiscing massa, quis tempus dui pharetra ac. Morbi lacus mauris, feugiat ac tempor ut, congue tincidunt risus. Pellentesque tincidunt adipiscing elit, in fringilla enim scelerisque vel. Nulla facilisi. ".split(' ') + + def contentitem(self,s): + if type(s)==int: + return ' '.join(self.defaultWords[:s]) + else: return s + + def makecontent(self,strArr): + return [self.contentitem(s) for s in strArr] + + def makedoc(self,template,contentArr): + templateArr=template.split('*') + s="" + for i,j in zip(templateArr[:-1],contentArr): + s+=i+j + s+=templateArr[-1] + doc=self.extractor.parseDoc(s) + return doc + + def test_blocks(self): + template="

*

*

*

*
" + content=self.makecontent([4,5,6,7]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + numWords=[block.getNumWords() for block in blocks] + self.assertEqual(textArr,content) + self.assertEqual(numWords,[4,5,6,7]) + + def test_anchor(self): + template="

*

**

*

" + content=self.makecontent([6,"end with space ",3,6]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + densityArr=[block.getLinkDensity() for block in blocks] + numAnchorWords=[block.getNumWordsInAnchorText() for block in blocks] + self.assertEqual(textArr,[content[0],content[1]+content[2],content[3]]) + self.assertEqual(numAnchorWords,[0,3,6]) + self.assertEqual(densityArr,[0.0,0.5,1.0]) + + def test_title(self): + titleText="THIS IS TITLE" + s=""+titleText+"

THIS IS CONTENT

" + doc=self.extractor.parseDoc(s) + self.assertEqual(doc.getTitle(),titleText) + + def test_body(self): + bodyText="THIS IS CONTENT" + s="

NOT IN BODY

"+bodyText+"

" + doc=self.extractor.parseDoc(s) + textArr=[block.getText() for block in doc.getTextBlocks()] + self.assertEqual(textArr,[bodyText]) + + def test_inline(self): + template="

*

*

**
" + content=['AA','BB','CC','DD'] + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + numWords=[block.getNumWords() for block in blocks] + self.assertEqual(textArr,[content[0],content[1],content[2]+content[3]]) + + def test_ignorable(self): + template="

*

" + content=self.makecontent([10,12]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + textArr=[block.getText() for block in blocks] + self.assertEqual(textArr,[content[0]]) + + def assertRange(self,val,minval,maxval): + self.assertTrue(val>=minval and val<=maxval) + + def test_textDensity(self): + template="

*

*

" + content=self.makecontent([80,"one, !!! two"]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + numArr=[[block.getNumWords(),block.numWordsInWrappedLines,block.numWrappedLines,block.getTextDensity()] for block in blocks] + + #exact values are unknown, approximate value range to check + self.assertEqual(blocks[0].getNumWords(),80) + self.assertRange(blocks[0].numWordsInWrappedLines,60,80) + self.assertRange(blocks[0].numWrappedLines,4,7) + self.assertRange(blocks[0].getTextDensity(),8,16) + + self.assertEqual(numArr[1],[2,2,1,2]) + + def test_blockIdxs(self): + template="

*

*

*

*

" + content=self.makecontent([11,12,13,14]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + idxArr=[[block.getOffsetBlocksStart(),block.getOffsetBlocksEnd()] for block in blocks] + self.assertEqual(idxArr,[[0,0],[1,1],[2,2],[3,3]]) + + def test_tagLevel(self): + template="

*

*
" + content=self.makecontent([5,6]) + doc=self.makedoc(template,content) + + blocks=doc.getTextBlocks() + levelArr=[block.getTagLevel() for block in blocks] + self.assertEqual(levelArr,[5,3]) + + def test_merge(self): + block1=TextBlock("AA BB CC ",set([0]),3,3,3,1,0) + block2=TextBlock("DD EE FF GG HH II JJ .",set([1]),6,0,6,2,1) + block1.addLabels(DefaultLabels.MIGHT_BE_CONTENT) + block2.addLabels(DefaultLabels.ARTICLE_METADATA) + block1.mergeNext(block2) + self.assertEqual(block1.getText(),"AA BB CC \nDD EE FF GG HH II JJ .") + self.assertEqual(block1.getNumWords(),9) + self.assertEqual(block1.getNumWordsInAnchorText(),3) + self.assertAlmostEqual(block1.getLinkDensity(), 1.0 / 3.0) + self.assertEqual(block1.getTextDensity(),3) + self.assertEqual(block1.getLabels(),set([DefaultLabels.MIGHT_BE_CONTENT,DefaultLabels.ARTICLE_METADATA])) + self.assertEqual(block1.getOffsetBlocksStart(),0) + self.assertEqual(block1.getOffsetBlocksEnd(),1) + + + def test_getDocFromUrl(self): + """getDocFromUrl() should run (was dying because of undefined 'filename')""" + url = "http://www.example.com/" + fake_readFromUrl = mock.Mock(return_value=u"

Example

") + tmp_filter = MarkEverythingContentFilter() + + with mock.patch.object(self.extractor, "readFromUrl", fake_readFromUrl): + with mock.patch.object(self.extractor, "filter", tmp_filter): + self.assertIsInstance(self.extractor.getDocFromUrl(url), TextDocument) runTests() From e152a0653bd71c76a582526108d1fa58011e9d49 Mon Sep 17 00:00:00 2001 From: Jesir Vargas Date: Fri, 6 Apr 2018 11:49:41 -0400 Subject: [PATCH 4/4] remove accent for encoding sanity --- README.txt | 2 +- dist/boilerpy-1.0.zip | Bin 27822 -> 0 bytes 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 dist/boilerpy-1.0.zip diff --git a/README.txt b/README.txt index 078fa4b..5e06b9b 100644 --- a/README.txt +++ b/README.txt @@ -5,7 +5,7 @@ BoilerPy About --------------------------------------- -BoilerPy is a native Python port of Christian Kohlschütter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ +BoilerPy is a native Python port of Christian Kohlschutter's Boilerpipe library, released under the Apache 2.0 Licence. (http://code.google.com/p/boilerpipe/ ) I created this port since I don't have access to Java on my webhost and I wanted to create a pure Python version. Another Python version which consists of Python hooks to the original Java library can be found here : (https://github.com/misja/python-boilerpipe diff --git a/dist/boilerpy-1.0.zip b/dist/boilerpy-1.0.zip deleted file mode 100644 index a849c7c221511e0f39daa839cfff6b17cd8292f9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 27822 zcmZ^~V~j3L)Ghe5ZQHhO+qP}necHBd+qP}Mx4_R%HJL=6_-AU}bCO?C3?uM9;_|A}J&gh_u zvcj^U#K@}TC?luL$jr#9zObrxKQm1`DLbP?DLpGCM=QSy0#N*al2iZb;h%Hr zX{FR2`d`)vuBN0EF;2q>xpw?mN(7H*1P5^xgblC0s50~^bSrIa*RB*evR`iK+PE_^ zVp=*pyd$K4e^Qab`jl*fag*JFLX=EN#7OOkx#^=%?vKP{+DZquLw96Gmxr@mBPpO4 zIiuy;&^xlwqTiZFUodjsZ4^@cDniWqyTAECyCJBtZ2WRb{|TR7Xj>Jtxl{OJ#~ zNVVv+?z~p@rU+4ZGXKa=2I$w8Sxuc}vbq!+_+jesp>FGG-PoIJ2TAZK#y~!sum)+f zQb^9zO0&yyPC(&CU4uMeK@9Ge8zwb=BQLsHB?W zd5*C>160m}QWphmdECNLcq^B&M9FRXr|=be2vBLh0xKL^9(_e59%>KnA6O>Ip2z;; zn&oU0{Le{F`oo?3Ji1$1_2v+@44Rr{AfcYkkxw^^i@h%-5Y zz>+rmhAA8~O0sdJu|TC$qmj%1ehq7{;5`uv6MB|VMMxAV5wc_m5}~XZLIu+yA1p>P z1%hBeL5tQP!=_2ofqhz@^jea%w6{JvraxdH+LXjdO2Ta3-LU11EXx;}jtfg40-=vv z!fJvEG^2q$($W~kDpke2NVx*pkoRJ$$&k`M3Ogids(oN1J}M_ld`^PbsNk0)(pW)K z9Sc??lt}OTtinT-ek93E={iWnVvc~?hBhdIVkZ1I37<>lneX@-_3ljH*b8|FA`uxA z9Z#=t2gUg7<_LcDfb<0_qYX#`MT2q|yAt)%4K`Ocnwx_>XI($^l~6jLm+Ks_V}=P@ z^)_&{Ud09+(pyk2L(hudMgJ>!ya+QCfb3>XgDL|4xi9uZ4SGs(e#xsSa%U^_L{Ec3 z;p~;G@iYU%d(=H6Q!n7P*^%% zHX=JHk+@3Mf9X1(cg2_fhmeNPD*ZcHqgQ<>{;V(j!|~Ln(;u+itU^s&f#BgsvS+B2 zOpMe_S3-Z8^{7dhil&xo;UE$lWK|c4LN!h`j)pB6ITJA$F&IgAlq9bV zly?}6He{awb&$L?(j+<-w;oo|w4A`@-fk9`7xyofpL*Wsxp`22YP)VO{QUdFvdtpr ztgN7koP;VzlS>7s&FKuXXvmEzJ-CYk4ax3s+u@qB0-+y0<)lG0GHbls5J6wELMh;J z*^#>&PdDCT_@R%_mmcNvcZ{XjFw2}JDuG8TCV}Px8!{~FqcJvzubhE^Y&vl|HYPQp zxD3%kF62p&yHJ5;*hH%b_VXzM?(_IcDPUf3mi4%fI^y-tt!WJkHbXUdP-qAPCJPuc z1{6crdA|oLU1QN5+}{mzz{2_rKBg3BQ+F|N^%Ta+j|+Ei2VMZ4BHx05eoWwIvz+E^ z3Q-SFSxRNuB}7Eb4=86b#lZgJ8iKKsT2%OnSPoe2y<;Ytngo~{t;REGEJYNJwZYXY zJmyXhP>W;AdZN8}HNzBR)1mvh;#5dS+1*xUc?&F}5!gylr-jL z=s)m6zytU5dmPE#g0Dnkzb$oIl0)la ztQbEyZg-4B>hbdX^6PesER7#O3%|M*i9R!q%8s_`b`xp6il7Pocz5>ww%W0W17!(k z;9T;1ijkJ6;B{#gXk{hBffVtS@!ntV{e0NWoTBou^pRhtatiG}=ijNXFT5Rjc@C~K zRHdV7;@#1ey+XNJGx#4gCo7j1W+apFi30@y$YB5gjQ<_YmBoZa<^C5fm#WLTY>UJ4 zX|VhtWc*)v{(7>Nr)0+ML=q4~OEx>HpOnz?;GDjRz|!@c{<0>ZZYVYBmStrume)w- z4aCa}K=AS1@pJd?$EsY=uXpzzk7w}kE}jAHU&is>|3kMHC&-Oh5SESrxlUt%4cFR; 