From 40fdc972b48ff47b3d29ace275df55b5626aed38 Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:54:52 +0100
Subject: [PATCH 1/2] JSON cache structure evolved to now store the linkback
 requests status - fix #2

---
 .pylintrc                                   |   2 +-
 CHANGELOG.md                                |  12 +
 README.md                                   |  11 +-
 pelican/plugins/linkbacks/linkbacks.py      | 327 +++++++++++++++-----
 pelican/plugins/linkbacks/test_linkbacks.py |  48 ++-
 5 files changed, 304 insertions(+), 96 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 9f1aaa0..451a7b8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
 [MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
 
 [FORMAT]
 max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6305911..573207e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## [1.0.6] - Not released yet
+
+### Added
+
+- manual execution mode: `python linkbacks.py $pelican_generated_html_file`
+
+### Changed
+
+- JSON cache structure evolved to now store the status of linkback requests.
+  This is **not** backward-compatible, and the plugin will ask you to remove any pre-existing `pelican-plugin-linkbacks.json` file.
+  _cf._ [issue #2](https://github.com/pelican-plugins/linkbacks/issues/2)
+
 ## [1.0.5] - 2025-12-22
 
 ### Added
diff --git a/README.md b/README.md
index a0d0f6a..8d1080f 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,15 @@ where `$CACHE_PATH` is [a Pelican setting](https://docs.getpelican.com/en/latest
 
     time in seconds allowed for each HTTP linkback request before abandon
 
+## Manual execution
+The `linkbacks.py` module can be used as a script to test this plugin's behavior:
+
+    export SITEURL=...
+    python path/to/pelican/plugins/linkbacks/linkbacks.py $pelican_generated_html_file
+
+Optionally, the `colorama` package can be installed to get colored logs in the output.
+
+
 ## Contributing
 Contributions are welcome and much appreciated. Every little bit helps.
 You can contribute by improving the documentation,
@@ -99,7 +108,7 @@ With a valid configuration in `~/.config/pypoetry/`:
 ## Linter & tests
 To execute them:
 
-    pylint *linkbacks.py
+    pylint pelican/plugins/linkbacks/
     pytest
 
 ### Integration tests
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index 8e6e202..c2f365d 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -1,3 +1,5 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
 from contextlib import closing
 try:
     from contextlib import nullcontext
@@ -7,8 +9,9 @@
 import json
 import logging
 import os
+import sys
 from os import makedirs
-from os.path import splitext
+from os.path import basename, splitext
 from ssl import CERT_NONE, SSLError
 import xmlrpc.client
 import warnings
@@ -44,14 +47,9 @@
 def process_all_articles_linkbacks(generators):
     article_generator = next(g for g in generators if isinstance(g, ArticlesGenerator))
     config = LinkbackConfig(article_generator.settings)
+    cache = Cache.load_from_json(config)
 
-    try:
-        with open(config.cache_filepath, encoding='utf8') as cache_file:
-            cache = json.load(cache_file)
-    except FileNotFoundError:
-        cache = {}
-
-    original_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+    original_cache_links_count = cache.links_count()
     successful_notifs_count = 0
     try:
         for article in article_generator.articles:
@@ -59,37 +57,19 @@ def process_all_articles_linkbacks(generators):
             with nullcontext() if config.cert_verify else warnings.catch_warnings():
                 if not config.cert_verify:
                     warnings.simplefilter('ignore', InsecureRequestWarning)
-                successful_notifs_count += process_all_links_of_an_article(article, cache, config)
+                successful_notifs_count += process_all_links_of_an_article(config, cache, article.url, article.slug, article.content)
         return successful_notifs_count
     finally:
         # We save the cache & log our progress even in case of an interruption:
-        with open(config.cache_filepath, 'w+', encoding='utf8') as cache_file:
-            json.dump(cache, cache_file)
-        new_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+        cache.dump_to_json()
         LOGGER.info("Linkback plugin execution took: %s - Links processed & inserted in cache: %s - Successful notifications: %s",
-                    datetime.now() - start_time, new_cache_links_count - original_cache_links_count, successful_notifs_count)
-
-class LinkbackConfig:
-    def __init__(self, settings=None):
-        if settings is None:
-            settings = {}
-        self.siteurl = settings.get('SITEURL', '')
-        self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
-        if not self.cache_filepath:
-            cache_dir = settings.get('CACHE_PATH', '')
-            self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
-            if cache_dir:
-                makedirs(cache_dir, exist_ok=True)
-        self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
-        self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
-        self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+                    datetime.now() - start_time, cache.links_count() - original_cache_links_count, successful_notifs_count)
 
-def process_all_links_of_an_article(article, cache, config):
-    source_url = os.path.join(config.siteurl, article.url)
+def process_all_links_of_an_article(config, cache, url, slug, content):
+    source_url = os.path.join(config.siteurl, url)
     successful_notifs_count = 0
-    links_cache = set(cache.get(article.slug, [])) # Even if an entry exists in the cache, we always extract all links,
     # in order to support articles edits that could add new links.
-    doc_soup = BeautifulSoup(article.content, BS4_HTML_PARSER)
+    doc_soup = BeautifulSoup(content, BS4_HTML_PARSER)
     for anchor in doc_soup('a'):
         if 'href' not in anchor.attrs:
             continue
@@ -102,92 +82,211 @@
         if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
             LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
             continue
-        if link_url in links_cache:
-            LOGGER.debug("Link url %s skipped because it has already been processed (present in cache)", link_url)
+        cache_status = cache.get_status(slug, link_url)
+        if cache_status:
+            LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
             continue
         LOGGER.debug("Now attempting to send Linkbacks for link url %s", link_url)
         try:
            resp_content, resp_headers = requests_get_with_max_size(link_url, config)
         except Exception as error:
             LOGGER.debug("Failed to retrieve web page for link url %s: [%s] %s", link_url, error.__class__.__name__, error)
+            cache.add_failure(slug, link_url, error)
             continue
-        for notifier in (send_pingback, send_webmention):
-            if notifier(source_url, link_url, config, resp_content, resp_headers):
+        for notifier_class in (PingbackNotifier, WebmentionNotifier):
+            try:
+                notifier = notifier_class(source_url, link_url, config)
+                notifier.discover_server_uri(resp_content, resp_headers)
+                if notifier.server_uri:
+                    LOGGER.debug("%s URI detected: %s", notifier.kind, notifier.server_uri)
+                else:
+                    cache.add_failure(slug, link_url, f"No {notifier.kind} URI found", notifier.kind)
+                    continue
+                response = notifier.send()
+                LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
+                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
                 successful_notifs_count += 1
-                links_cache.add(link_url)
-    cache[article.slug] = list(links_cache)
+            except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+                LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
+                cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
+            except Exception as error: # unexpected exception => we display the stacktrace:
+                LOGGER.exception("Failed to send %s for link url %s", notifier.kind, link_url)
+                cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
     return successful_notifs_count
 
-def send_pingback(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
-    try:
+class LinkbackConfig:
+    def __init__(self, settings=None):
+        if settings is None:
+            settings = {}
+        self.siteurl = settings.get('SITEURL', '')
+        self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
+        if not self.cache_filepath:
+            cache_dir = settings.get('CACHE_PATH', '')
+            self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
+            if cache_dir:
+                makedirs(cache_dir, exist_ok=True)
+        self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
+        self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
+        self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+
+class Cache:
+    def __init__(self, config, data):
+        self.cache_filepath = config.cache_filepath
+        # Cache structure:
+        # {
+        #     $article_slug: {
+        #         $link_url: {
+        #             "pingback": {
+        #                 "server_uri": "http...", // optional string
+        #                 "error": // string or null if successfull
+        #             },
+        #             "webmention": {
+        #                 "server_uri": "http...", // optional string
+        #                 "error": // string or null if successfull
+        #             }
+        #         },
+        #         ...
+        #     },
+        #     ...
+        # }
+        self.data = defaultdict(dict)
+        self.data.update(data)
+    def add_success(self, article_slug, link_url, kind, server_uri):
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            link_status = {}
+            article_links[link_url] = link_status
+        link_status[kind] = {
+            "server_uri": server_uri
+        }
+    def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            link_status = {}
+            article_links[link_url] = link_status
+        kinds = [notifier_kind] if notifier_kind else ["pingback", "webmention"]
+        for kind in kinds:
+            status = {
+                "error": error if isinstance(error, str) else f"[{error.__class__.__name__}] {error}"
+            }
+            if server_uri:
+                status["server_uri"] = server_uri
+            link_status[kind] = status
+    def get_status(self, article_slug, link_url):
+        "Return None if a notification should be sent; otherwise return the reason why it should be skipped"
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            return None # link not processed yet
+        pingback_status = link_status.get("pingback")
+        webmention_status = link_status.get("webmention")
+        if pingback_status is None or webmention_status is None:
+            return None # defensive, should never happen
+        # For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
+        # Later on, we could for example consider retrying on HTTP 5XX errors.
+        pingback_error = pingback_status.get("error")
+        webmention_error = webmention_status.get("error")
+        if pingback_error is None or webmention_error is None:
+            return "ALREADY SUBMITTED"
+        return pingback_error or webmention_error
+    def links_count(self):
+        return sum(len(url_statuses) for url_statuses in self.data.values())
+    @classmethod
+    def load_from_json(cls, config):
+        try:
+            with open(config.cache_filepath, encoding='utf8') as cache_file:
+                data = json.load(cache_file)
+        except FileNotFoundError:
+            data = {}
+        is_old_cache = data and isinstance(list(data.values())[0], list)
+        if is_old_cache:
+            raise EnvironmentError(
+                f"Old cache format detected in {config.cache_filepath}: please remove this file before publishing your website."
+                " All linkbacks will be processed again on next pelican execution.",
+            )
+        return cls(config, data)
+    def dump_to_json(self):
+        with open(self.cache_filepath, 'w+', encoding='utf8') as cache_file:
+            json.dump(self.data, cache_file)
+
+class Notifier(ABC):
+    """
+    Public properties:
+    * kind: 'pingback' or 'webmention'
+    * server_uri: URL of the notification endpoint
+    """
+    @abstractmethod
+    def discover_server_uri(self):
+        """
+        Sets .server_uri if a notification endpoint is found for target_url.
+        Must be called before calling send().
+        """
+    @abstractmethod
+    def send(self):
+        "Sends the actual notification."
+
+class PingbackNotifier(Notifier):
+    def __init__(self, source_url, target_url, config=LinkbackConfig()):
+        self.kind = "pingback"
+        self.source_url = source_url
+        self.target_url = target_url
+        self.config = config
+        self.server_uri = None
+    def discover_server_uri(self, resp_content=None, resp_headers=None):
         if resp_content is None:
-            resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+            resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
         # Pingback server autodiscovery:
-        server_uri = resp_headers.get('X-Pingback')
-        if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+        self.server_uri = resp_headers.get('X-Pingback')
+        if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
             # As a falback, we try parsing the HTML, looking for elements
             doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER)
             link = doc_soup.find(rel='pingback', href=True)
             if link:
-                server_uri = link['href']
-        if not server_uri:
-            return False
-        LOGGER.debug("Pingback URI detected: %s", server_uri)
+                self.server_uri = link['href']
+    def send(self):
         # Performing pingback request:
-        transport = SafeXmlRpcTransport(config) if server_uri.startswith('https') else XmlRpcTransport(config)
-        xml_rpc_client = xmlrpc.client.ServerProxy(server_uri, transport)
+        transport = SafeXmlRpcTransport(self.config) if self.server_uri.startswith('https') else XmlRpcTransport(self.config)
+        xml_rpc_client = xmlrpc.client.ServerProxy(self.server_uri, transport)
         try:
-            response = xml_rpc_client.pingback.ping(source_url, target_url)
+            return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
         except xmlrpc.client.Fault as fault:
             if fault.faultCode == 48: # pingback already registered
-                LOGGER.debug("Pingback already registered for URL %s, XML-RPC response: code=%s - %s", target_url, fault.faultCode, fault.faultString)
-            else:
-                LOGGER.error("Pingback XML-RPC request failed for URL %s: code=%s - %s", target_url, fault.faultCode, fault.faultString)
-            return False
-        LOGGER.info("Pingback notification sent for URL %s, endpoint response: %s", target_url, response)
-        return True
-    except (ConnectionError, HTTPError, RequestException, SSLError) as error:
-        LOGGER.error("Failed to send Pingback for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
-        return False
-    except Exception: # unexpected exception => we display the stacktrace:
-        LOGGER.exception("Failed to send Pingback for link url %s", target_url)
-        return False
-
-def send_webmention(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
-    try:
+                raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+            raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+
+class WebmentionNotifier(Notifier):
+    def __init__(self, source_url, target_url, config=LinkbackConfig()):
+        self.kind = "webmention"
+        self.source_url = source_url
+        self.target_url = target_url
+        self.config = config
+        self.server_uri = None
+    def discover_server_uri(self, resp_content=None, resp_headers=None):
         if resp_content is None:
-            resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+            resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
         # WebMention server autodiscovery:
-        server_uri = None
         link_header = resp_headers.get('Link')
         if link_header:
             try:
-                server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
-                                  if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
+                self.server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
+                                       if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
             except StopIteration:
                 pass
-        if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+        if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
             # As a falback, we try parsing the HTML, looking for elements
-            doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factord out of both methods
+            doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factored out of both methods
             for link in doc_soup.find_all(rel=WEBMENTION_POSS_REL, href=True):
                 if link.get('href'):
-                    server_uri = link.get('href')
-        if not server_uri:
-            return False
-        LOGGER.debug("WebMention URI detected: %s", server_uri)
-        server_uri = urljoin(target_url, server_uri)
+                    self.server_uri = link.get('href')
+    def send(self):
         # Performing WebMention request:
-        response = requests.post(server_uri, headers={'User-Agent': config.user_agent}, timeout=config.timeout,
-                                 data={'source': source_url, 'target': target_url}, verify=config.cert_verify)
+        url = urljoin(self.target_url, self.server_uri)
+        response = requests.post(url, headers={'User-Agent': self.config.user_agent}, timeout=self.config.timeout,
+                                 data={'source': self.source_url, 'target': self.target_url}, verify=self.config.cert_verify)
         response.raise_for_status()
-        LOGGER.info("WebMention notification sent for URL %s, endpoint response: %s", target_url, response.text)
-        return True
-    except (ConnectionError, HTTPError, RequestException, SSLError) as error:
-        LOGGER.error("Failed to send WebMention for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
-        return False
-    except Exception: # unexpected exception => we display the stacktrace:
-        LOGGER.exception("Failed to send WebMention for link url %s", target_url)
-        return False
+        return response.text
 
 GET_CHUNK_SIZE = 2**10
@@ -245,3 +344,59 @@ def make_connection(self, host):
 
 def register():
     signals.all_generators_finalized.connect(process_all_articles_linkbacks)
+
+
+def cli(html_filepath):
+    logging.basicConfig(format="%(levelname)s [%(name)s] %(message)s",
+                        datefmt="%H:%M:%S", level=logging.DEBUG)
+    config = LinkbackConfig(os.environ)
+    cache = Cache.load_from_json(config)
+    with nullcontext() if config.cert_verify else warnings.catch_warnings():
+        if not config.cert_verify:
+            warnings.simplefilter('ignore', InsecureRequestWarning)
+        url = basename(html_filepath)
+        slug = url.replace(".html", "")
+        with open(html_filepath, "r+", encoding="utf-8") as html_file:
+            content = html_file.read()
+        LOGGER.debug("Now extracting content from <article> tag...")
...") + content = str(BeautifulSoup(content, BS4_HTML_PARSER).find("article")) + LOGGER.debug("Now processing HTML file with url=%s slug=%s...", url, slug) + successful_notifs_count = process_all_links_of_an_article(config, cache, url, slug, content) + LOGGER.info("Done - Notifications sent: %s", successful_notifs_count) + cache.dump_to_json() + +if __name__ == '__main__': + try: # Optional logs coloring: + from colorama import Back, Fore, Style + # Recipe from: https://chezsoi.org/lucas/blog/colored-logs-in-python.html + class ColorLogsWrapper: + COLOR_MAP = { + 'debug': Fore.CYAN, + 'info': Fore.GREEN, + 'warning': Fore.YELLOW, + 'error': Fore.RED, + 'critical': Back.RED, + } + def __init__(self, logger): + self.logger = logger + def __getattr__(self, attr_name): + if attr_name == 'warn': + attr_name = 'warning' + if attr_name not in 'debug info warning error critical': + return getattr(self.logger, attr_name) + log_level = getattr(logging, attr_name.upper()) + # mimicking logging/__init__.py behaviour + if not self.logger.isEnabledFor(log_level): + return None + def wrapped_attr(msg, *args, **kwargs): + style_prefix = self.COLOR_MAP[attr_name] + msg = style_prefix + msg + Style.RESET_ALL + # We call _.log directly to not increase the callstack + # so that Logger.findCaller extract the corrects filename/lineno + # pylint: disable=protected-access + return self.logger._log(log_level, msg, args, **kwargs) + return wrapped_attr + LOGGER = ColorLogsWrapper(LOGGER) + except ImportError: + print("colorama not available - Logs coloring disabled") + cli(sys.argv[1]) diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py index ce444a7..f30a685 100644 --- a/pelican/plugins/linkbacks/test_linkbacks.py +++ b/pelican/plugins/linkbacks/test_linkbacks.py @@ -1,14 +1,17 @@ -import logging, os +import json, logging, os import httpretty -from pelican.generators import ArticlesGenerator -from pelican.tests.support import get_settings - from linkbacks import ( process_all_articles_linkbacks, + Cache, + LinkbackConfig, CACHE_FILENAME, MAX_RESPONSE_LENGTH, ) +import pytest + +from pelican.generators import ArticlesGenerator +from pelican.tests.support import get_settings CUR_DIR = os.path.dirname(__file__) @@ -33,7 +36,7 @@ def test_cache(tmpdir, caplog): article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir) assert process_all_articles_linkbacks([article_generator]) == 2 assert process_all_articles_linkbacks([article_generator]) == 0 - assert 'Link url http://localhost/sub/some-page.html skipped because it has already been processed (present in cache)' in caplog.text + assert 'Link url http://localhost/sub/some-page.html skipped because it is present in cache with status: ALREADY SUBMITTED' in caplog.text def test_ignore_internal_links(tmpdir, caplog): caplog.set_level(logging.DEBUG) @@ -66,7 +69,7 @@ def test_pingback_http_error(tmpdir, caplog): _setup_http_mocks(pingback=('header', 'http_error'), webmention=()) article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir) assert process_all_articles_linkbacks([article_generator]) == 0 - assert 'Failed to send Pingback for link url http://localhost/sub/some-page.html' in caplog.text + assert 'Failed to send pingback for link url http://localhost/sub/some-page.html' in caplog.text assert '503' in caplog.text @httpretty.activate @@ -78,7 +81,6 @@ def test_pingback_xmlrpc_error(tmpdir, caplog): @httpretty.activate def test_pingback_already_registered(tmpdir, caplog): - 
-    caplog.set_level(logging.DEBUG)
     _setup_http_mocks(pingback=('header', 'already_registered'), webmention=())
     article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
     assert process_all_articles_linkbacks([article_generator]) == 0
@@ -89,7 +91,7 @@ def test_webmention_http_error(tmpdir, caplog):
     _setup_http_mocks(pingback=(), webmention=('header', 'http_error'))
     article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
     assert process_all_articles_linkbacks([article_generator]) == 0
-    assert 'Failed to send WebMention for link url http://localhost/sub/some-page.html' in caplog.text
+    assert 'Failed to send webmention for link url http://localhost/sub/some-page.html' in caplog.text
     assert '503' in caplog.text
 
 @httpretty.activate
@@ -188,3 +190,33 @@ def _setup_cache_dir(cache_dir_path):
         os.remove(os.path.join(cache_dir_path, CACHE_FILENAME))
     except FileNotFoundError:
         pass
+
+def test_cache_load_old_format(tmpdir):
+    with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+        json.dump({
+            "festival-meujeuteries-merveilles": [
+                "https://laubergedesreveurs.fr/festival-meujeuterie-et-merveilles/"
+            ]
+        }, cache_file)
+    config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+    with pytest.raises(EnvironmentError) as error:
+        Cache.load_from_json(config)
+    assert "Old cache format detected" in str(error)
+
+def test_cache_load_new_format(tmpdir):
+    with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+        json.dump({
+            "more-amazing-creative-commons-artists": {
+                "https://creativecommons.org/share-your-work/cclicenses/": {
+                    "pingback": {
+                        "error": "No pingback URI found"
+                    },
+                    "webmention": {
+                        "error": "No webmention URI found"
+                    }
+                }
+            }
+        }, cache_file)
+    config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+    cache = Cache.load_from_json(config)
+    assert cache.get_status("more-amazing-creative-commons-artists", "https://creativecommons.org/share-your-work/cclicenses/") == "No pingback URI found"

From 98661a01893827ec4ac19c8718066048bbac17fa Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 12:50:51 +0100
Subject: [PATCH 2/2] New configuration setting LINKBACKS_IGNORED_URLS_PATTERN

---
 .pylintrc                                   |  2 +-
 CHANGELOG.md                                |  1 +
 pelican/plugins/linkbacks/linkbacks.py      | 37 ++++++++++++++-------
 pelican/plugins/linkbacks/test_linkbacks.py |  9 +++--
 4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 451a7b8..9ecf672 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
 [MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-branches, too-many-locals, too-many-positional-arguments, wrong-import-order
 
 [FORMAT]
 max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 573207e..e8b2689 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ### Added
 
+- new configuration setting `LINKBACKS_IGNORED_URLS_PATTERN` to define URLs that should never be considered for linkbacks (_e.g._ `youtube.com`)
 - manual execution mode: `python linkbacks.py $pelican_generated_html_file`
 
 ### Changed
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index c2f365d..c44ec0b 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -9,6 +9,7 @@
 import json
 import logging
 import os
+import re
 import sys
 from os import makedirs
 from os.path import basename, splitext
@@ -32,6 +33,8 @@ DEFAULT_USER_AGENT = 'pelican-plugin-linkbacks'
 DEFAULT_CERT_VERIFY = True
 DEFAULT_TIMEOUT = 3
+DEFAULT_IGNORED_URLS_PATTERN = 'artstation.com|deviantart.com|github.com|github.io|itch.io|readthedocs.io|youtube.com|wikipedia.org'
+IMAGE_EXTENSIONS = ('.gif', '.jpg', '.pdf', '.png', '.svg')
 WEBMENTION_POSS_REL = ('webmention', 'http://webmention.org', 'http://webmention.org/', 'https://webmention.org', 'https://webmention.org/')
 
 LOGGER = logging.getLogger(__name__)
 
@@ -79,9 +82,12 @@ def process_all_links_of_an_article(config, cache, url, slug, content):
         if config.siteurl and link_url.startswith(config.siteurl):
             LOGGER.debug("Link url %s skipped because is starts with %s", link_url, config.siteurl)
             continue
-        if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
+        if splitext(link_url)[1] in IMAGE_EXTENSIONS:
             LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
             continue
+        if config.ignored_urls_pattern.search(link_url):
+            LOGGER.debug("Link url %s skipped because it matches the ignored URLs pattern", link_url)
+            continue
         cache_status = cache.get_status(slug, link_url)
         if cache_status:
             LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
@@ -104,9 +110,9 @@
                     continue
                 response = notifier.send()
                 LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
-                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
+                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri, response)
                 successful_notifs_count += 1
-            except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+            except (ConnectionError, HTTPError, NotifierError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
                 LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
                 cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
             except Exception as error: # unexpected exception => we display the stacktrace:
@@ -128,6 +134,9 @@ def __init__(self, settings=None):
         self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
         self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
         self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+        self.ignored_urls_pattern = settings.get('LINKBACKS_IGNORED_URLS_PATTERN', DEFAULT_IGNORED_URLS_PATTERN)
+        if self.ignored_urls_pattern and isinstance(self.ignored_urls_pattern, str):
+            self.ignored_urls_pattern = re.compile(self.ignored_urls_pattern)
 
 class Cache:
     def __init__(self, config, data):
@@ -137,12 +146,14 @@ def __init__(self, config, data):
         #     $article_slug: {
         #         $link_url: {
         #             "pingback": {
+        #                 "error": // string or null if successful
+        #                 "response": // string or null if failed
         #                 "server_uri": "http...", // optional string
-        #                 "error": // string or null if successfull
         #             },
         #             "webmention": {
+        #                 "error": // string or null if successful
+        #                 "response": // string or null if failed
         #                 "server_uri": "http...", // optional string
-        #                 "error": // string or null if successfull
         #             }
         #         },
         #         ...
@@ -151,13 +162,14 @@ def __init__(self, config, data):
         # }
         self.data = defaultdict(dict)
         self.data.update(data)
-    def add_success(self, article_slug, link_url, kind, server_uri):
+    def add_success(self, article_slug, link_url, kind, server_uri, response):
         article_links = self.data[article_slug]
         link_status = article_links.get(link_url)
         if link_status is None:
             link_status = {}
             article_links[link_url] = link_status
         link_status[kind] = {
+            "response": response,
             "server_uri": server_uri
         }
     def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
@@ -186,11 +198,9 @@ def get_status(self, article_slug, link_url):
             return None # defensive, should never happen
         # For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
         # Later on, we could for example consider retrying on HTTP 5XX errors.
-        pingback_error = pingback_status.get("error")
-        webmention_error = webmention_status.get("error")
-        if pingback_error is None or webmention_error is None:
+        if pingback_status.get("response") or webmention_status.get("response"):
             return "ALREADY SUBMITTED"
-        return pingback_error or webmention_error
+        return pingback_status.get("error") or webmention_status.get("error")
     def links_count(self):
         return sum(len(url_statuses) for url_statuses in self.data.values())
     @classmethod
@@ -227,6 +237,9 @@ def discover_server_uri(self):
     def send(self):
         "Sends the actual notification."
 
+class NotifierError(RuntimeError):
+    pass
+
 class PingbackNotifier(Notifier):
     def __init__(self, source_url, target_url, config=LinkbackConfig()):
         self.kind = "pingback"
@@ -253,8 +266,8 @@ def send(self):
             return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
         except xmlrpc.client.Fault as fault:
             if fault.faultCode == 48: # pingback already registered
-                raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
-            raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+                raise NotifierError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+                raise NotifierError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
 
 class WebmentionNotifier(Notifier):
     def __init__(self, source_url, target_url, config=LinkbackConfig()):
diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py
index f30a685..69a9c54 100644
--- a/pelican/plugins/linkbacks/test_linkbacks.py
+++ b/pelican/plugins/linkbacks/test_linkbacks.py
@@ -1,6 +1,10 @@
 import json, logging, os
 
 import httpretty
+from pelican.generators import ArticlesGenerator
+from pelican.tests.support import get_settings
+import pytest
+
 from linkbacks import (
     process_all_articles_linkbacks,
     Cache,
@@ -8,11 +12,6 @@ from linkbacks import (
     CACHE_FILENAME,
     MAX_RESPONSE_LENGTH,
 )
-import pytest
-
-from pelican.generators import ArticlesGenerator
-from pelican.tests.support import get_settings
-
 
 CUR_DIR = os.path.dirname(__file__)
 TEST_CONTENT_DIR = os.path.join(CUR_DIR, 'test_content')