From 40fdc972b48ff47b3d29ace275df55b5626aed38 Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:54:52 +0100
Subject: [PATCH 1/2] JSON cache structure evolved to now store the linkbacks
requests status - fix #2
---
.pylintrc | 2 +-
CHANGELOG.md | 12 +
README.md | 11 +-
pelican/plugins/linkbacks/linkbacks.py | 327 +++++++++++++++-----
pelican/plugins/linkbacks/test_linkbacks.py | 48 ++-
5 files changed, 304 insertions(+), 96 deletions(-)
diff --git a/.pylintrc b/.pylintrc
index 9f1aaa0..451a7b8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
[MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
[FORMAT]
max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6305911..573207e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).
+## [1.0.6] - Not released yet
+
+### Added
+
+- manual execution mode: `python linkbacks.py $pelican_generated_html_file`
+
+### Changed
+
+- JSON cache structure evolved to now store the status of linkback requests.
+ This is **not** backward-compatible, and the plugin will ask you to remove any pre-existing `pelican-plugin-linkbacks.json` file.
+ _cf._ [issue #2](https://github.com/pelican-plugins/linkbacks/issues/2)
+
## [1.0.5] - 2025-12-22
### Added
diff --git a/README.md b/README.md
index a0d0f6a..8d1080f 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,15 @@ where `$CACHE_PATH` is [a Pelican setting](https://docs.getpelican.com/en/latest
time in seconds allowed for each HTTP linkback request before abandon
+## Manual execution
+The `linkbacks.py` module can be used as a script to test this plugin's behavior:
+
+ export SITEURL=...
+ python path/to/pelican/plugins/linkbacks/linkbacks.py $pelican_generated_html_file
+
+Optionally, the `colorama` package can be installed to get colored log output.
+
+
## Contributing
Contributions are welcome and much appreciated. Every little bit helps. You can contribute by improving the documentation,
@@ -99,7 +108,7 @@ With a valid configuration in `~/.config/pypoetry/`:
## Linter & tests
To execute them:
- pylint *linkbacks.py
+ pylint pelican/plugins/linkbacks/
pytest
### Integration tests
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index 8e6e202..c2f365d 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -1,3 +1,5 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
from contextlib import closing
try:
from contextlib import nullcontext
@@ -7,8 +9,9 @@
import json
import logging
import os
+import sys
from os import makedirs
-from os.path import splitext
+from os.path import basename, splitext
from ssl import CERT_NONE, SSLError
import xmlrpc.client
import warnings
@@ -44,14 +47,9 @@ def process_all_articles_linkbacks(generators):
article_generator = next(g for g in generators if isinstance(g, ArticlesGenerator))
config = LinkbackConfig(article_generator.settings)
+ cache = Cache.load_from_json(config)
- try:
- with open(config.cache_filepath, encoding='utf8') as cache_file:
- cache = json.load(cache_file)
- except FileNotFoundError:
- cache = {}
-
- original_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+ original_cache_links_count = cache.links_count()
successful_notifs_count = 0
try:
for article in article_generator.articles:
@@ -59,37 +57,19 @@ def process_all_articles_linkbacks(generators):
with nullcontext() if config.cert_verify else warnings.catch_warnings():
if not config.cert_verify:
warnings.simplefilter('ignore', InsecureRequestWarning)
- successful_notifs_count += process_all_links_of_an_article(article, cache, config)
+ successful_notifs_count += process_all_links_of_an_article(config, cache, article.url, article.slug, article.content)
return successful_notifs_count
finally: # We save the cache & log our progress even in case of an interruption:
- with open(config.cache_filepath, 'w+', encoding='utf8') as cache_file:
- json.dump(cache, cache_file)
- new_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+ cache.dump_to_json()
LOGGER.info("Linkback plugin execution took: %s - Links processed & inserted in cache: %s - Successful notifications: %s",
- datetime.now() - start_time, new_cache_links_count - original_cache_links_count, successful_notifs_count)
-
-class LinkbackConfig:
- def __init__(self, settings=None):
- if settings is None:
- settings = {}
- self.siteurl = settings.get('SITEURL', '')
- self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
- if not self.cache_filepath:
- cache_dir = settings.get('CACHE_PATH', '')
- self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
- if cache_dir:
- makedirs(cache_dir, exist_ok=True)
- self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
- self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
- self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+ datetime.now() - start_time, cache.links_count() - original_cache_links_count, successful_notifs_count)
-def process_all_links_of_an_article(article, cache, config):
- source_url = os.path.join(config.siteurl, article.url)
+def process_all_links_of_an_article(config, cache, url, slug, content):
+ source_url = os.path.join(config.siteurl, url)
successful_notifs_count = 0
- links_cache = set(cache.get(article.slug, []))
# Even if an entry exists in the cache, we always extract all links,
# in order to support articles edits that could add new links.
- doc_soup = BeautifulSoup(article.content, BS4_HTML_PARSER)
+ doc_soup = BeautifulSoup(content, BS4_HTML_PARSER)
for anchor in doc_soup('a'):
if 'href' not in anchor.attrs:
continue
@@ -102,92 +82,211 @@ def process_all_links_of_an_article(article, cache, config):
if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
continue
- if link_url in links_cache:
- LOGGER.debug("Link url %s skipped because it has already been processed (present in cache)", link_url)
+ cache_status = cache.get_status(slug, link_url)
+ if cache_status:
+ LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
continue
LOGGER.debug("Now attempting to send Linkbacks for link url %s", link_url)
try:
resp_content, resp_headers = requests_get_with_max_size(link_url, config)
except Exception as error:
LOGGER.debug("Failed to retrieve web page for link url %s: [%s] %s", link_url, error.__class__.__name__, error)
+ cache.add_failure(slug, link_url, error)
continue
- for notifier in (send_pingback, send_webmention):
- if notifier(source_url, link_url, config, resp_content, resp_headers):
+ for notifier_class in (PingbackNotifier, WebmentionNotifier):
+ try:
+ notifier = notifier_class(source_url, link_url, config)
+ notifier.discover_server_uri(resp_content, resp_headers)
+ if notifier.server_uri:
+ LOGGER.debug("%s URI detected: %s", notifier.kind, notifier.server_uri)
+ else:
+ cache.add_failure(slug, link_url, f"No {notifier.kind} URI found", notifier.kind)
+ continue
+ response = notifier.send()
+ LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
+ cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
successful_notifs_count += 1
- links_cache.add(link_url)
- cache[article.slug] = list(links_cache)
+ except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+ LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
+ cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
+ except Exception as error: # unexpected exception => we display the stacktrace:
+ LOGGER.exception("Failed to send %s for link url %s", notifier.kind, link_url)
+ cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
return successful_notifs_count
-def send_pingback(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
- try:
+class LinkbackConfig:
+ def __init__(self, settings=None):
+ if settings is None:
+ settings = {}
+ self.siteurl = settings.get('SITEURL', '')
+ self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
+ if not self.cache_filepath:
+ cache_dir = settings.get('CACHE_PATH', '')
+ self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
+ if cache_dir:
+ makedirs(cache_dir, exist_ok=True)
+ self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
+ self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
+ self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
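+ # Example pelicanconf.py overrides (hypothetical values, shown for illustration only;
+ # the setting names are the ones read above):
+ #   LINKBACKS_CACHEPATH = 'cache/pelican-plugin-linkbacks.json'
+ #   LINKBACKS_CERT_VERIFY = False
+ #   LINKBACKS_REQUEST_TIMEOUT = 5
+ #   LINKBACKS_USERAGENT = 'my-blog (pelican-plugin-linkbacks)'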
+
+class Cache:
+ def __init__(self, config, data):
+ self.cache_filepath = config.cache_filepath
+ # Cache structure:
+ # {
+ # $article_slug: {
+ # $link_url: {
+ # "pingback": {
+ # "server_uri": "http...", // optional string
+ # "error": // string or null if successfull
+ # },
+ # "webmention": {
+ # "server_uri": "http...", // optional string
+ # "error": // string or null if successfull
+ # }
+ # },
+ # ...
+ # },
+ # ...
+ # }
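+ # Example entry (hypothetical values, for illustration only): a successful pingback
+ # stores the endpoint, a failed webmention stores the error string:
+ #   {"my-article-slug": {"https://example.org/some-page/": {
+ #       "pingback": {"server_uri": "https://example.org/xmlrpc.php"},
+ #       "webmention": {"error": "No webmention URI found"}}}}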
+ self.data = defaultdict(dict)
+ self.data.update(data)
+ def add_success(self, article_slug, link_url, kind, server_uri):
+ article_links = self.data[article_slug]
+ link_status = article_links.get(link_url)
+ if link_status is None:
+ link_status = {}
+ article_links[link_url] = link_status
+ link_status[kind] = {
+ "server_uri": server_uri
+ }
+ def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
+ article_links = self.data[article_slug]
+ link_status = article_links.get(link_url)
+ if link_status is None:
+ link_status = {}
+ article_links[link_url] = link_status
+ kinds = [notifier_kind] if notifier_kind else ["pingback", "webmention"]
+ for kind in kinds:
+ status = {
+ "error": error if isinstance(error, str) else f"[{error.__class__.__name__}] {error}"
+ }
+ if server_uri:
+ status["server_uri"] = server_uri
+ link_status[kind] = status
+ def get_status(self, article_slug, link_url):
+ "Return None if a notification should be sent; otherwise return the reason why it should be skipped"
+ article_links = self.data[article_slug]
+ link_status = article_links.get(link_url)
+ if link_status is None:
+ return None # link not processed yet
+ pingback_status = link_status.get("pingback")
+ webmention_status = link_status.get("webmention")
+ if pingback_status is None or webmention_status is None:
+ return None # defensive, should never happen
+ # For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
+ # Later on, we could for example consider retrying on HTTP 5XX errors.
+ pingback_error = pingback_status.get("error")
+ webmention_error = webmention_status.get("error")
+ if pingback_error is None or webmention_error is None:
+ return "ALREADY SUBMITTED"
+ return pingback_error or webmention_error
+ def links_count(self):
+ return sum(len(url_statuses) for url_statuses in self.data.values())
+ @classmethod
+ def load_from_json(cls, config):
+ try:
+ with open(config.cache_filepath, encoding='utf8') as cache_file:
+ data = json.load(cache_file)
+ except FileNotFoundError:
+ data = {}
+ is_old_cache = data and isinstance(list(data.values())[0], list)
+ if is_old_cache:
+ raise EnvironmentError(
+ f"Old cache format detected in {config.cache_filepath}: please remove this file before publishing your website."
+ " All linkbacks will be processed again on next pelican execution.",
+ )
+ return cls(config, data)
+ def dump_to_json(self):
+ with open(self.cache_filepath, 'w+', encoding='utf8') as cache_file:
+ json.dump(self.data, cache_file)
+
+class Notifier(ABC):
+ """
+ Public properties:
+ * kind: 'pingback' or 'webmention'
+ * server_uri: URL of the notification endpoint
+ """
+ @abstractmethod
+ def discover_server_uri(self, resp_content=None, resp_headers=None):
+ """
+ Sets .server_uri if a notification endpoint is found for target_url.
+ Must be called before calling send().
+ """
+ @abstractmethod
+ def send(self):
+ "Sends the actual notification."
+
+class PingbackNotifier(Notifier):
+ def __init__(self, source_url, target_url, config=LinkbackConfig()):
+ self.kind = "pingback"
+ self.source_url = source_url
+ self.target_url = target_url
+ self.config = config
+ self.server_uri = None
+ def discover_server_uri(self, resp_content=None, resp_headers=None):
if resp_content is None:
- resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+ resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
# Pingback server autodiscovery:
- server_uri = resp_headers.get('X-Pingback')
- if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+ self.server_uri = resp_headers.get('X-Pingback')
+ if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
# As a fallback, we try parsing the HTML, looking for <link> elements
doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER)
link = doc_soup.find(rel='pingback', href=True)
if link:
- server_uri = link['href']
- if not server_uri:
- return False
- LOGGER.debug("Pingback URI detected: %s", server_uri)
+ self.server_uri = link['href']
+ def send(self):
# Performing pingback request:
- transport = SafeXmlRpcTransport(config) if server_uri.startswith('https') else XmlRpcTransport(config)
- xml_rpc_client = xmlrpc.client.ServerProxy(server_uri, transport)
+ transport = SafeXmlRpcTransport(self.config) if self.server_uri.startswith('https') else XmlRpcTransport(self.config)
+ xml_rpc_client = xmlrpc.client.ServerProxy(self.server_uri, transport)
try:
- response = xml_rpc_client.pingback.ping(source_url, target_url)
+ return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
except xmlrpc.client.Fault as fault:
if fault.faultCode == 48: # pingback already registered
- LOGGER.debug("Pingback already registered for URL %s, XML-RPC response: code=%s - %s", target_url, fault.faultCode, fault.faultString)
- else:
- LOGGER.error("Pingback XML-RPC request failed for URL %s: code=%s - %s", target_url, fault.faultCode, fault.faultString)
- return False
- LOGGER.info("Pingback notification sent for URL %s, endpoint response: %s", target_url, response)
- return True
- except (ConnectionError, HTTPError, RequestException, SSLError) as error:
- LOGGER.error("Failed to send Pingback for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
- return False
- except Exception: # unexpected exception => we display the stacktrace:
- LOGGER.exception("Failed to send Pingback for link url %s", target_url)
- return False
-
-def send_webmention(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
- try:
+ raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+ raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+
+class WebmentionNotifier(Notifier):
+ def __init__(self, source_url, target_url, config=LinkbackConfig()):
+ self.kind = "webmention"
+ self.source_url = source_url
+ self.target_url = target_url
+ self.config = config
+ self.server_uri = None
+ def discover_server_uri(self, resp_content=None, resp_headers=None):
if resp_content is None:
- resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+ resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
# WebMention server autodiscovery:
- server_uri = None
link_header = resp_headers.get('Link')
if link_header:
try:
- server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
- if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
+ self.server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
+ if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
except StopIteration:
pass
- if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+ if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
# As a fallback, we try parsing the HTML, looking for <link> elements
- doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factord out of both methods
+ doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factored out of both methods
for link in doc_soup.find_all(rel=WEBMENTION_POSS_REL, href=True):
if link.get('href'):
- server_uri = link.get('href')
- if not server_uri:
- return False
- LOGGER.debug("WebMention URI detected: %s", server_uri)
- server_uri = urljoin(target_url, server_uri)
+ self.server_uri = link.get('href')
+ def send(self):
# Performing WebMention request:
- response = requests.post(server_uri, headers={'User-Agent': config.user_agent}, timeout=config.timeout,
- data={'source': source_url, 'target': target_url}, verify=config.cert_verify)
+ url = urljoin(self.target_url, self.server_uri)
+ response = requests.post(url, headers={'User-Agent': self.config.user_agent}, timeout=self.config.timeout,
+ data={'source': self.source_url, 'target': self.target_url}, verify=self.config.cert_verify)
response.raise_for_status()
- LOGGER.info("WebMention notification sent for URL %s, endpoint response: %s", target_url, response.text)
- return True
- except (ConnectionError, HTTPError, RequestException, SSLError) as error:
- LOGGER.error("Failed to send WebMention for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
- return False
- except Exception: # unexpected exception => we display the stacktrace:
- LOGGER.exception("Failed to send WebMention for link url %s", target_url)
- return False
+ return response.text
GET_CHUNK_SIZE = 2**10
@@ -245,3 +344,59 @@ def make_connection(self, host):
def register():
signals.all_generators_finalized.connect(process_all_articles_linkbacks)
+
+
+def cli(html_filepath):
+ logging.basicConfig(format="%(levelname)s [%(name)s] %(message)s",
+ datefmt="%H:%M:%S", level=logging.DEBUG)
+ config = LinkbackConfig(os.environ)
+ cache = Cache.load_from_json(config)
+ with nullcontext() if config.cert_verify else warnings.catch_warnings():
+ if not config.cert_verify:
+ warnings.simplefilter('ignore', InsecureRequestWarning)
+ url = basename(html_filepath)
+ slug = url.replace(".html", "")
+ with open(html_filepath, "r+", encoding="utf-8") as html_file:
+ content = html_file.read()
+ LOGGER.debug("Now extracting content from tag ...")
+ content = str(BeautifulSoup(content, BS4_HTML_PARSER).find("article"))
+ LOGGER.debug("Now processing HTML file with url=%s slug=%s...", url, slug)
+ successful_notifs_count = process_all_links_of_an_article(config, cache, url, slug, content)
+ LOGGER.info("Done - Notifications sent: %s", successful_notifs_count)
+ cache.dump_to_json()
+
+if __name__ == '__main__':
+ try: # Optional logs coloring:
+ from colorama import Back, Fore, Style
+ # Recipe from: https://chezsoi.org/lucas/blog/colored-logs-in-python.html
+ class ColorLogsWrapper:
+ COLOR_MAP = {
+ 'debug': Fore.CYAN,
+ 'info': Fore.GREEN,
+ 'warning': Fore.YELLOW,
+ 'error': Fore.RED,
+ 'critical': Back.RED,
+ }
+ def __init__(self, logger):
+ self.logger = logger
+ def __getattr__(self, attr_name):
+ if attr_name == 'warn':
+ attr_name = 'warning'
+ if attr_name not in ('debug', 'info', 'warning', 'error', 'critical'):
+ return getattr(self.logger, attr_name)
+ log_level = getattr(logging, attr_name.upper())
+ # mimicking logging/__init__.py behaviour
+ if not self.logger.isEnabledFor(log_level):
+ return lambda *_args, **_kwargs: None  # no-op callable, so LOGGER.debug(...) stays safe when the level is disabled
+ def wrapped_attr(msg, *args, **kwargs):
+ style_prefix = self.COLOR_MAP[attr_name]
+ msg = style_prefix + msg + Style.RESET_ALL
+ # We call ._log directly to not increase the callstack,
+ # so that Logger.findCaller extracts the correct filename/lineno
+ # pylint: disable=protected-access
+ return self.logger._log(log_level, msg, args, **kwargs)
+ return wrapped_attr
+ LOGGER = ColorLogsWrapper(LOGGER)
+ except ImportError:
+ print("colorama not available - Logs coloring disabled")
+ cli(sys.argv[1])
diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py
index ce444a7..f30a685 100644
--- a/pelican/plugins/linkbacks/test_linkbacks.py
+++ b/pelican/plugins/linkbacks/test_linkbacks.py
@@ -1,14 +1,17 @@
-import logging, os
+import json, logging, os
import httpretty
-from pelican.generators import ArticlesGenerator
-from pelican.tests.support import get_settings
-
from linkbacks import (
process_all_articles_linkbacks,
+ Cache,
+ LinkbackConfig,
CACHE_FILENAME,
MAX_RESPONSE_LENGTH,
)
+import pytest
+
+from pelican.generators import ArticlesGenerator
+from pelican.tests.support import get_settings
CUR_DIR = os.path.dirname(__file__)
@@ -33,7 +36,7 @@ def test_cache(tmpdir, caplog):
article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
assert process_all_articles_linkbacks([article_generator]) == 2
assert process_all_articles_linkbacks([article_generator]) == 0
- assert 'Link url http://localhost/sub/some-page.html skipped because it has already been processed (present in cache)' in caplog.text
+ assert 'Link url http://localhost/sub/some-page.html skipped because it is present in cache with status: ALREADY SUBMITTED' in caplog.text
def test_ignore_internal_links(tmpdir, caplog):
caplog.set_level(logging.DEBUG)
@@ -66,7 +69,7 @@ def test_pingback_http_error(tmpdir, caplog):
_setup_http_mocks(pingback=('header', 'http_error'), webmention=())
article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
assert process_all_articles_linkbacks([article_generator]) == 0
- assert 'Failed to send Pingback for link url http://localhost/sub/some-page.html' in caplog.text
+ assert 'Failed to send pingback for link url http://localhost/sub/some-page.html' in caplog.text
assert '503' in caplog.text
@httpretty.activate
@@ -78,7 +81,6 @@ def test_pingback_xmlrpc_error(tmpdir, caplog):
@httpretty.activate
def test_pingback_already_registered(tmpdir, caplog):
- caplog.set_level(logging.DEBUG)
_setup_http_mocks(pingback=('header', 'already_registered'), webmention=())
article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
assert process_all_articles_linkbacks([article_generator]) == 0
@@ -89,7 +91,7 @@ def test_webmention_http_error(tmpdir, caplog):
_setup_http_mocks(pingback=(), webmention=('header', 'http_error'))
article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
assert process_all_articles_linkbacks([article_generator]) == 0
- assert 'Failed to send WebMention for link url http://localhost/sub/some-page.html' in caplog.text
+ assert 'Failed to send webmention for link url http://localhost/sub/some-page.html' in caplog.text
assert '503' in caplog.text
@httpretty.activate
@@ -188,3 +190,33 @@ def _setup_cache_dir(cache_dir_path):
os.remove(os.path.join(cache_dir_path, CACHE_FILENAME))
except FileNotFoundError:
pass
+
+def test_cache_load_old_format(tmpdir):
+ with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+ json.dump({
+ "festival-meujeuteries-merveilles": [
+ "https://laubergedesreveurs.fr/festival-meujeuterie-et-merveilles/"
+ ]
+ }, cache_file)
+ config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+ with pytest.raises(EnvironmentError) as error:
+ Cache.load_from_json(config)
+ assert "Old cache format detected" in str(error)
+
+def test_cache_load_new_format(tmpdir):
+ with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+ json.dump({
+ "more-amazing-creative-commons-artists": {
+ "https://creativecommons.org/share-your-work/cclicenses/": {
+ "pingback": {
+ "error": "No pingback URI found"
+ },
+ "webmention": {
+ "error": "No webmention URI found"
+ }
+ }
+ }
+ }, cache_file)
+ config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+ cache = Cache.load_from_json(config)
+ assert cache.get_status("more-amazing-creative-commons-artists", "https://creativecommons.org/share-your-work/cclicenses/") == "No pingback URI found"
From 98661a01893827ec4ac19c8718066048bbac17fa Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 12:50:51 +0100
Subject: [PATCH 2/2] New configuration setting LINKBACKS_IGNORED_URLS_PATTERN
---
.pylintrc | 2 +-
CHANGELOG.md | 1 +
pelican/plugins/linkbacks/linkbacks.py | 37 ++++++++++++++-------
pelican/plugins/linkbacks/test_linkbacks.py | 9 +++--
4 files changed, 31 insertions(+), 18 deletions(-)
diff --git a/.pylintrc b/.pylintrc
index 451a7b8..9ecf672 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
[MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-branches, too-many-locals, too-many-positional-arguments, wrong-import-order
[FORMAT]
max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 573207e..e8b2689 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
### Added
+- new configuration setting `LINKBACKS_IGNORED_URLS_PATTERN` to define some URLs that should never be considered for linkbacks (_e.g._ `youtube.com`)
- manual execution mode: `python linkbacks.py $pelican_generated_html_file`
### Changed
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index c2f365d..c44ec0b 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -9,6 +9,7 @@
import json
import logging
import os
+import re
import sys
from os import makedirs
from os.path import basename, splitext
@@ -32,6 +33,8 @@
DEFAULT_USER_AGENT = 'pelican-plugin-linkbacks'
DEFAULT_CERT_VERIFY = True
DEFAULT_TIMEOUT = 3
+DEFAULT_IGNORED_URLS_PATTERN = 'artstation.com|deviantart.com|github.com|github.io|itch.io|readthedocs.io|youtube.com|wikipedia.org'
+IMAGE_EXTENSIONS = ('.gif', '.jpg', '.pdf', '.png', '.svg')
WEBMENTION_POSS_REL = ('webmention', 'http://webmention.org', 'http://webmention.org/', 'https://webmention.org', 'https://webmention.org/')
LOGGER = logging.getLogger(__name__)
@@ -79,9 +82,12 @@ def process_all_links_of_an_article(config, cache, url, slug, content):
if config.siteurl and link_url.startswith(config.siteurl):
LOGGER.debug("Link url %s skipped because is starts with %s", link_url, config.siteurl)
continue
- if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
+ if splitext(link_url)[1] in IMAGE_EXTENSIONS:
LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
continue
+ if config.ignored_urls_pattern.search(link_url):
+ LOGGER.debug("Link url %s skipped because it matches the ignored URLs pattern", link_url)
+ continue
cache_status = cache.get_status(slug, link_url)
if cache_status:
LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
@@ -104,9 +110,9 @@ def process_all_links_of_an_article(config, cache, url, slug, content):
continue
response = notifier.send()
LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
- cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
+ cache.add_success(slug, link_url, notifier.kind, notifier.server_uri, response)
successful_notifs_count += 1
- except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+ except (ConnectionError, HTTPError, NotifierError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
except Exception as error: # unexpected exception => we display the stacktrace:
@@ -128,6 +134,9 @@ def __init__(self, settings=None):
self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+ self.ignored_urls_pattern = settings.get('LINKBACKS_IGNORED_URLS_PATTERN', DEFAULT_IGNORED_URLS_PATTERN)
+ if self.ignored_urls_pattern and isinstance(self.ignored_urls_pattern, str):
+ self.ignored_urls_pattern = re.compile(self.ignored_urls_pattern)
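+ # Example override in pelicanconf.py (hypothetical value), given as a regular expression:
+ #   LINKBACKS_IGNORED_URLS_PATTERN = 'twitter.com|youtube.com'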
class Cache:
def __init__(self, config, data):
@@ -137,12 +146,14 @@ def __init__(self, config, data):
# $article_slug: {
# $link_url: {
# "pingback": {
+ # "error": // string or null if successful
+ # "response": // string or null if failed
# "server_uri": "http...", // optional string
- # "error": // string or null if successfull
# },
# "webmention": {
+ # "error": // string or null if successful
+ # "response": // string or null if failed
# "server_uri": "http...", // optional string
- # "error": // string or null if successfull
# }
# },
# ...
@@ -151,13 +162,14 @@ def __init__(self, config, data):
# }
self.data = defaultdict(dict)
self.data.update(data)
- def add_success(self, article_slug, link_url, kind, server_uri):
+ def add_success(self, article_slug, link_url, kind, server_uri, response):
article_links = self.data[article_slug]
link_status = article_links.get(link_url)
if link_status is None:
link_status = {}
article_links[link_url] = link_status
link_status[kind] = {
+ "response": response,
"server_uri": server_uri
}
def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
@@ -186,11 +198,9 @@ def get_status(self, article_slug, link_url):
return None # defensive, should never happen
# For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
# Later on, we could for example consider retrying on HTTP 5XX errors.
- pingback_error = pingback_status.get("error")
- webmention_error = webmention_status.get("error")
- if pingback_error is None or webmention_error is None:
+ if pingback_status.get("response") or webmention_status.get("response"):
return "ALREADY SUBMITTED"
- return pingback_error or webmention_error
+ return pingback_status.get("error") or webmention_status.get("error")
def links_count(self):
return sum(len(url_statuses) for url_statuses in self.data.values())
@classmethod
@@ -227,6 +237,9 @@ def discover_server_uri(self):
def send(self):
"Sends the actual notification."
+class NotifierError(RuntimeError):
+ pass
+
class PingbackNotifier(Notifier):
def __init__(self, source_url, target_url, config=LinkbackConfig()):
self.kind = "pingback"
@@ -253,8 +266,8 @@ def send(self):
return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
except xmlrpc.client.Fault as fault:
if fault.faultCode == 48: # pingback already registered
- raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
- raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+ raise NotifierError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+ raise NotifierError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
class WebmentionNotifier(Notifier):
def __init__(self, source_url, target_url, config=LinkbackConfig()):
diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py
index f30a685..69a9c54 100644
--- a/pelican/plugins/linkbacks/test_linkbacks.py
+++ b/pelican/plugins/linkbacks/test_linkbacks.py
@@ -1,6 +1,10 @@
import json, logging, os
import httpretty
+from pelican.generators import ArticlesGenerator
+from pelican.tests.support import get_settings
+import pytest
+
from linkbacks import (
process_all_articles_linkbacks,
Cache,
@@ -8,11 +12,6 @@
CACHE_FILENAME,
MAX_RESPONSE_LENGTH,
)
-import pytest
-
-from pelican.generators import ArticlesGenerator
-from pelican.tests.support import get_settings
-
CUR_DIR = os.path.dirname(__file__)
TEST_CONTENT_DIR = os.path.join(CUR_DIR, 'test_content')