From 40fdc972b48ff47b3d29ace275df55b5626aed38 Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 10:54:52 +0100
Subject: [PATCH 1/2] JSON cache structure evolved to now store the linkback
 requests status - fix #2

---
 .pylintrc                                   |   2 +-
 CHANGELOG.md                                |  12 +
 README.md                                   |  11 +-
 pelican/plugins/linkbacks/linkbacks.py      | 327 +++++++++++++++-----
 pelican/plugins/linkbacks/test_linkbacks.py |  48 ++-
 5 files changed, 304 insertions(+), 96 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 9f1aaa0..451a7b8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
 [MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
 
 [FORMAT]
 max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6305911..573207e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## [1.0.6] - Not released yet
+
+### Added
+
+- manual execution mode: `python linkbacks.py $pelican_generated_html_file`
+
+### Changed
+
+- JSON cache structure evolved to now store the status of linkback requests.
+  This is **not** backward-compatible, and the plugin will ask you to remove any pre-existing `pelican-plugin-linkbacks.json` file.
+  _cf._ [issue #2](https://github.com/pelican-plugins/linkbacks/issues/2)
+
 ## [1.0.5] - 2025-12-22
 
 ### Added
diff --git a/README.md b/README.md
index a0d0f6a..8d1080f 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,15 @@ where `$CACHE_PATH` is [a Pelican setting](https://docs.getpelican.com/en/latest
 
     time in seconds allowed for each HTTP linkback request before abandon
 
+## Manual execution
+The `linkbacks.py` module can be used as a script to test this plugin's behavior:
+
+    export SITEURL=...
+    python path/to/pelican/plugins/linkbacks/linkbacks.py $pelican_generated_html_file
+
+Optionally, the `colorama` package can be installed to get colored logs in the output.
+
+
 ## Contributing
 Contributions are welcome and much appreciated. Every little bit helps.
 You can contribute by improving the documentation,
@@ -99,7 +108,7 @@ With a valid configuration in `~/.config/pypoetry/`:
 ## Linter & tests
 To execute them:
 
-    pylint *linkbacks.py
+    pylint pelican/plugins/linkbacks/
     pytest
 
 ### Integration tests
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index 8e6e202..c2f365d 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -1,3 +1,5 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
 from contextlib import closing
 try:
     from contextlib import nullcontext
@@ -7,8 +9,9 @@
 import json
 import logging
 import os
+import sys
 from os import makedirs
-from os.path import splitext
+from os.path import basename, splitext
 from ssl import CERT_NONE, SSLError
 import xmlrpc.client
 import warnings
@@ -44,14 +47,9 @@
 def process_all_articles_linkbacks(generators):
     article_generator = next(g for g in generators if isinstance(g, ArticlesGenerator))
     config = LinkbackConfig(article_generator.settings)
+    cache = Cache.load_from_json(config)
 
-    try:
-        with open(config.cache_filepath, encoding='utf8') as cache_file:
-            cache = json.load(cache_file)
-    except FileNotFoundError:
-        cache = {}
-
-    original_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+    original_cache_links_count = cache.links_count()
     successful_notifs_count = 0
     try:
         for article in article_generator.articles:
@@ -59,37 +57,19 @@ def process_all_articles_linkbacks(generators):
             with nullcontext() if config.cert_verify else warnings.catch_warnings():
                 if not config.cert_verify:
                     warnings.simplefilter('ignore', InsecureRequestWarning)
-                successful_notifs_count += process_all_links_of_an_article(article, cache, config)
+                successful_notifs_count += process_all_links_of_an_article(config, cache, article.url, article.slug, article.content)
         return successful_notifs_count
     finally:
         # We save the cache & log our progress even in case of an interruption:
-        with open(config.cache_filepath, 'w+', encoding='utf8') as cache_file:
-            json.dump(cache, cache_file)
-        new_cache_links_count = sum(len(urls) for slug, urls in cache.items())
+        cache.dump_to_json()
         LOGGER.info("Linkback plugin execution took: %s - Links processed & inserted in cache: %s - Successful notifications: %s",
-                    datetime.now() - start_time, new_cache_links_count - original_cache_links_count, successful_notifs_count)
-
-class LinkbackConfig:
-    def __init__(self, settings=None):
-        if settings is None:
-            settings = {}
-        self.siteurl = settings.get('SITEURL', '')
-        self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
-        if not self.cache_filepath:
-            cache_dir = settings.get('CACHE_PATH', '')
-            self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
-            if cache_dir:
-                makedirs(cache_dir, exist_ok=True)
-        self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
-        self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
-        self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+                    datetime.now() - start_time, cache.links_count() - original_cache_links_count, successful_notifs_count)
 
-def process_all_links_of_an_article(article, cache, config):
-    source_url = os.path.join(config.siteurl, article.url)
+def process_all_links_of_an_article(config, cache, url, slug, content):
+    source_url = os.path.join(config.siteurl, url)
     successful_notifs_count = 0
-    links_cache = set(cache.get(article.slug, [])) # Even if an entry exists in the cache, we always extract all links,
     # in order to support articles edits that could add new links.
-    doc_soup = BeautifulSoup(article.content, BS4_HTML_PARSER)
+    doc_soup = BeautifulSoup(content, BS4_HTML_PARSER)
     for anchor in doc_soup('a'):
         if 'href' not in anchor.attrs:
             continue
@@ -102,92 +82,211 @@
         if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
             LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
             continue
-        if link_url in links_cache:
-            LOGGER.debug("Link url %s skipped because it has already been processed (present in cache)", link_url)
+        cache_status = cache.get_status(slug, link_url)
+        if cache_status:
+            LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
             continue
         LOGGER.debug("Now attempting to send Linkbacks for link url %s", link_url)
         try:
            resp_content, resp_headers = requests_get_with_max_size(link_url, config)
         except Exception as error:
             LOGGER.debug("Failed to retrieve web page for link url %s: [%s] %s", link_url, error.__class__.__name__, error)
+            cache.add_failure(slug, link_url, error)
             continue
-        for notifier in (send_pingback, send_webmention):
-            if notifier(source_url, link_url, config, resp_content, resp_headers):
+        for notifier_class in (PingbackNotifier, WebmentionNotifier):
+            try:
+                notifier = notifier_class(source_url, link_url, config)
+                notifier.discover_server_uri(resp_content, resp_headers)
+                if notifier.server_uri:
+                    LOGGER.debug("%s URI detected: %s", notifier.kind, notifier.server_uri)
+                else:
+                    cache.add_failure(slug, link_url, f"No {notifier.kind} URI found", notifier.kind)
+                    continue
+                response = notifier.send()
+                LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
+                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
                 successful_notifs_count += 1
-                links_cache.add(link_url)
-    cache[article.slug] = list(links_cache)
+            except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+                LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
+                cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
+            except Exception as error: # unexpected exception => we display the stacktrace:
+                LOGGER.exception("Failed to send %s for link url %s", notifier.kind, link_url)
+                cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
     return successful_notifs_count
 
-def send_pingback(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
-    try:
+class LinkbackConfig:
+    def __init__(self, settings=None):
+        if settings is None:
+            settings = {}
+        self.siteurl = settings.get('SITEURL', '')
+        self.cache_filepath = settings.get('LINKBACKS_CACHEPATH')
+        if not self.cache_filepath:
+            cache_dir = settings.get('CACHE_PATH', '')
+            self.cache_filepath = os.path.join(cache_dir, CACHE_FILENAME)
+            if cache_dir:
+                makedirs(cache_dir, exist_ok=True)
+        self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
+        self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
+        self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+
+class Cache:
+    def __init__(self, config, data):
+        self.cache_filepath = config.cache_filepath
+        # Cache structure:
+        # {
+        #     $article_slug: {
+        #         $link_url: {
+        #             "pingback": {
+        #                 "server_uri": "http...", // optional string
+        #                 "error": // string or null if successfull
+        #             },
+        #             "webmention": {
+        #                 "server_uri": "http...", // optional string
+        #                 "error": // string or null if successfull
+        #             }
+        #         },
+        #         ...
+        #     },
+        #     ...
+        # }
+        self.data = defaultdict(dict)
+        self.data.update(data)
+    def add_success(self, article_slug, link_url, kind, server_uri):
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            link_status = {}
+            article_links[link_url] = link_status
+        link_status[kind] = {
+            "server_uri": server_uri
+        }
+    def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            link_status = {}
+            article_links[link_url] = link_status
+        kinds = [notifier_kind] if notifier_kind else ["pingback", "webmention"]
+        for kind in kinds:
+            status = {
+                "error": error if isinstance(error, str) else f"[{error.__class__.__name__}] {error}"
+            }
+            if server_uri:
+                status["server_uri"] = server_uri
+            link_status[kind] = status
+    def get_status(self, article_slug, link_url):
+        "Return None if a notification should be sent; otherwise return the reason why it should be skipped"
+        article_links = self.data[article_slug]
+        link_status = article_links.get(link_url)
+        if link_status is None:
+            return None # link not processed yet
+        pingback_status = link_status.get("pingback")
+        webmention_status = link_status.get("webmention")
+        if pingback_status is None or webmention_status is None:
+            return None # defensive, should never happen
+        # For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
+        # Later on, we could for example consider retrying on HTTP 5XX errors.
+        pingback_error = pingback_status.get("error")
+        webmention_error = webmention_status.get("error")
+        if pingback_error is None or webmention_error is None:
+            return "ALREADY SUBMITTED"
+        return pingback_error or webmention_error
+    def links_count(self):
+        return sum(len(url_statuses) for url_statuses in self.data.values())
+    @classmethod
+    def load_from_json(cls, config):
+        try:
+            with open(config.cache_filepath, encoding='utf8') as cache_file:
+                data = json.load(cache_file)
+        except FileNotFoundError:
+            data = {}
+        is_old_cache = data and isinstance(list(data.values())[0], list)
+        if is_old_cache:
+            raise EnvironmentError(
+                f"Old cache format detected in {config.cache_filepath}: please remove this file before publishing your website."
+                " All linkbacks will be processed again on next pelican execution.",
+            )
+        return cls(config, data)
+    def dump_to_json(self):
+        with open(self.cache_filepath, 'w+', encoding='utf8') as cache_file:
+            json.dump(self.data, cache_file)
+
+class Notifier(ABC):
+    """
+    Public properties:
+    * kind: 'pingback' or 'webmention'
+    * server_uri: URL of the notification endpoint
+    """
+    @abstractmethod
+    def discover_server_uri(self):
+        """
+        Sets .server_uri if a notification endpoint is found for target_url.
+        Must be called before calling send().
+        """
+    @abstractmethod
+    def send(self):
+        "Sends the actual notification."
+
+class PingbackNotifier(Notifier):
+    def __init__(self, source_url, target_url, config=LinkbackConfig()):
+        self.kind = "pingback"
+        self.source_url = source_url
+        self.target_url = target_url
+        self.config = config
+        self.server_uri = None
+    def discover_server_uri(self, resp_content=None, resp_headers=None):
         if resp_content is None:
-            resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+            resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
         # Pingback server autodiscovery:
-        server_uri = resp_headers.get('X-Pingback')
-        if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+        self.server_uri = resp_headers.get('X-Pingback')
+        if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
             # As a falback, we try parsing the HTML, looking for elements
             doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER)
             link = doc_soup.find(rel='pingback', href=True)
             if link:
-                server_uri = link['href']
-        if not server_uri:
-            return False
-        LOGGER.debug("Pingback URI detected: %s", server_uri)
+                self.server_uri = link['href']
+    def send(self):
         # Performing pingback request:
-        transport = SafeXmlRpcTransport(config) if server_uri.startswith('https') else XmlRpcTransport(config)
-        xml_rpc_client = xmlrpc.client.ServerProxy(server_uri, transport)
+        transport = SafeXmlRpcTransport(self.config) if self.server_uri.startswith('https') else XmlRpcTransport(self.config)
+        xml_rpc_client = xmlrpc.client.ServerProxy(self.server_uri, transport)
         try:
-            response = xml_rpc_client.pingback.ping(source_url, target_url)
+            return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
         except xmlrpc.client.Fault as fault:
             if fault.faultCode == 48: # pingback already registered
-                LOGGER.debug("Pingback already registered for URL %s, XML-RPC response: code=%s - %s", target_url, fault.faultCode, fault.faultString)
-            else:
-                LOGGER.error("Pingback XML-RPC request failed for URL %s: code=%s - %s", target_url, fault.faultCode, fault.faultString)
-            return False
-        LOGGER.info("Pingback notification sent for URL %s, endpoint response: %s", target_url, response)
-        return True
-    except (ConnectionError, HTTPError, RequestException, SSLError) as error:
-        LOGGER.error("Failed to send Pingback for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
-        return False
-    except Exception: # unexpected exception => we display the stacktrace:
-        LOGGER.exception("Failed to send Pingback for link url %s", target_url)
-        return False
-
-def send_webmention(source_url, target_url, config=LinkbackConfig(), resp_content=None, resp_headers=None):
-    try:
+                raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+            raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+
+class WebmentionNotifier(Notifier):
+    def __init__(self, source_url, target_url, config=LinkbackConfig()):
+        self.kind = "webmention"
+        self.source_url = source_url
+        self.target_url = target_url
+        self.config = config
+        self.server_uri = None
+    def discover_server_uri(self, resp_content=None, resp_headers=None):
         if resp_content is None:
-            resp_content, resp_headers = requests_get_with_max_size(target_url, config)
+            resp_content, resp_headers = requests_get_with_max_size(self.target_url, self.config)
         # WebMention server autodiscovery:
-        server_uri = None
         link_header = resp_headers.get('Link')
         if link_header:
             try:
-                server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
-                                  if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
+                self.server_uri = next(lh.get('url') for lh in parse_header_links(link_header)
+                                       if lh.get('url') and lh.get('rel') in WEBMENTION_POSS_REL)
             except StopIteration:
                 pass
-        if not server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
+        if not self.server_uri and resp_headers.get('Content-Type', '').startswith('text/html'):
             # As a falback, we try parsing the HTML, looking for elements
-            doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factord out of both methods
+            doc_soup = BeautifulSoup(resp_content, BS4_HTML_PARSER) # HTML parsing could be factored out of both methods
             for link in doc_soup.find_all(rel=WEBMENTION_POSS_REL, href=True):
                 if link.get('href'):
-                    server_uri = link.get('href')
-        if not server_uri:
-            return False
-        LOGGER.debug("WebMention URI detected: %s", server_uri)
-        server_uri = urljoin(target_url, server_uri)
+                    self.server_uri = link.get('href')
+    def send(self):
         # Performing WebMention request:
-        response = requests.post(server_uri, headers={'User-Agent': config.user_agent}, timeout=config.timeout,
-                                 data={'source': source_url, 'target': target_url}, verify=config.cert_verify)
+        url = urljoin(self.target_url, self.server_uri)
+        response = requests.post(url, headers={'User-Agent': self.config.user_agent}, timeout=self.config.timeout,
+                                 data={'source': self.source_url, 'target': self.target_url}, verify=self.config.cert_verify)
         response.raise_for_status()
-        LOGGER.info("WebMention notification sent for URL %s, endpoint response: %s", target_url, response.text)
-        return True
-    except (ConnectionError, HTTPError, RequestException, SSLError) as error:
-        LOGGER.error("Failed to send WebMention for link url %s: [%s] %s", target_url, error.__class__.__name__, error)
-        return False
-    except Exception: # unexpected exception => we display the stacktrace:
-        LOGGER.exception("Failed to send WebMention for link url %s", target_url)
-        return False
+        return response.text
 
 GET_CHUNK_SIZE = 2**10
@@ -245,3 +344,59 @@ def make_connection(self, host):
 
 def register():
     signals.all_generators_finalized.connect(process_all_articles_linkbacks)
+
+
+def cli(html_filepath):
+    logging.basicConfig(format="%(levelname)s [%(name)s] %(message)s",
+                        datefmt="%H:%M:%S", level=logging.DEBUG)
+    config = LinkbackConfig(os.environ)
+    cache = Cache.load_from_json(config)
+    with nullcontext() if config.cert_verify else warnings.catch_warnings():
+        if not config.cert_verify:
+            warnings.simplefilter('ignore', InsecureRequestWarning)
+        url = basename(html_filepath)
+        slug = url.replace(".html", "")
+        with open(html_filepath, "r+", encoding="utf-8") as html_file:
+            content = html_file.read()
+        LOGGER.debug("Now extracting content from <article> tag...")
...") + content = str(BeautifulSoup(content, BS4_HTML_PARSER).find("article")) + LOGGER.debug("Now processing HTML file with url=%s slug=%s...", url, slug) + successful_notifs_count = process_all_links_of_an_article(config, cache, url, slug, content) + LOGGER.info("Done - Notifications sent: %s", successful_notifs_count) + cache.dump_to_json() + +if __name__ == '__main__': + try: # Optional logs coloring: + from colorama import Back, Fore, Style + # Recipe from: https://chezsoi.org/lucas/blog/colored-logs-in-python.html + class ColorLogsWrapper: + COLOR_MAP = { + 'debug': Fore.CYAN, + 'info': Fore.GREEN, + 'warning': Fore.YELLOW, + 'error': Fore.RED, + 'critical': Back.RED, + } + def __init__(self, logger): + self.logger = logger + def __getattr__(self, attr_name): + if attr_name == 'warn': + attr_name = 'warning' + if attr_name not in 'debug info warning error critical': + return getattr(self.logger, attr_name) + log_level = getattr(logging, attr_name.upper()) + # mimicking logging/__init__.py behaviour + if not self.logger.isEnabledFor(log_level): + return None + def wrapped_attr(msg, *args, **kwargs): + style_prefix = self.COLOR_MAP[attr_name] + msg = style_prefix + msg + Style.RESET_ALL + # We call _.log directly to not increase the callstack + # so that Logger.findCaller extract the corrects filename/lineno + # pylint: disable=protected-access + return self.logger._log(log_level, msg, args, **kwargs) + return wrapped_attr + LOGGER = ColorLogsWrapper(LOGGER) + except ImportError: + print("colorama not available - Logs coloring disabled") + cli(sys.argv[1]) diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py index ce444a7..f30a685 100644 --- a/pelican/plugins/linkbacks/test_linkbacks.py +++ b/pelican/plugins/linkbacks/test_linkbacks.py @@ -1,14 +1,17 @@ -import logging, os +import json, logging, os import httpretty -from pelican.generators import ArticlesGenerator -from pelican.tests.support import get_settings - from linkbacks import ( process_all_articles_linkbacks, + Cache, + LinkbackConfig, CACHE_FILENAME, MAX_RESPONSE_LENGTH, ) +import pytest + +from pelican.generators import ArticlesGenerator +from pelican.tests.support import get_settings CUR_DIR = os.path.dirname(__file__) @@ -33,7 +36,7 @@ def test_cache(tmpdir, caplog): article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir) assert process_all_articles_linkbacks([article_generator]) == 2 assert process_all_articles_linkbacks([article_generator]) == 0 - assert 'Link url http://localhost/sub/some-page.html skipped because it has already been processed (present in cache)' in caplog.text + assert 'Link url http://localhost/sub/some-page.html skipped because it is present in cache with status: ALREADY SUBMITTED' in caplog.text def test_ignore_internal_links(tmpdir, caplog): caplog.set_level(logging.DEBUG) @@ -66,7 +69,7 @@ def test_pingback_http_error(tmpdir, caplog): _setup_http_mocks(pingback=('header', 'http_error'), webmention=()) article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir) assert process_all_articles_linkbacks([article_generator]) == 0 - assert 'Failed to send Pingback for link url http://localhost/sub/some-page.html' in caplog.text + assert 'Failed to send pingback for link url http://localhost/sub/some-page.html' in caplog.text assert '503' in caplog.text @httpretty.activate @@ -78,7 +81,6 @@ def test_pingback_xmlrpc_error(tmpdir, caplog): @httpretty.activate def test_pingback_already_registered(tmpdir, caplog): - 
-    caplog.set_level(logging.DEBUG)
     _setup_http_mocks(pingback=('header', 'already_registered'), webmention=())
     article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
     assert process_all_articles_linkbacks([article_generator]) == 0
@@ -89,7 +91,7 @@ def test_webmention_http_error(tmpdir, caplog):
     _setup_http_mocks(pingback=(), webmention=('header', 'http_error'))
     article_generator = _build_article_generator(TEST_CONTENT_DIR, tmpdir)
     assert process_all_articles_linkbacks([article_generator]) == 0
-    assert 'Failed to send WebMention for link url http://localhost/sub/some-page.html' in caplog.text
+    assert 'Failed to send webmention for link url http://localhost/sub/some-page.html' in caplog.text
     assert '503' in caplog.text
 
 @httpretty.activate
@@ -188,3 +190,33 @@ def _setup_cache_dir(cache_dir_path):
         os.remove(os.path.join(cache_dir_path, CACHE_FILENAME))
     except FileNotFoundError:
         pass
+
+def test_cache_load_old_format(tmpdir):
+    with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+        json.dump({
+            "festival-meujeuteries-merveilles": [
+                "https://laubergedesreveurs.fr/festival-meujeuterie-et-merveilles/"
+            ]
+        }, cache_file)
+    config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+    with pytest.raises(EnvironmentError) as error:
+        Cache.load_from_json(config)
+    assert "Old cache format detected" in str(error)
+
+def test_cache_load_new_format(tmpdir):
+    with (tmpdir / CACHE_FILENAME).open("w") as cache_file:
+        json.dump({
+            "more-amazing-creative-commons-artists": {
+                "https://creativecommons.org/share-your-work/cclicenses/": {
+                    "pingback": {
+                        "error": "No pingback URI found"
+                    },
+                    "webmention": {
+                        "error": "No webmention URI found"
+                    }
+                }
+            }
+        }, cache_file)
+    config = LinkbackConfig({'CACHE_PATH': str(tmpdir)})
+    cache = Cache.load_from_json(config)
+    assert cache.get_status("more-amazing-creative-commons-artists", "https://creativecommons.org/share-your-work/cclicenses/") == "No pingback URI found"

From 98661a01893827ec4ac19c8718066048bbac17fa Mon Sep 17 00:00:00 2001
From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
Date: Tue, 23 Dec 2025 12:50:51 +0100
Subject: [PATCH 2/2] New configuration setting LINKBACKS_IGNORED_URLS_PATTERN

---
 .pylintrc                                   |  2 +-
 CHANGELOG.md                                |  1 +
 pelican/plugins/linkbacks/linkbacks.py      | 37 ++++++++++++++-------
 pelican/plugins/linkbacks/test_linkbacks.py |  9 +++--
 4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 451a7b8..9ecf672 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,5 +1,5 @@
 [MESSAGES CONTROL]
-disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-locals, too-many-positional-arguments
+disable = broad-except, missing-docstring, multiple-imports, too-few-public-methods, too-many-arguments, too-many-branches, too-many-locals, too-many-positional-arguments, wrong-import-order
 
 [FORMAT]
 max-line-length = 180
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 573207e..e8b2689 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ### Added
 
+- new configuration setting `LINKBACKS_IGNORED_URLS_PATTERN` to define URLs that should never be considered for linkbacks (_e.g._ `youtube.com`)
 - manual execution mode: `python linkbacks.py $pelican_generated_html_file`
 
 ### Changed
diff --git a/pelican/plugins/linkbacks/linkbacks.py b/pelican/plugins/linkbacks/linkbacks.py
index c2f365d..c44ec0b 100644
--- a/pelican/plugins/linkbacks/linkbacks.py
+++ b/pelican/plugins/linkbacks/linkbacks.py
@@ -9,6 +9,7 @@
 import json
 import logging
 import os
+import re
 import sys
 from os import makedirs
 from os.path import basename, splitext
@@ -32,6 +33,8 @@ DEFAULT_USER_AGENT = 'pelican-plugin-linkbacks'
 DEFAULT_CERT_VERIFY = True
 DEFAULT_TIMEOUT = 3
+DEFAULT_IGNORED_URLS_PATTERN = 'artstation.com|deviantart.com|github.com|github.io|itch.io|readthedocs.io|youtube.com|wikipedia.org'
+IMAGE_EXTENSIONS = ('.gif', '.jpg', '.pdf', '.png', '.svg')
 WEBMENTION_POSS_REL = ('webmention', 'http://webmention.org', 'http://webmention.org/', 'https://webmention.org', 'https://webmention.org/')
 
 LOGGER = logging.getLogger(__name__)
 
@@ -79,9 +82,12 @@ def process_all_links_of_an_article(config, cache, url, slug, content):
         if config.siteurl and link_url.startswith(config.siteurl):
             LOGGER.debug("Link url %s skipped because is starts with %s", link_url, config.siteurl)
             continue
-        if splitext(link_url)[1] in ('.gif', '.jpg', '.pdf', '.png', '.svg'):
+        if splitext(link_url)[1] in IMAGE_EXTENSIONS:
             LOGGER.debug("Link url %s skipped because it appears to be an image or PDF file", link_url)
             continue
+        if config.ignored_urls_pattern.search(link_url):
+            LOGGER.debug("Link url %s skipped because it matches the ignored URLs pattern", link_url)
+            continue
         cache_status = cache.get_status(slug, link_url)
         if cache_status:
             LOGGER.debug("Link url %s skipped because it is present in cache with status: %s", link_url, cache_status)
@@ -104,9 +110,9 @@
                     continue
                 response = notifier.send()
                 LOGGER.info("%s notification sent for URL %s, endpoint response: %s", notifier.kind, link_url, response)
-                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri)
+                cache.add_success(slug, link_url, notifier.kind, notifier.server_uri, response)
                 successful_notifs_count += 1
-            except (ConnectionError, HTTPError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
+            except (ConnectionError, HTTPError, NotifierError, RequestException, SSLError, xmlrpc.client.ProtocolError) as error:
                 LOGGER.error("Failed to send %s for link url %s: [%s] %s", notifier.kind, link_url, error.__class__.__name__, error)
                 cache.add_failure(slug, link_url, error, notifier.kind, notifier.server_uri)
             except Exception as error: # unexpected exception => we display the stacktrace:
@@ -128,6 +134,9 @@ def __init__(self, settings=None):
         self.cert_verify = settings.get('LINKBACKS_CERT_VERIFY', DEFAULT_CERT_VERIFY)
         self.timeout = settings.get('LINKBACKS_REQUEST_TIMEOUT', DEFAULT_TIMEOUT)
         self.user_agent = settings.get('LINKBACKS_USERAGENT', DEFAULT_USER_AGENT)
+        self.ignored_urls_pattern = settings.get('LINKBACKS_IGNORED_URLS_PATTERN', DEFAULT_IGNORED_URLS_PATTERN)
+        if self.ignored_urls_pattern and isinstance(self.ignored_urls_pattern, str):
+            self.ignored_urls_pattern = re.compile(self.ignored_urls_pattern)
 
 class Cache:
     def __init__(self, config, data):
@@ -137,12 +146,14 @@ def __init__(self, config, data):
         #     $article_slug: {
         #         $link_url: {
         #             "pingback": {
+        #                 "error": // string or null if successful
+        #                 "response": // string or null if failed
         #                 "server_uri": "http...", // optional string
-        #                 "error": // string or null if successfull
         #             },
         #             "webmention": {
+        #                 "error": // string or null if successful
+        #                 "response": // string or null if failed
         #                 "server_uri": "http...", // optional string
-        #                 "error": // string or null if successfull
         #             }
         #         },
         #         ...
@@ -151,13 +162,14 @@ def __init__(self, config, data):
         # }
         self.data = defaultdict(dict)
         self.data.update(data)
-    def add_success(self, article_slug, link_url, kind, server_uri):
+    def add_success(self, article_slug, link_url, kind, server_uri, response):
         article_links = self.data[article_slug]
         link_status = article_links.get(link_url)
         if link_status is None:
             link_status = {}
             article_links[link_url] = link_status
         link_status[kind] = {
+            "response": response,
             "server_uri": server_uri
         }
     def add_failure(self, article_slug, link_url, error, notifier_kind=None, server_uri=None):
@@ -186,11 +198,9 @@ def get_status(self, article_slug, link_url):
             return None # defensive, should never happen
         # For now we never retry sending pingbacks & webmentions if there is already an entry in the cache.
         # Later on, we could for example consider retrying on HTTP 5XX errors.
-        pingback_error = pingback_status.get("error")
-        webmention_error = webmention_status.get("error")
-        if pingback_error is None or webmention_error is None:
+        if pingback_status.get("response") or webmention_status.get("response"):
             return "ALREADY SUBMITTED"
-        return pingback_error or webmention_error
+        return pingback_status.get("error") or webmention_status.get("error")
     def links_count(self):
         return sum(len(url_statuses) for url_statuses in self.data.values())
     @classmethod
@@ -227,6 +237,9 @@ def discover_server_uri(self):
     def send(self):
         "Sends the actual notification."
 
+class NotifierError(RuntimeError):
+    pass
+
 class PingbackNotifier(Notifier):
     def __init__(self, source_url, target_url, config=LinkbackConfig()):
         self.kind = "pingback"
@@ -253,8 +266,8 @@ def send(self):
             return xml_rpc_client.pingback.ping(self.source_url, self.target_url)
         except xmlrpc.client.Fault as fault:
             if fault.faultCode == 48: # pingback already registered
-                raise RuntimeError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
-            raise RuntimeError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
+                raise NotifierError(f"Pingback already registered for URL {self.target_url}, XML-RPC response: code={fault.faultCode} - {fault.faultString}") from fault
+                raise NotifierError(f"Pingback XML-RPC request failed for URL {self.target_url}: code={fault.faultCode} - {fault.faultString}") from fault
 
 class WebmentionNotifier(Notifier):
     def __init__(self, source_url, target_url, config=LinkbackConfig()):
diff --git a/pelican/plugins/linkbacks/test_linkbacks.py b/pelican/plugins/linkbacks/test_linkbacks.py
index f30a685..69a9c54 100644
--- a/pelican/plugins/linkbacks/test_linkbacks.py
+++ b/pelican/plugins/linkbacks/test_linkbacks.py
@@ -1,6 +1,10 @@
 import json, logging, os
 
 import httpretty
+from pelican.generators import ArticlesGenerator
+from pelican.tests.support import get_settings
+import pytest
+
 from linkbacks import (
     process_all_articles_linkbacks,
     Cache,
@@ -8,11 +12,6 @@ from linkbacks import (
     CACHE_FILENAME,
     MAX_RESPONSE_LENGTH,
 )
-import pytest
-
-from pelican.generators import ArticlesGenerator
-from pelican.tests.support import get_settings
-
 
 CUR_DIR = os.path.dirname(__file__)
 TEST_CONTENT_DIR = os.path.join(CUR_DIR, 'test_content')