2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -42,7 +42,7 @@ What can you do PurlDB?
:ref:`symbols_and_strings`.

- Detect software supply chain issues by mapping package binaries to their corresponding source code
and determining if there are possible discrepancies between sources and binaries (such as with the
and determining if there are possible discrepancies between sources and sources (such as with the
XZ utils attack), or sources and binaries, where packages may not report the exact source code
used to build binaries, with the :ref:`deploy_to_devel` mapping analysis.

4 changes: 2 additions & 2 deletions docs/source/purldb/purl_watch.rst
@@ -7,7 +7,7 @@ Maintaining an Up-to-Date PurlDB with PurlWatch
PurlDB serves as a knowledge base for packages. It is essential to
keep this knowledge base updated as new package versions are released daily.
PurlWatch is responsible for keeping PurlDB up-to-date. Depending on the PurlDB
size, PurlWatch provides two different approaches.
size, PurlWatch provides two different approaches.

Methods to Keep PurlDB Updated
------------------------------
@@ -48,7 +48,7 @@ watch. It creates watch task for the PURL and enqueues it in RQ for execution.
Advantages
~~~~~~~~~~
- Background tasks ensure that the PurlDB remains updated without manual intervention.
- The watch frequency can be customized to balance the resource usage.
- The watch frequency can be customized to balance the resource usage.
- Users can define the depth of data collection based on their needs.

.. tip::
19 changes: 7 additions & 12 deletions docs/source/purldb/rest_api.rst
@@ -906,9 +906,9 @@ Also each package can have list of ``addon_pipelines`` to run on the package.
Find all addon pipelines `here. <https://scancodeio.readthedocs.io/en/latest/built-in-pipelines.html>`_


If the ``reindex`` flag is set to True, the existing package will be rescanned and all the non-existing packages will be indexed.
If the ``reindex_set`` flag is set to True, then all the packages in the same set will be rescanned.

If the ``reindex`` flag is True, then the existing package will be rescanned; if ``reindex_set``
is True, then all the packages in the same set will be rescanned.
If the ``reindex`` flag is set to True, then all the non-existing packages will be indexed.

.. Note::

@@ -970,11 +970,7 @@ Then return a mapping containing:
- The number of package urls that are not processable by the index queue.
- unsupported_packages
- A list of package urls that are not processable by the index queue.
The package indexing queue can handle certain supported purl
types such as npm, pypi, maven, etc. See "supported_ecosystems"
list in
https://github.com/aboutcode-org/purldb/blob/main/packagedb/api.py
for details.
The package indexing queue can only handle npm and maven purls.
- unsupported_vers_count
- The number of vers range that are not supported by the univers or package_manager.
- unsupported_vers
@@ -1076,9 +1072,8 @@ Package Update Set List
Take a list of purls (where each item is a mapping containing PURL and
content_type).

If uuid is given, all purls will be added to the package set if it exists;
otherwise, a new set will be created and all the purls will be added to
that set.
If uuid is given, then all purls will be added to the package set if it exists; otherwise a
new set will be created and all the purls will be added to that new set.

.. Note::

@@ -1120,7 +1115,7 @@ Package Set List

Return a list of package sets and the package data of the packages within each set.

``GET /api/package_sets/``
``GET /api/projects/0bbdcf88-ad07-4970-9272-7d5f4c82cc7b/``

.. code-block:: json

78 changes: 64 additions & 14 deletions minecode/collectors/maven.py
@@ -476,9 +476,6 @@ def process_request(purl_str, **kwargs):


collect_links = re.compile(r'href="([^"]+)"').findall
collect_links_and_artifact_timestamps = re.compile(
r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
).findall


def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
@@ -675,6 +672,62 @@ def filter_for_artifacts(timestamps_by_links):
return timestamps_by_links_filtered


def collect_links_and_artifact_timestamps(text):
"""
Return a list of (link, timestamp) tuples pairing each link location with
its timestamp, extracted from the given HTML `text`.
"""

# Pattern that matches https://repo.maven.apache.org/maven2
maven_apache_pattern = re.compile(
r'<a href="([^"]+)"[^>]*>[^<]*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
)
maven_apache_matches = maven_apache_pattern.findall(text)
if maven_apache_matches:
return maven_apache_matches

# Pattern that matches both Apache (UTC) and Nexus (Z) formats
# https://repository.jboss.org/nexus/service/rest/repository/browse/releases/
# https://repository.jboss.org/nexus/service/rest/repository/browse/public/
# https://repository.apache.org/snapshots/
repo_jboss_apache_pattern = re.compile(
r'<a href="([^"]+)"[^>]*>[^<]*</a></td>\s*<td>\s*((?:[A-Z][a-z]{2}\s+[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+(?:UTC|Z)\s+\d{4})|&nbsp;)\s*</td>'
)
repo_jboss_apache_matches = repo_jboss_apache_pattern.findall(text)
# Convert &nbsp; to empty string for table format
if repo_jboss_apache_matches:
return [
(item, "" if timestamp == "&nbsp;" else timestamp)
for item, timestamp in repo_jboss_apache_matches
]

# Pattern that matches https://repo.spring.io/milestone
repo_spring_pattern = re.compile(
r'<a href="([^"]+)"[^>]*>[^<]*</a>\s+(\d{2}-[A-Z][a-z]{2}-\d{4}\s+\d{2}:\d{2})'
)
repo_spring_matches = repo_spring_pattern.findall(text)
if repo_spring_matches:
return repo_spring_matches

# Simple links in <pre> tags without timestamps (Gradle plugins format)
# https://plugins.gradle.org/m2/
plugins_gradle_pattern = re.compile(r'<pre><a href="([^"]+)"[^>]*>[^<]*</a></pre>')
plugins_gradle_matches = plugins_gradle_pattern.findall(text)
if plugins_gradle_matches:
# Filter out parent directory link if present
filtered_matches = []
for href in plugins_gradle_matches:
# Skip parent directory links
if href != "../" and not href.startswith(".."):
filtered_matches.append((href, ""))

# Only return if we found non-parent links
if filtered_matches:
return filtered_matches

return []
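The first branch of the new function can be exercised in isolation. A minimal sketch reproducing only the Apache-index pattern (the sample HTML line is invented for illustration):

```python
import re

# Reproduces the repo.maven.apache.org pattern used above: a link
# followed by a "YYYY-MM-DD HH:MM" timestamp (or "-" for the parent dir).
maven_apache_pattern = re.compile(
    r'<a href="([^"]+)"[^>]*>[^<]*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
)

sample = '<a href="livereload-jvm-0.2.0.jar">livereload-jvm-0.2.0.jar</a>   2013-01-10 21:32  12345'
matches = maven_apache_pattern.findall(sample)
print(matches)  # [('livereload-jvm-0.2.0.jar', '2013-01-10 21:32')]
```

Because the pattern has two capture groups, `findall` returns (link, timestamp) tuples, which is the shape each branch of the function hands back.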


def collect_links_from_text(text, filter):
"""
Return a mapping of link locations and their timestamps, given HTML `text`
@@ -700,7 +753,7 @@ def create_absolute_urls_for_links(text, url, filter):
url = url.rstrip("/")
timestamps_by_links = collect_links_from_text(text, filter)
for link, timestamp in timestamps_by_links.items():
if not link.startswith(url):
if not link.startswith("http:") and not link.startswith("https:"):
link = f"{url}/{link}"
timestamps_by_absolute_links[link] = timestamp
return timestamps_by_absolute_links
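The corrected guard matters when an index page mixes relative and absolute links: the old `startswith(url)` check would re-prefix an absolute link pointing at another host. A standalone sketch of the new check (a hypothetical helper, not the module's own function):

```python
def absolutize(link, base_url):
    # Prefix only scheme-less (relative) links; an already-absolute link
    # is kept as-is even when it points at a different host.
    base_url = base_url.rstrip("/")
    if not link.startswith("http:") and not link.startswith("https:"):
        link = f"{base_url}/{link}"
    return link

base = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/"
print(absolutize("0.2.0/", base))
# https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/
print(absolutize("https://repo.example.org/other/", base))
# https://repo.example.org/other/
```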
@@ -758,23 +811,20 @@ def get_artifact_sha1(artifact_url):
return sha1


def get_classifier_from_artifact_url(
artifact_url, package_version_page_url, package_name, package_version
):
def get_classifier_from_artifact_url(artifact_url, package_name, package_version):
"""
Return the classifier from a Maven artifact URL `artifact_url`, otherwise
return None if a classifier cannot be determined from `artifact_url`
"""
classifier = None
# https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
package_version_page_url = package_version_page_url.rstrip("/")
# https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}"
package_name_version_portion = f"{package_name}-{package_version}"
artifact_url_filename = artifact_url.rsplit("/", 1)[-1]
remaining_url_portion = artifact_url_filename.replace(package_name_version_portion, "")
# artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
# ['', '-onejar.jar']
_, remaining_url_portion = artifact_url.split(leading_url_portion)
# ['-onejar', 'jar']
# artifact_url_filename = 'livereload-jvm-0.2.0-onejar.jar'
# remaining_url_portion = '-onejar.jar'
remaining_url_portions = remaining_url_portion.split(".")
# ['-onejar', 'jar']
if remaining_url_portions and remaining_url_portions[0]:
# '-onejar'
classifier = remaining_url_portions[0]
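The refactor derives the classifier from the artifact filename alone, so the page URL parameter is no longer needed. A rough standalone sketch of that logic (the names are illustrative and the leading-separator strip is an assumption about the collapsed tail of the function):

```python
def classifier_from_filename(artifact_url, package_name, package_version):
    # 'livereload-jvm-0.2.0-onejar.jar' is the filename portion of the URL
    filename = artifact_url.rsplit("/", 1)[-1]
    # Drop the 'name-version' prefix: '-onejar.jar'
    remainder = filename.replace(f"{package_name}-{package_version}", "")
    # '-onejar.jar' -> ['-onejar', 'jar']
    parts = remainder.split(".")
    if parts and parts[0]:
        return parts[0].lstrip("-")
    return None

url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar"
print(classifier_from_filename(url, "livereload-jvm", "0.2.0"))  # onejar
print(classifier_from_filename(url.replace("-onejar", ""), "livereload-jvm", "0.2.0"))  # None
```

A plain `name-version.jar` artifact yields an empty first part, so no classifier is reported, matching the function's documented None case.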
80 changes: 0 additions & 80 deletions minecode/management/commands/defederate_packages.py

This file was deleted.

100 changes: 0 additions & 100 deletions minecode/management/commands/federate_packages.py

This file was deleted.

9 changes: 5 additions & 4 deletions minecode/management/commands/import_queue.py
@@ -132,13 +132,14 @@ def process_request(importable_uri):
timestamps_by_artifact_links = get_artifact_links(version_page_url)
for artifact_link, timestamp in timestamps_by_artifact_links.items():
sha1 = get_artifact_sha1(artifact_link)
classifier = get_classifier_from_artifact_url(
artifact_link, version_page_url, name, version
)
classifier = get_classifier_from_artifact_url(artifact_link, name, version)
qualifiers = None
if classifier:
qualifiers = f"classifier={classifier}"
release_date = dateutil_parse(timestamp)
if timestamp:
release_date = dateutil_parse(timestamp)
else:
release_date = None
package_data = PackageData(
type="maven",
namespace=namespace,
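The new guard skips date parsing when a listing carries no timestamp (as with the Gradle-plugins format, which yields empty strings). A stdlib-only sketch of the same behavior (the project itself uses `dateutil_parse`; `strptime` with this fixed format is an assumption for illustration):

```python
from datetime import datetime

def parse_release_date(timestamp):
    # Maven index timestamps look like "2013-01-10 21:32"; an empty
    # string means the listing had no timestamp for this link.
    if not timestamp:
        return None
    return datetime.strptime(timestamp, "%Y-%m-%d %H:%M")

print(parse_release_date("2013-01-10 21:32"))  # 2013-01-10 21:32:00
print(parse_release_date(""))  # None
```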