From 489af1b5b62c83d46fb80e9ce0dfab1416a40aba Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:12:54 +0530 Subject: [PATCH 1/9] Simplify GemfileHandler path patterns Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/rubygems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py index 9cbbf6d7553..e80295c48af 100644 --- a/src/packagedcode/rubygems.py +++ b/src/packagedcode/rubygems.py @@ -211,7 +211,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): # TODO: https://stackoverflow.com/questions/41454333/meaning-of-new-block-git-sourcegithub-in-gemfile class GemfileHandler(GemspecHandler): datasource_id = 'gemfile' - path_patterns = ('*/Gemfile', '*/*.gemfile', '*/Gemfile-*') + path_patterns = ('*/Gemfile', '*.gemfile', '*/Gemfile-*') default_package_type = 'gem' default_primary_language = 'Ruby' description = 'RubyGems Bundler Gemfile' From 8d6fa73bdba4c7e8658f38b2db6ce5cb07f4ed4a Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:15:51 +0530 Subject: [PATCH 2/9] Add multiregex as a dependency Reference: https://github.com/Quantco/multiregex Signed-off-by: Ayan Sinha Mahapatra --- requirements.txt | 1 + setup-mini.cfg | 1 + setup.cfg | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d7b458c84d..f9e6b6a0a28 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,6 +40,7 @@ license-expression==30.4.4 lxml==5.4.0 MarkupSafe==3.0.2 more-itertools==10.7.0 +multiregex==2.0.3 normality==2.6.1 packageurl-python==0.17.1 packaging==25.0 diff --git a/setup-mini.cfg b/setup-mini.cfg index 8f3a043d8af..7251d59715a 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -89,6 +89,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 diff --git 
a/setup.cfg b/setup.cfg index 770b70542b3..c02d1ce9f13 100644 --- a/setup.cfg +++ b/setup.cfg @@ -74,6 +74,7 @@ install_requires = colorama >= 0.3.9 commoncode >= 32.4.0 container-inspector >= 31.0.0 + cyseq >= 0.0.2 debian-inspector >= 31.1.0 dparse2 >= 0.7.0 fasteners @@ -90,6 +91,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 @@ -116,7 +118,6 @@ install_requires = typecode >= 30.0.1 typecode[full] >= 30.0.1 extractcode[full] >= 31.0.0 - cyseq >= 0.0.2 [options.packages.find] From 4fc3af2f834a7db0d126a5fa5b44ad251d752511 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 15:16:44 +0530 Subject: [PATCH 3/9] Add initial multiregex implementation Use multiregex to use a cached regex path patterns and datafile handlers mapping to detect package datafiles faster. Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4064 Reference: https://github.com/aboutcode-org/scancode-toolkit/issues/4061 Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/__init__.py | 18 ++- src/packagedcode/cache.py | 200 ++++++++++++++++++++++++++++++++++ src/packagedcode/recognize.py | 60 +++++++--- src/scancode_config.py | 6 + 4 files changed, 269 insertions(+), 15 deletions(-) create mode 100644 src/packagedcode/cache.py diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index 9cc46d0e09b..d65e535bee6 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -246,15 +246,29 @@ win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler, ] + +# These handlers are special as they use filetype to +# detect these binaries instead of datafile path patterns +# as these are optionally installed, we can skip checking +# for filetype if these are not available +BINARY_HANDLERS_PRESENT = False +BINARY_PACKAGE_DATAFILE_HANDLERS = [] + try: from go_inspector.binary import get_go_binary_handler 
- APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler()) + handler = get_go_binary_handler() + APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_HANDLERS_PRESENT = True except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler()) + handler = get_rust_binary_handler() + APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) + BINARY_HANDLERS_PRESENT = True except ImportError: pass diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py new file mode 100644 index 00000000000..6412f7c6e8c --- /dev/null +++ b/src/packagedcode/cache.py @@ -0,0 +1,200 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os +import json +import attr +import fnmatch + +from commoncode.fileutils import create_dir + +from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS +from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS + +from scancode_config import packagedcode_cache_dir +from scancode_config import scancode_cache_dir + +""" +An on-disk persistent cache of package manifest patterns and related package +manifest handlers mapping. Loading and dumping the cached package manifest +patterns is safe to use across multiple processes using lock files. 
+""" + +# global in-memory cache of the PkgManifestPatternsCache +_PACKAGE_CACHE = None + +PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 +PACKAGE_INDEX_DIR = 'package_patterns_index' +PACKAGE_INDEX_FILENAME = 'index_cache' +PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile' +PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums' + + +@attr.s +class PkgManifestPatternsCache: + """ + Represent cachable package manifest regex patterns, prematchers + and mappings from regex patterns to datasource IDs for all datafile + handlers. + """ + + handler_by_regex = attr.ib(default=attr.Factory(dict)) + system_multiregex_patterns = attr.ib(default=attr.Factory(list)) + application_multiregex_patterns = attr.ib(default=attr.Factory(list)) + + @staticmethod + def all_multiregex_patterns(self): + return self.application_multiregex_patterns + [ + multiregex_pattern + for multiregex_pattern in self.system_multiregex_patterns + if multiregex_pattern not in self.application_multiregex_patterns + ] + + @classmethod + def from_mapping(cls, cache_mapping): + return cls(**cache_mapping) + + @staticmethod + def load_or_build( + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=False, + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + ): + """ + Load or build and save and return a PkgManifestPatternsCache object. + + We either load a cached PkgManifestPatternsCache or build and cache the patterns. + + - If the cache exists, it is returned unless corrupted. + - If ``force`` is True, or if the cache does not exist a new index is built + and cached. 
+ """ + idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR) + create_dir(idx_cache_dir) + cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME) + has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file) + + # bypass build if cache exists + if has_cache and not force: + try: + return load_cache_file(cache_file) + except Exception as e: + # work around some rare Windows quirks + import traceback + print('Inconsistent License cache: rebuilding index.') + print(str(e)) + print(traceback.format_exc()) + + + from scancode import lockfile + lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) + + # here, we have no cache: lock, check and rebuild + try: + # acquire lock and wait until timeout to get a lock or die + with lockfile.FileLock(lock_file).locked(timeout=timeout): + + system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + ) + application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + ) + package_cache = PkgManifestPatternsCache( + handler_by_regex=system_handlers_by_regex + application_handlers_by_regex, + system_multiregex_patterns=system_multiregex_patterns, + application_multiregex_patterns=application_multiregex_patterns, + ) + package_cache.dump(cache_file) + return package_cache + + except lockfile.LockTimeout: + # TODO: handle unable to lock in a nicer way + raise + + def dump(self, cache_file): + """ + Dump this package cache on disk at ``cache_file``. 
+ """ + package_cache = {} + with open(cache_file, 'w') as f: + json.dump(package_cache, f) + + +def get_prematchers_from_glob_pattern(pattern): + return [ + prematcher.lower().lstrip("/") + for prematcher in pattern.split("*") + if prematcher + ] + + +def build_mappings_and_multiregex_patterns( + datafile_handlers, +): + """ + Return an index built from rules and licenses directories + """ + with_patterns = [] + + for handler in datafile_handlers: + if handler.path_patterns: + with_patterns.append(handler) + + handler_by_regex = {} + prematchers_by_regex = {} + + for handler in with_patterns: + for pattern in handler.path_patterns: + regex_pattern = fnmatch.translate(pattern) + regex_pattern = fr"{regex_pattern}" + + prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern) + + if regex_pattern in handler_by_regex: + handler_by_regex[regex_pattern].append(handler.datasource_id) + else: + handler_by_regex[regex_pattern]= [handler.datasource_id] + + multiregex_patterns = [] + for regex in handler_by_regex.keys(): + regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) + multiregex_patterns.append(regex_and_prematcher) + + return handler_by_regex, multiregex_patterns + + +def get_cache( + force=False, +): + """ + Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. + """ + global _PACKAGE_CACHE + + if force or not _PACKAGE_CACHE: + _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=force, + # used for testing only + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + ) + return _PACKAGE_CACHE + + +def load_cache_file(cache_file): + """ + Return a PkgManifestPatternsCache loaded from JSON ``cache_file``. 
+ """ + with open(cache_file) as f: + cache = json.load(f) + + return PkgManifestPatternsCache.from_mapping(cache) diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index e41d29c82df..bc3704fb64d 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -10,11 +10,16 @@ import os import sys +import multiregex + from commoncode import filetype -from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS -from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS -from packagedcode import ALL_DATAFILE_HANDLERS +from commoncode.fileutils import as_posixpath + +from packagedcode import HANDLER_BY_DATASOURCE_ID +from packagedcode import BINARY_HANDLERS_PRESENT +from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS from packagedcode import models +from packagedcode.cache import get_cache TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) @@ -56,25 +61,19 @@ def recognize_package_data( if not filetype.is_file(location): return [] - assert application or system or package_only - if package_only or (application and system): - datafile_handlers = ALL_DATAFILE_HANDLERS - elif application: - datafile_handlers = APPLICATION_PACKAGE_DATAFILE_HANDLERS - elif system: - datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS - return list(_parse( location=location, package_only=package_only, - datafile_handlers=datafile_handlers, + application=application, + system=system, )) def _parse( location, + application=True, + system=False, package_only=False, - datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, ): """ Yield parsed PackageData objects from ``location``. Raises Exceptions on errors. 
@@ -83,6 +82,41 @@ def _parse( Default to use application packages """ + package_path = as_posixpath(location) + package_patterns = get_cache() + + assert application or system or package_only + if package_only or (application and system): + multiregex_patterns = package_patterns.all_multiregex_patterns + elif application: + multiregex_patterns = package_patterns.application_multiregex_patterns + elif system: + multiregex_patterns = package_patterns.system_multiregex_patterns + + package_matcher = multiregex.RegexMatcher(multiregex_patterns) + matched_patterns = package_matcher.match(package_path) + + datafile_handlers = [] + for matched_pattern in matched_patterns: + regex, _match = matched_pattern + handler_ids = package_patterns.handler_by_regex.get(regex.pattern) + if TRACE: + logger_debug(f'_parse:.handler_ids: {handler_ids}') + + datafile_handlers = [ + HANDLER_BY_DATASOURCE_ID.get(handler_id) + for handler_id in handler_ids + ] + + if not datafile_handlers: + if BINARY_HANDLERS_PRESENT: + datafile_handlers = BINARY_PACKAGE_DATAFILE_HANDLERS + else: + if TRACE: + logger_debug(f'_parse: no package datafile detected at {package_path}') + + return + for handler in datafile_handlers: if TRACE: logger_debug(f'_parse:.is_datafile: {handler}') diff --git a/src/scancode_config.py b/src/scancode_config.py index 9b6e2b7d075..520a0af9396 100644 --- a/src/scancode_config.py +++ b/src/scancode_config.py @@ -185,7 +185,13 @@ def _create_dir(location): __env_license_cache_dir = os.getenv('SCANCODE_LICENSE_INDEX_CACHE') licensedcode_cache_dir = (__env_license_cache_dir or std_license_cache_dir) + +std_package_cache_dir = join(scancode_src_dir, 'packagedcode', 'data', 'cache') +__env_package_cache_dir = os.getenv('SCANCODE_PACKAGE_INDEX_CACHE') +packagedcode_cache_dir = (__env_package_cache_dir or std_package_cache_dir) + _create_dir(licensedcode_cache_dir) +_create_dir(packagedcode_cache_dir) _create_dir(scancode_cache_dir) # - scancode_temp_dir: for short-lived temporary 
files which are import- or run- From 4438526050feb0ae6c600ca3ecf32783c9f226d5 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 17 Nov 2025 19:27:01 +0530 Subject: [PATCH 4/9] Add minimal tests for package cache Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/cache.py | 48 +++++++++++++----- src/packagedcode/data/.gitignore | 1 + .../package_patterns_index/index_cache | 1 + .../data/plugin/plugins_list_linux.txt | 2 +- tests/packagedcode/test_cache.py | 49 +++++++++++++++++++ 5 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 src/packagedcode/data/.gitignore create mode 100644 tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache create mode 100644 tests/packagedcode/test_cache.py diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index 6412f7c6e8c..ff596f80cf8 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -66,6 +66,8 @@ def load_or_build( scancode_cache_dir=scancode_cache_dir, force=False, timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, ): """ Load or build and save and return a PkgManifestPatternsCache object. 
@@ -88,7 +90,7 @@ def load_or_build( except Exception as e: # work around some rare Windows quirks import traceback - print('Inconsistent License cache: rebuilding index.') + print('Inconsistent Package cache: rebuilding index.') print(str(e)) print(traceback.format_exc()) @@ -102,13 +104,13 @@ def load_or_build( with lockfile.FileLock(lock_file).locked(timeout=timeout): system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( - datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + datafile_handlers=system_package_datafile_handlers, ) application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( - datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + datafile_handlers=application_package_datafile_handlers, ) package_cache = PkgManifestPatternsCache( - handler_by_regex=system_handlers_by_regex + application_handlers_by_regex, + handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, system_multiregex_patterns=system_multiregex_patterns, application_multiregex_patterns=application_multiregex_patterns, ) @@ -123,7 +125,11 @@ def dump(self, cache_file): """ Dump this package cache on disk at ``cache_file``. """ - package_cache = {} + package_cache = { + "handler_by_regex": self.handler_by_regex, + "system_multiregex_patterns": self.system_multiregex_patterns, + "application_multiregex_patterns": self.application_multiregex_patterns, + } with open(cache_file, 'w') as f: json.dump(package_cache, f) @@ -136,19 +142,23 @@ def get_prematchers_from_glob_pattern(pattern): ] -def build_mappings_and_multiregex_patterns( - datafile_handlers, -): +def build_mappings_and_multiregex_patterns(datafile_handlers): """ - Return an index built from rules and licenses directories + Return a mapping of regex patterns to datafile handler IDs and + multiregex patterns consisting of regex patterns and prematchers. 
""" + handler_by_regex = {} + multiregex_patterns = [] + + if not datafile_handlers: + return multiregex_patterns, handler_by_regex + with_patterns = [] for handler in datafile_handlers: if handler.path_patterns: with_patterns.append(handler) - handler_by_regex = {} prematchers_by_regex = {} for handler in with_patterns: @@ -163,16 +173,17 @@ def build_mappings_and_multiregex_patterns( else: handler_by_regex[regex_pattern]= [handler.datasource_id] - multiregex_patterns = [] for regex in handler_by_regex.keys(): regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) multiregex_patterns.append(regex_and_prematcher) - return handler_by_regex, multiregex_patterns + return multiregex_patterns, handler_by_regex def get_cache( force=False, + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, ): """ Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. @@ -197,4 +208,15 @@ def load_cache_file(cache_file): with open(cache_file) as f: cache = json.load(f) - return PkgManifestPatternsCache.from_mapping(cache) + # convert multiregex patterns from list to tuples while loading + cache_transformed = {"handler_by_regex": cache.get("handler_by_regex")} + cache_transformed["system_multiregex_patterns"] = [ + tuple(multiregex_pattern) + for multiregex_pattern in cache.get("system_multiregex_patterns") + ] + cache_transformed["application_multiregex_patterns"] = [ + tuple(multiregex_pattern) + for multiregex_pattern in cache.get("application_multiregex_patterns") + ] + + return PkgManifestPatternsCache.from_mapping(cache_transformed) diff --git a/src/packagedcode/data/.gitignore b/src/packagedcode/data/.gitignore new file mode 100644 index 00000000000..0a2101fab9b --- /dev/null +++ b/src/packagedcode/data/.gitignore @@ -0,0 +1 @@ +/cache/ diff --git a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache 
b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache new file mode 100644 index 00000000000..2c820bcff1c --- /dev/null +++ b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache @@ -0,0 +1 @@ +{"handler_by_regex": {"(?s:.*\\.ABOUT)\\Z": ["about_file"]}, "system_multiregex_patterns": [], "application_multiregex_patterns": [["(?s:.*\\.ABOUT)\\Z", [".about"]]]} \ No newline at end of file diff --git a/tests/packagedcode/data/plugin/plugins_list_linux.txt b/tests/packagedcode/data/plugin/plugins_list_linux.txt index e24512dfd91..eb4763d6c7e 100755 --- a/tests/packagedcode/data/plugin/plugins_list_linux.txt +++ b/tests/packagedcode/data/plugin/plugins_list_linux.txt @@ -410,7 +410,7 @@ Package type: gem documentation URL: https://bundler.io/man/gemfile.5.html primary language: Ruby description: RubyGems Bundler Gemfile - path_patterns: '*/Gemfile', '*/*.gemfile', '*/Gemfile-*' + path_patterns: '*/Gemfile', '*.gemfile', '*/Gemfile-*' -------------------------------------------- Package type: gem datasource_id: gemfile_extracted diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py new file mode 100644 index 00000000000..84614d17e59 --- /dev/null +++ b/tests/packagedcode/test_cache.py @@ -0,0 +1,49 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os.path + +from packagedcode import cache +from packages_test_utils import PackageTester +from scancode_config import REGEN_TEST_FIXTURES +from scancode.cli_test_utils import run_scan_click +from scancode.cli_test_utils import check_json_scan + + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + + +class TestMultiregexPatterns(PackageTester): + test_data_dir = TEST_DATA_DIR + + def test_build_mappings_and_multiregex_patterns_works(self): + from packagedcode.about import AboutFileHandler + + multiregex_patterns, handler_by_regex = cache.build_mappings_and_multiregex_patterns( + datafile_handlers=[AboutFileHandler], + ) + assert multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])] + assert handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']} + + def test_build_package_cache_works(self): + from packagedcode.about import AboutFileHandler + + package_cache_dir = self.get_test_loc('cache/package_patterns_index') + package_cache = cache.PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=package_cache_dir, + application_package_datafile_handlers=[AboutFileHandler], + system_package_datafile_handlers=[], + force=True, + ) + + assert not package_cache.system_multiregex_patterns + assert len(package_cache.application_multiregex_patterns) == 1 + assert '(?s:.*\\.ABOUT)\\Z' in package_cache.handler_by_regex + + From 3bbf35e14d8ae6134684e14c1f3b33d687301f32 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 18 Nov 2025 16:37:40 +0530 Subject: [PATCH 5/9] Cache multiregex matchers instead of patterns Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/cache.py | 83 +++++++++---------- src/packagedcode/recognize.py | 22 ++--- tests/packagedcode/data/cache/.gitignore | 1 + .../package_patterns_index/index_cache | 1 - tests/packagedcode/test_cache.py | 22 +++-- tests/packagedcode/test_recognize.py | 5 ++ 6 files changed, 71 insertions(+), 63 deletions(-) create mode 100644 
tests/packagedcode/data/cache/.gitignore delete mode 100644 tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index ff596f80cf8..163efd18f56 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -8,12 +8,13 @@ # import os -import json -import attr import fnmatch +import pickle +import multiregex -from commoncode.fileutils import create_dir +import attr +from commoncode.fileutils import create_dir from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS @@ -29,6 +30,9 @@ # global in-memory cache of the PkgManifestPatternsCache _PACKAGE_CACHE = None +# This is the Pickle protocol we use, which was added in Python 3.4. +PICKLE_PROTOCOL = 4 + PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 PACKAGE_INDEX_DIR = 'package_patterns_index' PACKAGE_INDEX_FILENAME = 'index_cache' @@ -45,23 +49,21 @@ class PkgManifestPatternsCache: """ handler_by_regex = attr.ib(default=attr.Factory(dict)) - system_multiregex_patterns = attr.ib(default=attr.Factory(list)) - application_multiregex_patterns = attr.ib(default=attr.Factory(list)) + system_package_matcher = attr.ib(default=None) + application_package_matcher = attr.ib(default=None) + all_package_matcher = attr.ib(default=None) @staticmethod - def all_multiregex_patterns(self): - return self.application_multiregex_patterns + [ + def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns): + return application_multiregex_patterns + [ multiregex_pattern - for multiregex_pattern in self.system_multiregex_patterns - if multiregex_pattern not in self.application_multiregex_patterns + for multiregex_pattern in system_multiregex_patterns + if multiregex_pattern not in application_multiregex_patterns ] @classmethod - def from_mapping(cls, cache_mapping): - return cls(**cache_mapping) - - @staticmethod def load_or_build( + cls, 
packagedcode_cache_dir=packagedcode_cache_dir, scancode_cache_dir=scancode_cache_dir, force=False, @@ -94,7 +96,6 @@ def load_or_build( print(str(e)) print(traceback.format_exc()) - from scancode import lockfile lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) @@ -109,29 +110,31 @@ def load_or_build( application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( datafile_handlers=application_package_datafile_handlers, ) - package_cache = PkgManifestPatternsCache( + all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns( + application_multiregex_patterns, system_multiregex_patterns, + ) + system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns) + application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns) + all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher) + package_cache = cls( handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, - system_multiregex_patterns=system_multiregex_patterns, - application_multiregex_patterns=application_multiregex_patterns, + system_package_matcher=system_package_matcher, + application_package_matcher=application_package_matcher, + all_package_matcher=all_package_matcher, ) package_cache.dump(cache_file) return package_cache except lockfile.LockTimeout: # TODO: handle unable to lock in a nicer way - raise + raise def dump(self, cache_file): """ - Dump this package cache on disk at ``cache_file``. + Dump this license cache on disk at ``cache_file``. 
""" - package_cache = { - "handler_by_regex": self.handler_by_regex, - "system_multiregex_patterns": self.system_multiregex_patterns, - "application_multiregex_patterns": self.application_multiregex_patterns, - } - with open(cache_file, 'w') as f: - json.dump(package_cache, f) + with open(cache_file, 'wb') as fn: + pickle.dump(self, fn, protocol=PICKLE_PROTOCOL) def get_prematchers_from_glob_pattern(pattern): @@ -203,20 +206,16 @@ def get_cache( def load_cache_file(cache_file): """ - Return a PkgManifestPatternsCache loaded from JSON ``cache_file``. + Return a PkgManifestPatternsCache loaded from ``cache_file``. """ - with open(cache_file) as f: - cache = json.load(f) - - # convert multiregex patterns from list to tuples while loading - cache_transformed = {"handler_by_regex": cache.get("handler_by_regex")} - cache_transformed["system_multiregex_patterns"] = [ - tuple(multiregex_pattern) - for multiregex_pattern in cache.get("system_multiregex_patterns") - ] - cache_transformed["application_multiregex_patterns"] = [ - tuple(multiregex_pattern) - for multiregex_pattern in cache.get("application_multiregex_patterns") - ] - - return PkgManifestPatternsCache.from_mapping(cache_transformed) + with open(cache_file, 'rb') as lfc: + try: + return pickle.load(lfc) + except Exception as e: + msg = ( + 'ERROR: Failed to load package cache (the file may be corrupted ?).\n' + f'Please delete "{cache_file}" and retry.\n' + 'If the problem persists, copy this error message ' + 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' + ) + raise Exception(msg) from e diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index bc3704fb64d..26c5f0d702b 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -10,8 +10,6 @@ import os import sys -import multiregex - from commoncode import filetype from commoncode.fileutils import as_posixpath @@ -87,13 +85,12 @@ def _parse( assert application or system or package_only 
if package_only or (application and system): - multiregex_patterns = package_patterns.all_multiregex_patterns + package_matcher = package_patterns.all_package_matcher elif application: - multiregex_patterns = package_patterns.application_multiregex_patterns + package_matcher = package_patterns.application_package_matcher elif system: - multiregex_patterns = package_patterns.system_multiregex_patterns + package_matcher = package_patterns.system_package_matcher - package_matcher = multiregex.RegexMatcher(multiregex_patterns) matched_patterns = package_matcher.match(package_path) datafile_handlers = [] @@ -103,19 +100,14 @@ def _parse( if TRACE: logger_debug(f'_parse:.handler_ids: {handler_ids}') - datafile_handlers = [ + datafile_handlers.extend([ HANDLER_BY_DATASOURCE_ID.get(handler_id) for handler_id in handler_ids - ] + ]) if not datafile_handlers: - if BINARY_HANDLERS_PRESENT: - datafile_handlers = BINARY_PACKAGE_DATAFILE_HANDLERS - else: - if TRACE: - logger_debug(f'_parse: no package datafile detected at {package_path}') - - return + if TRACE: + logger_debug(f'_parse: no package datafile detected at {package_path}') for handler in datafile_handlers: if TRACE: diff --git a/tests/packagedcode/data/cache/.gitignore b/tests/packagedcode/data/cache/.gitignore new file mode 100644 index 00000000000..a738fbc8f7f --- /dev/null +++ b/tests/packagedcode/data/cache/.gitignore @@ -0,0 +1 @@ +/package_patterns_index/ \ No newline at end of file diff --git a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache b/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache deleted file mode 100644 index 2c820bcff1c..00000000000 --- a/tests/packagedcode/data/cache/package_patterns_index/package_patterns_index/index_cache +++ /dev/null @@ -1 +0,0 @@ -{"handler_by_regex": {"(?s:.*\\.ABOUT)\\Z": ["about_file"]}, "system_multiregex_patterns": [], "application_multiregex_patterns": [["(?s:.*\\.ABOUT)\\Z", [".about"]]]} \ 
No newline at end of file diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py index 84614d17e59..98951d9fc8f 100644 --- a/tests/packagedcode/test_cache.py +++ b/tests/packagedcode/test_cache.py @@ -10,6 +10,8 @@ import os.path from packagedcode import cache +from commoncode.fileutils import as_posixpath + from packages_test_utils import PackageTester from scancode_config import REGEN_TEST_FIXTURES from scancode.cli_test_utils import run_scan_click @@ -33,17 +35,27 @@ def test_build_mappings_and_multiregex_patterns_works(self): def test_build_package_cache_works(self): from packagedcode.about import AboutFileHandler + from packagedcode.bower import BowerJsonHandler - package_cache_dir = self.get_test_loc('cache/package_patterns_index') + package_cache_dir = self.get_test_loc('cache/') package_cache = cache.PkgManifestPatternsCache.load_or_build( packagedcode_cache_dir=package_cache_dir, application_package_datafile_handlers=[AboutFileHandler], - system_package_datafile_handlers=[], + system_package_datafile_handlers=[BowerJsonHandler], force=True, ) + test_path = "scancode-toolkit.ABOUT" + + assert not package_cache.system_package_matcher.match(test_path) + assert package_cache.application_package_matcher.match(test_path) - assert not package_cache.system_multiregex_patterns - assert len(package_cache.application_multiregex_patterns) == 1 - assert '(?s:.*\\.ABOUT)\\Z' in package_cache.handler_by_regex + regex, _match = package_cache.all_package_matcher.match(test_path).pop() + assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id + + def check_empty_file_scan_works(self): + test_file = self.get_test_loc('cache/.gitignore') + package_path = as_posixpath(test_file) + package_matcher = cache.get_cache() + assert not package_matcher.match(package_path) diff --git a/tests/packagedcode/test_recognize.py b/tests/packagedcode/test_recognize.py index f7736aeeb61..98a50164321 100644 --- 
a/tests/packagedcode/test_recognize.py +++ b/tests/packagedcode/test_recognize.py @@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self): packages = recognize_package_data(test_file, system=True) assert packages assert isinstance(packages[0], models.PackageData) + + def test_recognize_non_package_manifest_file(self): + test_file = self.get_test_loc('cache/.gitignore') + packages = recognize_package_data(test_file) + assert not packages From e0460ef7add32caba7f4ff721add2f305bd0bb3b Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 18 Nov 2025 18:27:52 +0530 Subject: [PATCH 6/9] Restore binary package manifest scanning Signed-off-by: Ayan Sinha Mahapatra --- src/packagedcode/recognize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index 26c5f0d702b..1c9e85ec570 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -106,7 +106,9 @@ def _parse( ]) if not datafile_handlers: - if TRACE: + if BINARY_HANDLERS_PRESENT: + datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) + elif TRACE: logger_debug(f'_parse: no package datafile detected at {package_path}') for handler in datafile_handlers: From dde6bc9710ad0f16e1cfde69e3e7b29eb2ea057d Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 19 Nov 2025 15:00:10 +0530 Subject: [PATCH 7/9] Only scan for binary packages optionally Introduce a new option --binary-package which looks for package/dependency data in binaries. 
Signed-off-by: Ayan Sinha Mahapatra --- docs/source/rst_snippets/basic_options.rst | 5 ++++ src/packagedcode/__init__.py | 7 +---- src/packagedcode/plugin_package.py | 32 ++++++++++++++++++---- src/packagedcode/recognize.py | 27 ++++++++++++------ src/scancode/api.py | 22 +++++++++++++-- tests/packagedcode/test_cargo.py | 2 +- tests/scancode/data/help/help.txt | 17 +++++++----- tests/scancode/data/help/help_linux.txt | 21 ++++++++------ 8 files changed, 94 insertions(+), 39 deletions(-) diff --git a/docs/source/rst_snippets/basic_options.rst b/docs/source/rst_snippets/basic_options.rst index d01fbf72a6c..83caf28f406 100644 --- a/docs/source/rst_snippets/basic_options.rst +++ b/docs/source/rst_snippets/basic_options.rst @@ -33,6 +33,11 @@ documenting a program's options. For example: --system-package Scan ```` for installed system package databases. +-b, --binary-package Scan for package and dependency related + data in binaries. Note that looking for packages + in binaries makes package scan slower. + Currently supported binaries: Go, Rust. + --package-only Scan ```` for system and application only for package metadata, without license/ copyright detection and package assembly. 
diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index d65e535bee6..8626fcf7ff6 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -251,24 +251,19 @@ # detect these binaries instead of datafile path patterns # as these are optionally installed, we can skip checking # for filetype if these are not available -BINARY_HANDLERS_PRESENT = False BINARY_PACKAGE_DATAFILE_HANDLERS = [] try: from go_inspector.binary import get_go_binary_handler handler = get_go_binary_handler() - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) - BINARY_HANDLERS_PRESENT = True except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler handler = get_rust_binary_handler() - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(handler) BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) - BINARY_HANDLERS_PRESENT = True except ImportError: pass @@ -276,7 +271,7 @@ APPLICATION_PACKAGE_DATAFILE_HANDLERS + [ p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS - ] + ] + BINARY_PACKAGE_DATAFILE_HANDLERS ) # registry of all handler classes keyed by datasource_id diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index 8dc993e3b7a..dc55e0053fb 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -170,6 +170,20 @@ class PackageScanner(ScanPlugin): help_group=SCAN_GROUP, sort_order=21, ), + PluggableCommandLineOption( + ( + '-b', + '--binary-package', + ), + is_flag=True, + default=False, + help=( + 'Scan for package and dependency related data in binaries. ' + 'Currently supported binaries: Go, Rust.' + ), + help_group=SCAN_GROUP, + sort_order=22, + ), PluggableCommandLineOption( ( '--package-only', @@ -182,7 +196,7 @@ class PackageScanner(ScanPlugin): 'license/copyright detection and top-level package creation.' 
), help_group=SCAN_GROUP, - sort_order=22, + sort_order=23, ), PluggableCommandLineOption( ('--list-packages',), @@ -195,10 +209,17 @@ class PackageScanner(ScanPlugin): ), ] - def is_enabled(self, package, system_package, package_only, **kwargs): - return package or system_package or package_only + def is_enabled(self, package, system_package, binary_package, package_only, **kwargs): + return package or system_package or binary_package or package_only - def get_scanner(self, package=True, system_package=False, package_only=False, **kwargs): + def get_scanner( + self, + package=True, + system_package=False, + binary_package=False, + package_only=False, + **kwargs + ): """ Return a scanner callable to scan a file for package data. """ @@ -208,6 +229,7 @@ def get_scanner(self, package=True, system_package=False, package_only=False, ** get_package_data, application=package, system=system_package, + binary=binary_package, package_only=package_only, ) @@ -464,7 +486,7 @@ def get_package_and_deps(codebase, package_adder=add_to_package, strip_root=Fals resource.scan_errors.append(msg) resource.save(codebase) - if TRACE: + if TRACE_ASSEMBLY: raise Exception(msg) from e return packages, dependencies diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index 1c9e85ec570..f60107c6904 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -14,7 +14,6 @@ from commoncode.fileutils import as_posixpath from packagedcode import HANDLER_BY_DATASOURCE_ID -from packagedcode import BINARY_HANDLERS_PRESENT from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS from packagedcode import models from packagedcode.cache import get_cache @@ -47,6 +46,7 @@ def recognize_package_data( location, application=True, system=False, + binary=False, package_only=False, ): """ @@ -61,9 +61,10 @@ def recognize_package_data( return list(_parse( location=location, - package_only=package_only, application=application, system=system, + binary=binary, + 
package_only=package_only, )) @@ -71,6 +72,7 @@ def _parse( location, application=True, system=False, + binary=False, package_only=False, ): """ @@ -83,7 +85,8 @@ def _parse( package_path = as_posixpath(location) package_patterns = get_cache() - assert application or system or package_only + has_patterns = application or system or package_only + assert has_patterns or binary if package_only or (application and system): package_matcher = package_patterns.all_package_matcher elif application: @@ -91,22 +94,30 @@ def _parse( elif system: package_matcher = package_patterns.system_package_matcher - matched_patterns = package_matcher.match(package_path) + matched_patterns = [] + if has_patterns: + matched_patterns = package_matcher.match(package_path) - datafile_handlers = [] + all_handler_ids = [] for matched_pattern in matched_patterns: regex, _match = matched_pattern handler_ids = package_patterns.handler_by_regex.get(regex.pattern) if TRACE: logger_debug(f'_parse:.handler_ids: {handler_ids}') - datafile_handlers.extend([ - HANDLER_BY_DATASOURCE_ID.get(handler_id) + all_handler_ids.extend([ + handler_id for handler_id in handler_ids + if handler_id not in all_handler_ids ]) + datafile_handlers = [ + HANDLER_BY_DATASOURCE_ID.get(handler_id) + for handler_id in all_handler_ids + ] + if not datafile_handlers: - if BINARY_HANDLERS_PRESENT: + if binary: datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) elif TRACE: logger_debug(f'_parse: no package datafile detected at {package_path}') diff --git a/src/scancode/api.py b/src/scancode/api.py index 94592e20ce1..d06af7dcf45 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -256,20 +256,28 @@ def get_licenses( SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) -def _get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def _get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): 
""" Return a mapping of package manifest information detected in the file at ``location``. Include ``application`` packages (such as pypi) and/or ``system`` packages. Note that all exceptions are caught if there are any errors while parsing a package manifest. """ - assert application or system or package_only + assert application or system or binary or package_only from packagedcode.recognize import recognize_package_data try: return recognize_package_data( location=location, application=application, system=system, + binary=binary, package_only=package_only, ) or [] @@ -300,7 +308,14 @@ def get_package_info(location, **kwargs): return dict(packages=[p.to_dict() for p in packages]) -def get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): """ Return a mapping of package manifest information detected in the file at `location`. @@ -313,6 +328,7 @@ def get_package_data(location, application=True, system=False, package_only=Fals location=location, application=application, system=system, + binary=binary, package_only=package_only, **kwargs, ) or [] diff --git a/tests/packagedcode/test_cargo.py b/tests/packagedcode/test_cargo.py index b71634aa8a1..5b22b69e193 100644 --- a/tests/packagedcode/test_cargo.py +++ b/tests/packagedcode/test_cargo.py @@ -159,7 +159,7 @@ def test_scan_works_on_rust_binary_with_inspector(self): test_file = self.get_test_loc('cargo/binary/cargo_dependencies') expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json') result_file = self.get_temp_file('results.json') - run_scan_click(['--package', test_file, '--json', result_file]) + run_scan_click(['--binary-package', test_file, '--json', result_file]) check_json_scan( expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES ) diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt 
index 8a486871b5d..2c45a354b31 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -8,13 +8,16 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. other scans: -i, --info Scan for file information (size, checksums, etc). diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 9ca1d26d68a..5d7b1dfed92 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -8,15 +8,18 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. - --go-symbol Collect Go symbols. - --rust-symbol Collect Rust symbols from rust binaries. + -l, --license Scan for licenses. 
+ -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. + --go-symbol Collect Go symbols. + --rust-symbol Collect Rust symbols from rust binaries. other scans: -i, --info Scan for file information (size, checksums, etc). From 5cd5f74bc27b6da1ff8371b8198455696d654533 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 19 Nov 2025 16:15:02 +0530 Subject: [PATCH 8/9] Do not setup license index on --package-only We do not need the license index in a --package-only scan as this is designed to do a fast package detection only scan which skips the license detection. As license index loading takes a couple seconds in each case, this makes the package only scan much faster. Signed-off-by: Ayan Sinha Mahapatra --- src/licensedcode/plugin_license.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py index 5c42f96760a..717253c4baa 100644 --- a/src/licensedcode/plugin_license.py +++ b/src/licensedcode/plugin_license.py @@ -152,6 +152,9 @@ def setup(self, **kwargs): This is a cache warmup such that child process inherit from the loaded index. 
""" + if kwargs.get("package_only"): + return + from licensedcode.cache import populate_cache populate_cache() From 6b6a79b8a1c0b9789a466df4c5623ab723890a76 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Dec 2025 19:45:46 +0530 Subject: [PATCH 9/9] Add a new console script to build the package patterns cache Signed-off-by: Ayan Sinha Mahapatra --- Dockerfile | 6 ++++-- etc/release/scancode-create-pypi-wheel.sh | 1 + etc/release/scancode-create-release-app-linux.sh | 1 + etc/release/scancode-create-release-app-macos.sh | 1 + .../scancode-create-release-app-windows.sh | 1 + setup-mini.cfg | 1 + setup.cfg | 1 + src/packagedcode/cache.py | 15 +++++++++++++++ 8 files changed, 25 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d4c641d7a2f..17e28ad0930 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,9 +38,11 @@ WORKDIR /scancode-toolkit COPY . /scancode-toolkit # Initial configuration using ./configure, scancode-reindex-licenses to build -# the base license index +# the base license index and scancode-cache-package-patterns to build the +# package patterns cache RUN ./configure \ - && ./venv/bin/scancode-reindex-licenses + && ./venv/bin/scancode-reindex-licenses \ + && ./venv/bin/scancode-cache-package-patterns # Add scancode to path ENV PATH=/scancode-toolkit:$PATH diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh index 5ab2fe8e988..4915695bae8 100755 --- a/etc/release/scancode-create-pypi-wheel.sh +++ b/etc/release/scancode-create-pypi-wheel.sh @@ -19,6 +19,7 @@ set -e ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" ) diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh index fbe5951a937..ab6a4314d6d 100755 --- a/etc/release/scancode-create-release-app-linux.sh +++ 
b/etc/release/scancode-create-release-app-linux.sh @@ -65,6 +65,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh index 5f34bf88f28..41c804137bb 100755 --- a/etc/release/scancode-create-release-app-macos.sh +++ b/etc/release/scancode-create-release-app-macos.sh @@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh index 03a22d7117a..e4dba1b9b2f 100755 --- a/etc/release/scancode-create-release-app-windows.sh +++ b/etc/release/scancode-create-release-app-windows.sh @@ -62,6 +62,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/setup-mini.cfg b/setup-mini.cfg index 7251d59715a..bfb24e2dd33 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -157,6 +157,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/setup.cfg b/setup.cfg index c02d1ce9f13..f156833e463 100644 --- 
a/setup.cfg +++ b/setup.cfg @@ -159,6 +159,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py index 163efd18f56..92320379887 100644 --- a/src/packagedcode/cache.py +++ b/src/packagedcode/cache.py @@ -13,7 +13,9 @@ import multiregex import attr +import click +from commoncode.cliutils import PluggableCommandLineOption from commoncode.fileutils import create_dir from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS @@ -219,3 +221,16 @@ def load_cache_file(cache_file): 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' ) raise Exception(msg) from e + + +@click.command(name='scancode-cache-package-patterns') +@click.help_option('-h', '--help') +def cache_package_patterns(*args, **kwargs): + """Create scancode package manifest patterns cache and exit""" + click.echo('Rebuilding the package cache patterns...') + get_cache(force=True) + click.echo('Done.') + + +if __name__ == '__main__': + cache_package_patterns()