diff --git a/Dockerfile b/Dockerfile
index d4c641d7a2f..17e28ad0930 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,9 +38,11 @@ WORKDIR /scancode-toolkit
COPY . /scancode-toolkit
# Initial configuration using ./configure, scancode-reindex-licenses to build
-# the base license index
+# the base license index and scancode-cache-package-patterns to build the
+# package patterns cache
RUN ./configure \
- && ./venv/bin/scancode-reindex-licenses
+ && ./venv/bin/scancode-reindex-licenses \
+ && ./venv/bin/scancode-cache-package-patterns
# Add scancode to path
ENV PATH=/scancode-toolkit:$PATH
diff --git a/docs/source/rst_snippets/basic_options.rst b/docs/source/rst_snippets/basic_options.rst
index d01fbf72a6c..83caf28f406 100644
--- a/docs/source/rst_snippets/basic_options.rst
+++ b/docs/source/rst_snippets/basic_options.rst
@@ -33,6 +33,11 @@ documenting a program's options. For example:
--system-package Scan ```` for installed system package
databases.
+-b, --binary-package Scan for package and dependency related
+ data in binaries. Note that looking for packages
+ in binaries makes package scan slower.
+ Currently supported binaries: Go, Rust.
+
--package-only Scan ```` for system and application
only for package metadata, without license/
copyright detection and package assembly.
diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh
index 5ab2fe8e988..4915695bae8 100755
--- a/etc/release/scancode-create-pypi-wheel.sh
+++ b/etc/release/scancode-create-pypi-wheel.sh
@@ -19,6 +19,7 @@ set -e
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-cache-package-patterns
python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" )
diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh
index fbe5951a937..ab6a4314d6d 100755
--- a/etc/release/scancode-create-release-app-linux.sh
+++ b/etc/release/scancode-create-release-app-linux.sh
@@ -65,6 +65,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-cache-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh
index 5f34bf88f28..41c804137bb 100755
--- a/etc/release/scancode-create-release-app-macos.sh
+++ b/etc/release/scancode-create-release-app-macos.sh
@@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-cache-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh
index 03a22d7117a..e4dba1b9b2f 100755
--- a/etc/release/scancode-create-release-app-windows.sh
+++ b/etc/release/scancode-create-release-app-windows.sh
@@ -62,6 +62,7 @@ cp -r etc/thirdparty $release_dir/etc
# Build the wheel
./configure --dev
venv/bin/scancode-reindex-licenses
+venv/bin/scancode-cache-package-patterns
venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version
cp -r \
diff --git a/requirements.txt b/requirements.txt
index 8d7b458c84d..f9e6b6a0a28 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,6 +40,7 @@ license-expression==30.4.4
lxml==5.4.0
MarkupSafe==3.0.2
more-itertools==10.7.0
+multiregex==2.0.3
normality==2.6.1
packageurl-python==0.17.1
packaging==25.0
diff --git a/setup-mini.cfg b/setup-mini.cfg
index 8f3a043d8af..bfb24e2dd33 100644
--- a/setup-mini.cfg
+++ b/setup-mini.cfg
@@ -89,6 +89,7 @@ install_requires =
license_expression >= 30.4.4
lxml >= 5.4.0
MarkupSafe >= 2.1.2
+ multiregex >= 2.0.3
normality <= 2.6.1
packageurl_python >= 0.9.0
packvers >= 21.0.0
@@ -156,6 +157,7 @@ packages =
console_scripts =
scancode = scancode.cli:scancode
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
+ scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases
diff --git a/setup.cfg b/setup.cfg
index 770b70542b3..f156833e463 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -74,6 +74,7 @@ install_requires =
colorama >= 0.3.9
commoncode >= 32.4.0
container-inspector >= 31.0.0
+ cyseq >= 0.0.2
debian-inspector >= 31.1.0
dparse2 >= 0.7.0
fasteners
@@ -90,6 +91,7 @@ install_requires =
license_expression >= 30.4.4
lxml >= 5.4.0
MarkupSafe >= 2.1.2
+ multiregex >= 2.0.3
normality <= 2.6.1
packageurl_python >= 0.9.0
packvers >= 21.0.0
@@ -116,7 +118,6 @@ install_requires =
typecode >= 30.0.1
typecode[full] >= 30.0.1
extractcode[full] >= 31.0.0
- cyseq >= 0.0.2
[options.packages.find]
@@ -158,6 +159,7 @@ packages =
console_scripts =
scancode = scancode.cli:scancode
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
+ scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases
diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py
index 5c42f96760a..717253c4baa 100644
--- a/src/licensedcode/plugin_license.py
+++ b/src/licensedcode/plugin_license.py
@@ -152,6 +152,9 @@ def setup(self, **kwargs):
This is a cache warmup such that child process inherit from the
loaded index.
"""
+ if kwargs.get("package_only"):
+ return
+
from licensedcode.cache import populate_cache
populate_cache()
diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py
index 9cc46d0e09b..8626fcf7ff6 100644
--- a/src/packagedcode/__init__.py
+++ b/src/packagedcode/__init__.py
@@ -246,15 +246,24 @@
win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler,
]
+
+# These handlers are special: they use filetype detection to identify these
+# binaries instead of datafile path patterns. Because the inspectors that
+# provide them are optional installs, we can skip the filetype checks when
+# they are not available.
+BINARY_PACKAGE_DATAFILE_HANDLERS = []
+
try:
from go_inspector.binary import get_go_binary_handler
- APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler())
+ handler = get_go_binary_handler()
+ BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler)
except ImportError:
pass
try:
from rust_inspector.packages import get_rust_binary_handler
- APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler())
+ handler = get_rust_binary_handler()
+ BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler)
except ImportError:
pass
@@ -262,7 +271,7 @@
APPLICATION_PACKAGE_DATAFILE_HANDLERS + [
p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS
if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS
- ]
+ ] + BINARY_PACKAGE_DATAFILE_HANDLERS
)
# registry of all handler classes keyed by datasource_id
diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py
new file mode 100644
index 00000000000..92320379887
--- /dev/null
+++ b/src/packagedcode/cache.py
@@ -0,0 +1,236 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import os
+import fnmatch
+import pickle
+import multiregex
+
+import attr
+import click
+
+from commoncode.cliutils import PluggableCommandLineOption
+from commoncode.fileutils import create_dir
+from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
+from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS
+
+from scancode_config import packagedcode_cache_dir
+from scancode_config import scancode_cache_dir
+
+"""
+An on-disk persistent cache of package manifest patterns and of the mapping
+from these patterns to their package datafile handlers. Loading and dumping
+the cached patterns is safe across multiple processes thanks to lock files.
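+
+A typical use, as an illustrative sketch of what the recognition code does
+(actual matches depend on the installed datafile handlers):
+
+    from packagedcode.cache import get_cache
+
+    patterns_cache = get_cache()
+    for regex, _match in patterns_cache.application_package_matcher.match("setup.py"):
+        handler_ids = patterns_cache.handler_by_regex.get(regex.pattern)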
+"""
+
+# global in-memory cache of the PkgManifestPatternsCache
+_PACKAGE_CACHE = None
+
+# This is the Pickle protocol we use, which was added in Python 3.4.
+PICKLE_PROTOCOL = 4
+
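+# Maximum time in seconds to wait for the package index lock file (6 minutes).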
+PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6
+PACKAGE_INDEX_DIR = 'package_patterns_index'
+PACKAGE_INDEX_FILENAME = 'index_cache'
+PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile'
+PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums'
+
+
+@attr.s
+class PkgManifestPatternsCache:
+ """
+    Represent cacheable package manifest regex patterns, their prematchers,
+    and the mapping from regex patterns to datasource IDs for all datafile
+    handlers.
+ """
+
+ handler_by_regex = attr.ib(default=attr.Factory(dict))
+ system_package_matcher = attr.ib(default=None)
+ application_package_matcher = attr.ib(default=None)
+ all_package_matcher = attr.ib(default=None)
+
+ @staticmethod
+ def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns):
+ return application_multiregex_patterns + [
+ multiregex_pattern
+ for multiregex_pattern in system_multiregex_patterns
+ if multiregex_pattern not in application_multiregex_patterns
+ ]
+
+ @classmethod
+ def load_or_build(
+ cls,
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+ force=False,
+ timeout=PACKAGE_INDEX_LOCK_TIMEOUT,
+ system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS,
+ application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS,
+ ):
+ """
+        Load or build, save, and return a PkgManifestPatternsCache object.
+
+        We either load a cached PkgManifestPatternsCache or build and cache
+        the patterns:
+
+        - If the cache exists and is not corrupted, it is returned as-is.
+        - If ``force`` is True, or if the cache does not exist, a new index is
+          built and cached.
+ """
+ idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR)
+ create_dir(idx_cache_dir)
+ cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME)
+ has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file)
+
+ # bypass build if cache exists
+ if has_cache and not force:
+ try:
+ return load_cache_file(cache_file)
+ except Exception as e:
+ # work around some rare Windows quirks
+ import traceback
+ print('Inconsistent Package cache: rebuilding index.')
+ print(str(e))
+ print(traceback.format_exc())
+
+ from scancode import lockfile
+ lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME)
+
+ # here, we have no cache: lock, check and rebuild
+ try:
+ # acquire lock and wait until timeout to get a lock or die
+ with lockfile.FileLock(lock_file).locked(timeout=timeout):
+
+ system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns(
+ datafile_handlers=system_package_datafile_handlers,
+ )
+ application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns(
+ datafile_handlers=application_package_datafile_handlers,
+ )
+                all_multiregex_patterns = PkgManifestPatternsCache.all_multiregex_patterns(
+ application_multiregex_patterns, system_multiregex_patterns,
+ )
+ system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns)
+ application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns)
+                all_package_matcher = multiregex.RegexMatcher(all_multiregex_patterns)
+ package_cache = cls(
+ handler_by_regex=system_handlers_by_regex | application_handlers_by_regex,
+ system_package_matcher=system_package_matcher,
+ application_package_matcher=application_package_matcher,
+ all_package_matcher=all_package_matcher,
+ )
+ package_cache.dump(cache_file)
+ return package_cache
+
+ except lockfile.LockTimeout:
+ # TODO: handle unable to lock in a nicer way
+ raise
+
+ def dump(self, cache_file):
+ """
+        Dump this package patterns cache to disk at ``cache_file``.
+ """
+ with open(cache_file, 'wb') as fn:
+ pickle.dump(self, fn, protocol=PICKLE_PROTOCOL)
+
+
+def get_prematchers_from_glob_pattern(pattern):
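+    """
+    Return a list of lowercased prematcher strings derived from a glob
+    ``pattern``, splitting on wildcards. For example (illustrative),
+    "*/Gemfile-*" yields ["gemfile-"] and "*.ABOUT" yields [".about"].
+    """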
+ return [
+ prematcher.lower().lstrip("/")
+ for prematcher in pattern.split("*")
+ if prematcher
+ ]
+
+
+def build_mappings_and_multiregex_patterns(datafile_handlers):
+ """
+    Return a two-tuple of (multiregex patterns, handler mapping) for the given
+    ``datafile_handlers``: a list of (regex pattern, prematchers) tuples, and a
+    mapping of each regex pattern to a list of datafile handler datasource IDs.
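+
+    For example (an illustrative sketch based on the ABOUT file handler, whose
+    only path pattern is "*.ABOUT"):
+
+        multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])]
+        handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']}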
+ """
+ handler_by_regex = {}
+ multiregex_patterns = []
+
+ if not datafile_handlers:
+ return multiregex_patterns, handler_by_regex
+
+ with_patterns = []
+
+ for handler in datafile_handlers:
+ if handler.path_patterns:
+ with_patterns.append(handler)
+
+ prematchers_by_regex = {}
+
+ for handler in with_patterns:
+ for pattern in handler.path_patterns:
+ regex_pattern = fnmatch.translate(pattern)
+ regex_pattern = fr"{regex_pattern}"
+
+ prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern)
+
+ if regex_pattern in handler_by_regex:
+ handler_by_regex[regex_pattern].append(handler.datasource_id)
+ else:
+                handler_by_regex[regex_pattern] = [handler.datasource_id]
+
+ for regex in handler_by_regex.keys():
+ regex_and_prematcher = (regex, prematchers_by_regex.get(regex, []))
+ multiregex_patterns.append(regex_and_prematcher)
+
+ return multiregex_patterns, handler_by_regex
+
+
+def get_cache(
+ force=False,
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+):
+ """
+    Return a PkgManifestPatternsCache, either from the in-memory cache, loaded
+    from the on-disk cache, or rebuilt when ``force`` is True.
+ """
+ global _PACKAGE_CACHE
+
+ if force or not _PACKAGE_CACHE:
+ _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build(
+ packagedcode_cache_dir=packagedcode_cache_dir,
+ scancode_cache_dir=scancode_cache_dir,
+ force=force,
+ # used for testing only
+ timeout=PACKAGE_INDEX_LOCK_TIMEOUT,
+ )
+ return _PACKAGE_CACHE
+
+
+def load_cache_file(cache_file):
+ """
+ Return a PkgManifestPatternsCache loaded from ``cache_file``.
+ """
+ with open(cache_file, 'rb') as lfc:
+ try:
+ return pickle.load(lfc)
+ except Exception as e:
+ msg = (
+                'ERROR: Failed to load package cache (the file may be corrupted?).\n'
+ f'Please delete "{cache_file}" and retry.\n'
+ 'If the problem persists, copy this error message '
+ 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/'
+ )
+ raise Exception(msg) from e
+
+
+@click.command(name='scancode-cache-package-patterns')
+@click.help_option('-h', '--help')
+def cache_package_patterns(*args, **kwargs):
+ """Create scancode package manifest patterns cache and exit"""
+    click.echo('Rebuilding the package patterns cache...')
+ get_cache(force=True)
+ click.echo('Done.')
+
+
+if __name__ == '__main__':
+ cache_package_patterns()
diff --git a/src/packagedcode/data/.gitignore b/src/packagedcode/data/.gitignore
new file mode 100644
index 00000000000..0a2101fab9b
--- /dev/null
+++ b/src/packagedcode/data/.gitignore
@@ -0,0 +1 @@
+/cache/
diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py
index 8dc993e3b7a..dc55e0053fb 100644
--- a/src/packagedcode/plugin_package.py
+++ b/src/packagedcode/plugin_package.py
@@ -170,6 +170,20 @@ class PackageScanner(ScanPlugin):
help_group=SCAN_GROUP,
sort_order=21,
),
+ PluggableCommandLineOption(
+ (
+ '-b',
+ '--binary-package',
+ ),
+ is_flag=True,
+ default=False,
+ help=(
+ 'Scan for package and dependency related data in binaries. '
+ 'Currently supported binaries: Go, Rust.'
+ ),
+ help_group=SCAN_GROUP,
+ sort_order=22,
+ ),
PluggableCommandLineOption(
(
'--package-only',
@@ -182,7 +196,7 @@ class PackageScanner(ScanPlugin):
'license/copyright detection and top-level package creation.'
),
help_group=SCAN_GROUP,
- sort_order=22,
+ sort_order=23,
),
PluggableCommandLineOption(
('--list-packages',),
@@ -195,10 +209,17 @@ class PackageScanner(ScanPlugin):
),
]
- def is_enabled(self, package, system_package, package_only, **kwargs):
- return package or system_package or package_only
+ def is_enabled(self, package, system_package, binary_package, package_only, **kwargs):
+ return package or system_package or binary_package or package_only
- def get_scanner(self, package=True, system_package=False, package_only=False, **kwargs):
+ def get_scanner(
+ self,
+ package=True,
+ system_package=False,
+ binary_package=False,
+ package_only=False,
+ **kwargs
+ ):
"""
Return a scanner callable to scan a file for package data.
"""
@@ -208,6 +229,7 @@ def get_scanner(self, package=True, system_package=False, package_only=False, **
get_package_data,
application=package,
system=system_package,
+ binary=binary_package,
package_only=package_only,
)
@@ -464,7 +486,7 @@ def get_package_and_deps(codebase, package_adder=add_to_package, strip_root=Fals
resource.scan_errors.append(msg)
resource.save(codebase)
- if TRACE:
+ if TRACE_ASSEMBLY:
raise Exception(msg) from e
return packages, dependencies
diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py
index e41d29c82df..f60107c6904 100644
--- a/src/packagedcode/recognize.py
+++ b/src/packagedcode/recognize.py
@@ -11,10 +11,12 @@
import sys
from commoncode import filetype
-from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
-from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS
-from packagedcode import ALL_DATAFILE_HANDLERS
+from commoncode.fileutils import as_posixpath
+
+from packagedcode import HANDLER_BY_DATASOURCE_ID
+from packagedcode import BINARY_PACKAGE_DATAFILE_HANDLERS
from packagedcode import models
+from packagedcode.cache import get_cache
TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)
@@ -44,6 +46,7 @@ def recognize_package_data(
location,
application=True,
system=False,
+ binary=False,
package_only=False,
):
"""
@@ -56,25 +59,21 @@ def recognize_package_data(
if not filetype.is_file(location):
return []
- assert application or system or package_only
- if package_only or (application and system):
- datafile_handlers = ALL_DATAFILE_HANDLERS
- elif application:
- datafile_handlers = APPLICATION_PACKAGE_DATAFILE_HANDLERS
- elif system:
- datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS
-
return list(_parse(
location=location,
+ application=application,
+ system=system,
+ binary=binary,
package_only=package_only,
- datafile_handlers=datafile_handlers,
))
def _parse(
location,
+ application=True,
+ system=False,
+ binary=False,
package_only=False,
- datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS,
):
"""
Yield parsed PackageData objects from ``location``. Raises Exceptions on errors.
@@ -83,6 +82,46 @@ def _parse(
Default to use application packages
"""
+ package_path = as_posixpath(location)
+ package_patterns = get_cache()
+
+ has_patterns = application or system or package_only
+ assert has_patterns or binary
+ if package_only or (application and system):
+ package_matcher = package_patterns.all_package_matcher
+ elif application:
+ package_matcher = package_patterns.application_package_matcher
+ elif system:
+ package_matcher = package_patterns.system_package_matcher
+
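+    # Match the path against the cached multiregex patterns for the selected
+    # package kinds, then map each matching regex back to its datafile
+    # handlers through their datasource_ids.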
+ matched_patterns = []
+ if has_patterns:
+ matched_patterns = package_matcher.match(package_path)
+
+ all_handler_ids = []
+ for matched_pattern in matched_patterns:
+ regex, _match = matched_pattern
+ handler_ids = package_patterns.handler_by_regex.get(regex.pattern)
+ if TRACE:
+ logger_debug(f'_parse:.handler_ids: {handler_ids}')
+
+ all_handler_ids.extend([
+ handler_id
+ for handler_id in handler_ids
+ if handler_id not in all_handler_ids
+ ])
+
+ datafile_handlers = [
+ HANDLER_BY_DATASOURCE_ID.get(handler_id)
+ for handler_id in all_handler_ids
+ ]
+
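+    # When no path pattern matched, optionally fall back to the binary
+    # handlers, which detect Go and Rust binaries by filetype rather than by
+    # datafile path patterns.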
+ if not datafile_handlers:
+ if binary:
+ datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS)
+ elif TRACE:
+ logger_debug(f'_parse: no package datafile detected at {package_path}')
+
for handler in datafile_handlers:
if TRACE:
logger_debug(f'_parse:.is_datafile: {handler}')
diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py
index 9cbbf6d7553..e80295c48af 100644
--- a/src/packagedcode/rubygems.py
+++ b/src/packagedcode/rubygems.py
@@ -211,7 +211,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
# TODO: https://stackoverflow.com/questions/41454333/meaning-of-new-block-git-sourcegithub-in-gemfile
class GemfileHandler(GemspecHandler):
datasource_id = 'gemfile'
- path_patterns = ('*/Gemfile', '*/*.gemfile', '*/Gemfile-*')
+ path_patterns = ('*/Gemfile', '*.gemfile', '*/Gemfile-*')
default_package_type = 'gem'
default_primary_language = 'Ruby'
description = 'RubyGems Bundler Gemfile'
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 94592e20ce1..d06af7dcf45 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -256,20 +256,28 @@ def get_licenses(
SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)
-def _get_package_data(location, application=True, system=False, package_only=False, **kwargs):
+def _get_package_data(
+ location,
+ application=True,
+ system=False,
+ binary=False,
+ package_only=False,
+ **kwargs
+):
"""
Return a mapping of package manifest information detected in the file at ``location``.
Include ``application`` packages (such as pypi) and/or ``system`` packages.
Note that all exceptions are caught if there are any errors while parsing a
package manifest.
"""
- assert application or system or package_only
+ assert application or system or binary or package_only
from packagedcode.recognize import recognize_package_data
try:
return recognize_package_data(
location=location,
application=application,
system=system,
+ binary=binary,
package_only=package_only,
) or []
@@ -300,7 +308,14 @@ def get_package_info(location, **kwargs):
return dict(packages=[p.to_dict() for p in packages])
-def get_package_data(location, application=True, system=False, package_only=False, **kwargs):
+def get_package_data(
+ location,
+ application=True,
+ system=False,
+ binary=False,
+ package_only=False,
+ **kwargs
+):
"""
Return a mapping of package manifest information detected in the file at
`location`.
@@ -313,6 +328,7 @@ def get_package_data(location, application=True, system=False, package_only=Fals
location=location,
application=application,
system=system,
+ binary=binary,
package_only=package_only,
**kwargs,
) or []
diff --git a/src/scancode_config.py b/src/scancode_config.py
index 9b6e2b7d075..520a0af9396 100644
--- a/src/scancode_config.py
+++ b/src/scancode_config.py
@@ -185,7 +185,13 @@ def _create_dir(location):
__env_license_cache_dir = os.getenv('SCANCODE_LICENSE_INDEX_CACHE')
licensedcode_cache_dir = (__env_license_cache_dir or std_license_cache_dir)
+
+std_package_cache_dir = join(scancode_src_dir, 'packagedcode', 'data', 'cache')
+__env_package_cache_dir = os.getenv('SCANCODE_PACKAGE_INDEX_CACHE')
+packagedcode_cache_dir = (__env_package_cache_dir or std_package_cache_dir)
+
_create_dir(licensedcode_cache_dir)
+_create_dir(packagedcode_cache_dir)
_create_dir(scancode_cache_dir)
# - scancode_temp_dir: for short-lived temporary files which are import- or run-
diff --git a/tests/packagedcode/data/cache/.gitignore b/tests/packagedcode/data/cache/.gitignore
new file mode 100644
index 00000000000..a738fbc8f7f
--- /dev/null
+++ b/tests/packagedcode/data/cache/.gitignore
@@ -0,0 +1 @@
+/package_patterns_index/
\ No newline at end of file
diff --git a/tests/packagedcode/data/plugin/plugins_list_linux.txt b/tests/packagedcode/data/plugin/plugins_list_linux.txt
index e24512dfd91..eb4763d6c7e 100755
--- a/tests/packagedcode/data/plugin/plugins_list_linux.txt
+++ b/tests/packagedcode/data/plugin/plugins_list_linux.txt
@@ -410,7 +410,7 @@ Package type: gem
documentation URL: https://bundler.io/man/gemfile.5.html
primary language: Ruby
description: RubyGems Bundler Gemfile
- path_patterns: '*/Gemfile', '*/*.gemfile', '*/Gemfile-*'
+ path_patterns: '*/Gemfile', '*.gemfile', '*/Gemfile-*'
--------------------------------------------
Package type: gem
datasource_id: gemfile_extracted
diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py
new file mode 100644
index 00000000000..98951d9fc8f
--- /dev/null
+++ b/tests/packagedcode/test_cache.py
@@ -0,0 +1,61 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import os.path
+
+from packagedcode import cache
+from commoncode.fileutils import as_posixpath
+
+from packages_test_utils import PackageTester
+from scancode_config import REGEN_TEST_FIXTURES
+from scancode.cli_test_utils import run_scan_click
+from scancode.cli_test_utils import check_json_scan
+
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+
+
+class TestMultiregexPatterns(PackageTester):
+ test_data_dir = TEST_DATA_DIR
+
+ def test_build_mappings_and_multiregex_patterns_works(self):
+ from packagedcode.about import AboutFileHandler
+
+ multiregex_patterns, handler_by_regex = cache.build_mappings_and_multiregex_patterns(
+ datafile_handlers=[AboutFileHandler],
+ )
+ assert multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])]
+ assert handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']}
+
+ def test_build_package_cache_works(self):
+ from packagedcode.about import AboutFileHandler
+ from packagedcode.bower import BowerJsonHandler
+
+ package_cache_dir = self.get_test_loc('cache/')
+ package_cache = cache.PkgManifestPatternsCache.load_or_build(
+ packagedcode_cache_dir=package_cache_dir,
+ application_package_datafile_handlers=[AboutFileHandler],
+ system_package_datafile_handlers=[BowerJsonHandler],
+ force=True,
+ )
+ test_path = "scancode-toolkit.ABOUT"
+
+ assert not package_cache.system_package_matcher.match(test_path)
+ assert package_cache.application_package_matcher.match(test_path)
+
+ regex, _match = package_cache.all_package_matcher.match(test_path).pop()
+ assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id
+
+    def test_non_package_file_does_not_match_any_pattern(self):
+
+        test_file = self.get_test_loc('cache/.gitignore')
+        package_path = as_posixpath(test_file)
+        package_cache = cache.get_cache()
+
+        assert not package_cache.all_package_matcher.match(package_path)
diff --git a/tests/packagedcode/test_cargo.py b/tests/packagedcode/test_cargo.py
index b71634aa8a1..5b22b69e193 100644
--- a/tests/packagedcode/test_cargo.py
+++ b/tests/packagedcode/test_cargo.py
@@ -159,7 +159,7 @@ def test_scan_works_on_rust_binary_with_inspector(self):
test_file = self.get_test_loc('cargo/binary/cargo_dependencies')
expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json')
result_file = self.get_temp_file('results.json')
- run_scan_click(['--package', test_file, '--json', result_file])
+ run_scan_click(['--binary-package', test_file, '--json', result_file])
check_json_scan(
expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES
)
diff --git a/tests/packagedcode/test_recognize.py b/tests/packagedcode/test_recognize.py
index f7736aeeb61..98a50164321 100644
--- a/tests/packagedcode/test_recognize.py
+++ b/tests/packagedcode/test_recognize.py
@@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self):
packages = recognize_package_data(test_file, system=True)
assert packages
assert isinstance(packages[0], models.PackageData)
+
+ def test_recognize_non_package_manifest_file(self):
+ test_file = self.get_test_loc('cache/.gitignore')
+ packages = recognize_package_data(test_file)
+ assert not packages
diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt
index 8a486871b5d..2c45a354b31 100644
--- a/tests/scancode/data/help/help.txt
+++ b/tests/scancode/data/help/help.txt
@@ -8,13 +8,16 @@ Usage: scancode [OPTIONS]