diff --git a/Dockerfile b/Dockerfile index d4c641d7a2f..17e28ad0930 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,9 +38,11 @@ WORKDIR /scancode-toolkit COPY . /scancode-toolkit # Initial configuration using ./configure, scancode-reindex-licenses to build -# the base license index +# the base license index and scancode-cache-package-patterns to build the +# package patterns cache RUN ./configure \ - && ./venv/bin/scancode-reindex-licenses + && ./venv/bin/scancode-reindex-licenses \ + && ./venv/bin/scancode-cache-package-patterns # Add scancode to path ENV PATH=/scancode-toolkit:$PATH diff --git a/docs/source/rst_snippets/basic_options.rst b/docs/source/rst_snippets/basic_options.rst index d01fbf72a6c..83caf28f406 100644 --- a/docs/source/rst_snippets/basic_options.rst +++ b/docs/source/rst_snippets/basic_options.rst @@ -33,6 +33,11 @@ documenting a program's options. For example: --system-package Scan ``<input>`` for installed system package databases. +-b, --binary-package Scan for package and dependency related + data in binaries. Note that looking for packages + in binaries makes package scan slower. + Currently supported binaries: Go, Rust. + --package-only Scan ``<input>`` for system and application only for package metadata, without license/ copyright detection and package assembly. 
diff --git a/etc/release/scancode-create-pypi-wheel.sh b/etc/release/scancode-create-pypi-wheel.sh index 5ab2fe8e988..4915695bae8 100755 --- a/etc/release/scancode-create-pypi-wheel.sh +++ b/etc/release/scancode-create-pypi-wheel.sh @@ -19,6 +19,7 @@ set -e ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns python_tag=$( python -c "import platform;print(f\"cp{''.join(platform.python_version_tuple()[:2])}\")" ) diff --git a/etc/release/scancode-create-release-app-linux.sh b/etc/release/scancode-create-release-app-linux.sh index fbe5951a937..ab6a4314d6d 100755 --- a/etc/release/scancode-create-release-app-linux.sh +++ b/etc/release/scancode-create-release-app-linux.sh @@ -65,6 +65,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-macos.sh b/etc/release/scancode-create-release-app-macos.sh index 5f34bf88f28..41c804137bb 100755 --- a/etc/release/scancode-create-release-app-macos.sh +++ b/etc/release/scancode-create-release-app-macos.sh @@ -63,6 +63,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version cp -r \ diff --git a/etc/release/scancode-create-release-app-windows.sh b/etc/release/scancode-create-release-app-windows.sh index 03a22d7117a..e4dba1b9b2f 100755 --- a/etc/release/scancode-create-release-app-windows.sh +++ b/etc/release/scancode-create-release-app-windows.sh @@ -62,6 +62,7 @@ cp -r etc/thirdparty $release_dir/etc # Build the wheel ./configure --dev venv/bin/scancode-reindex-licenses +venv/bin/scancode-cache-package-patterns venv/bin/python setup.py --quiet bdist_wheel --python-tag cp$python_version 
cp -r \ diff --git a/requirements.txt b/requirements.txt index 8d7b458c84d..f9e6b6a0a28 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,6 +40,7 @@ license-expression==30.4.4 lxml==5.4.0 MarkupSafe==3.0.2 more-itertools==10.7.0 +multiregex==2.0.3 normality==2.6.1 packageurl-python==0.17.1 packaging==25.0 diff --git a/setup-mini.cfg b/setup-mini.cfg index 8f3a043d8af..bfb24e2dd33 100644 --- a/setup-mini.cfg +++ b/setup-mini.cfg @@ -89,6 +89,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 @@ -156,6 +157,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/setup.cfg b/setup.cfg index 770b70542b3..f156833e463 100644 --- a/setup.cfg +++ b/setup.cfg @@ -74,6 +74,7 @@ install_requires = colorama >= 0.3.9 commoncode >= 32.4.0 container-inspector >= 31.0.0 + cyseq >= 0.0.2 debian-inspector >= 31.1.0 dparse2 >= 0.7.0 fasteners @@ -90,6 +91,7 @@ install_requires = license_expression >= 30.4.4 lxml >= 5.4.0 MarkupSafe >= 2.1.2 + multiregex >= 2.0.3 normality <= 2.6.1 packageurl_python >= 0.9.0 packvers >= 21.0.0 @@ -116,7 +118,6 @@ install_requires = typecode >= 30.0.1 typecode[full] >= 30.0.1 extractcode[full] >= 31.0.0 - cyseq >= 0.0.2 [options.packages.find] @@ -158,6 +159,7 @@ packages = console_scripts = scancode = scancode.cli:scancode scancode-reindex-licenses = licensedcode.reindex:reindex_licenses + scancode-cache-package-patterns = packagedcode.cache:cache_package_patterns scancode-license-data = licensedcode.license_db:dump_scancode_license_data 
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py index 5c42f96760a..717253c4baa 100644 --- a/src/licensedcode/plugin_license.py +++ b/src/licensedcode/plugin_license.py @@ -152,6 +152,9 @@ def setup(self, **kwargs): This is a cache warmup such that child process inherit from the loaded index. """ + if kwargs.get("package_only"): + return + from licensedcode.cache import populate_cache populate_cache() diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index 9cc46d0e09b..8626fcf7ff6 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -246,15 +246,24 @@ win_reg.InstalledProgramFromDockerUtilityvmSoftwareHandler, ] + +# These handlers are special: they use the file type to +# detect binaries instead of datafile path patterns. +# Since they are optionally installed, we can skip checking +# the file type when they are not available. +BINARY_PACKAGE_DATAFILE_HANDLERS = [] + try: from go_inspector.binary import get_go_binary_handler - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_go_binary_handler()) + handler = get_go_binary_handler() + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) except ImportError: pass try: from rust_inspector.packages import get_rust_binary_handler - APPLICATION_PACKAGE_DATAFILE_HANDLERS.append(get_rust_binary_handler()) + handler = get_rust_binary_handler() + BINARY_PACKAGE_DATAFILE_HANDLERS.append(handler) except ImportError: pass @@ -262,7 +271,7 @@ APPLICATION_PACKAGE_DATAFILE_HANDLERS + [ p for p in SYSTEM_PACKAGE_DATAFILE_HANDLERS if p not in APPLICATION_PACKAGE_DATAFILE_HANDLERS - ] + ] + BINARY_PACKAGE_DATAFILE_HANDLERS ) # registry of all handler classes keyed by datasource_id diff --git a/src/packagedcode/cache.py b/src/packagedcode/cache.py new file mode 100644 index 00000000000..92320379887 --- /dev/null 
+++ b/src/packagedcode/cache.py @@ -0,0 +1,236 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os +import fnmatch +import pickle +import multiregex + +import attr +import click + +from commoncode.cliutils import PluggableCommandLineOption +from commoncode.fileutils import create_dir +from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS +from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS + +from scancode_config import packagedcode_cache_dir +from scancode_config import scancode_cache_dir + +""" +An on-disk persistent cache of package manifest patterns and related package +manifest handlers mapping. Loading and dumping the cached package manifest +patterns is safe to use across multiple processes using lock files. +""" + +# global in-memory cache of the PkgManifestPatternsCache +_PACKAGE_CACHE = None + +# This is the Pickle protocol we use, which was added in Python 3.4. +PICKLE_PROTOCOL = 4 + +PACKAGE_INDEX_LOCK_TIMEOUT = 60 * 6 +PACKAGE_INDEX_DIR = 'package_patterns_index' +PACKAGE_INDEX_FILENAME = 'index_cache' +PACKAGE_LOCKFILE_NAME = 'scancode_package_index_lockfile' +PACKAGE_CHECKSUM_FILE = 'scancode_package_index_tree_checksums' + + +@attr.s +class PkgManifestPatternsCache: + """ + Represent cachable package manifest regex patterns, prematchers + and mappings from regex patterns to datasource IDs for all datafile + handlers. 
+ """ + + handler_by_regex = attr.ib(default=attr.Factory(dict)) + system_package_matcher = attr.ib(default=None) + application_package_matcher = attr.ib(default=None) + all_package_matcher = attr.ib(default=None) + + @staticmethod + def all_multiregex_patterns(application_multiregex_patterns, system_multiregex_patterns): + return application_multiregex_patterns + [ + multiregex_pattern + for multiregex_pattern in system_multiregex_patterns + if multiregex_pattern not in application_multiregex_patterns + ] + + @classmethod + def load_or_build( + cls, + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=False, + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + system_package_datafile_handlers=SYSTEM_PACKAGE_DATAFILE_HANDLERS, + application_package_datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + ): + """ + Load or build and save and return a PkgManifestPatternsCache object. + + We either load a cached PkgManifestPatternsCache or build and cache the patterns. + + - If the cache exists, it is returned unless corrupted. + - If ``force`` is True, or if the cache does not exist a new index is built + and cached. 
+ """ + idx_cache_dir = os.path.join(packagedcode_cache_dir, PACKAGE_INDEX_DIR) + create_dir(idx_cache_dir) + cache_file = os.path.join(idx_cache_dir, PACKAGE_INDEX_FILENAME) + has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file) + + # bypass build if cache exists + if has_cache and not force: + try: + return load_cache_file(cache_file) + except Exception as e: + # work around some rare Windows quirks + import traceback + print('Inconsistent Package cache: rebuilding index.') + print(str(e)) + print(traceback.format_exc()) + + from scancode import lockfile + lock_file = os.path.join(scancode_cache_dir, PACKAGE_LOCKFILE_NAME) + + # here, we have no cache: lock, check and rebuild + try: + # acquire lock and wait until timeout to get a lock or die + with lockfile.FileLock(lock_file).locked(timeout=timeout): + + system_multiregex_patterns, system_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=system_package_datafile_handlers, + ) + application_multiregex_patterns, application_handlers_by_regex = build_mappings_and_multiregex_patterns( + datafile_handlers=application_package_datafile_handlers, + ) + all_multiregex_matcher = PkgManifestPatternsCache.all_multiregex_patterns( + application_multiregex_patterns, system_multiregex_patterns, + ) + system_package_matcher = multiregex.RegexMatcher(system_multiregex_patterns) + application_package_matcher = multiregex.RegexMatcher(application_multiregex_patterns) + all_package_matcher = multiregex.RegexMatcher(all_multiregex_matcher) + package_cache = cls( + handler_by_regex=system_handlers_by_regex | application_handlers_by_regex, + system_package_matcher=system_package_matcher, + application_package_matcher=application_package_matcher, + all_package_matcher=all_package_matcher, + ) + package_cache.dump(cache_file) + return package_cache + + except lockfile.LockTimeout: + # TODO: handle unable to lock in a nicer way + raise + + def dump(self, cache_file): + """ + Dump this 
package patterns cache on disk at ``cache_file``. + """ + with open(cache_file, 'wb') as fn: + pickle.dump(self, fn, protocol=PICKLE_PROTOCOL) + + +def get_prematchers_from_glob_pattern(pattern): + return [ + prematcher.lower().lstrip("/") + for prematcher in pattern.split("*") + if prematcher + ] + + +def build_mappings_and_multiregex_patterns(datafile_handlers): + """ + Return multiregex patterns (tuples of regex pattern and prematchers) and a + mapping of regex patterns to lists of datafile handler datasource IDs. + """ + handler_by_regex = {} + multiregex_patterns = [] + + if not datafile_handlers: + return multiregex_patterns, handler_by_regex + + with_patterns = [] + + for handler in datafile_handlers: + if handler.path_patterns: + with_patterns.append(handler) + + prematchers_by_regex = {} + + for handler in with_patterns: + for pattern in handler.path_patterns: + regex_pattern = fnmatch.translate(pattern) + regex_pattern = fr"{regex_pattern}" + + prematchers_by_regex[regex_pattern] = get_prematchers_from_glob_pattern(pattern) + + if regex_pattern in handler_by_regex: + handler_by_regex[regex_pattern].append(handler.datasource_id) + else: + handler_by_regex[regex_pattern]= [handler.datasource_id] + + for regex in handler_by_regex.keys(): + regex_and_prematcher = (regex, prematchers_by_regex.get(regex, [])) + multiregex_patterns.append(regex_and_prematcher) + + return multiregex_patterns, handler_by_regex + + +def get_cache( + force=False, + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, +): + """ + Return a PkgManifestPatternsCache either rebuilt, cached or loaded from disk. 
+ """ + global _PACKAGE_CACHE + + if force or not _PACKAGE_CACHE: + _PACKAGE_CACHE = PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=packagedcode_cache_dir, + scancode_cache_dir=scancode_cache_dir, + force=force, + # used for testing only + timeout=PACKAGE_INDEX_LOCK_TIMEOUT, + ) + return _PACKAGE_CACHE + + +def load_cache_file(cache_file): + """ + Return a PkgManifestPatternsCache loaded from ``cache_file``. + """ + with open(cache_file, 'rb') as lfc: + try: + return pickle.load(lfc) + except Exception as e: + msg = ( + 'ERROR: Failed to load package cache (the file may be corrupted ?).\n' + f'Please delete "{cache_file}" and retry.\n' + 'If the problem persists, copy this error message ' + 'and submit a bug report at https://github.com/nexB/scancode-toolkit/issues/' + ) + raise Exception(msg) from e + + +@click.command(name='scancode-cache-package-patterns') +@click.help_option('-h', '--help') +def cache_package_patterns(*args, **kwargs): + """Create scancode package manifest patterns cache and exit""" + click.echo('Rebuilding the package cache patterns...') + get_cache(force=True) + click.echo('Done.') + + +if __name__ == '__main__': + cache_package_patterns() diff --git a/src/packagedcode/data/.gitignore b/src/packagedcode/data/.gitignore new file mode 100644 index 00000000000..0a2101fab9b --- /dev/null +++ b/src/packagedcode/data/.gitignore @@ -0,0 +1 @@ +/cache/ diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index 8dc993e3b7a..dc55e0053fb 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -170,6 +170,20 @@ class PackageScanner(ScanPlugin): help_group=SCAN_GROUP, sort_order=21, ), + PluggableCommandLineOption( + ( + '-b', + '--binary-package', + ), + is_flag=True, + default=False, + help=( + 'Scan for package and dependency related data in binaries. ' + 'Currently supported binaries: Go, Rust.' 
+ ), + help_group=SCAN_GROUP, + sort_order=22, + ), PluggableCommandLineOption( ( '--package-only', @@ -182,7 +196,7 @@ class PackageScanner(ScanPlugin): 'license/copyright detection and top-level package creation.' ), help_group=SCAN_GROUP, - sort_order=22, + sort_order=23, ), PluggableCommandLineOption( ('--list-packages',), @@ -195,10 +209,17 @@ class PackageScanner(ScanPlugin): ), ] - def is_enabled(self, package, system_package, package_only, **kwargs): - return package or system_package or package_only + def is_enabled(self, package, system_package, binary_package, package_only, **kwargs): + return package or system_package or binary_package or package_only - def get_scanner(self, package=True, system_package=False, package_only=False, **kwargs): + def get_scanner( + self, + package=True, + system_package=False, + binary_package=False, + package_only=False, + **kwargs + ): """ Return a scanner callable to scan a file for package data. """ @@ -208,6 +229,7 @@ def get_scanner(self, package=True, system_package=False, package_only=False, ** get_package_data, application=package, system=system_package, + binary=binary_package, package_only=package_only, ) @@ -464,7 +486,7 @@ def get_package_and_deps(codebase, package_adder=add_to_package, strip_root=Fals resource.scan_errors.append(msg) resource.save(codebase) - if TRACE: + if TRACE_ASSEMBLY: raise Exception(msg) from e return packages, dependencies diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index e41d29c82df..f60107c6904 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -11,10 +11,12 @@ import sys from commoncode import filetype -from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS -from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS -from packagedcode import ALL_DATAFILE_HANDLERS +from commoncode.fileutils import as_posixpath + +from packagedcode import HANDLER_BY_DATASOURCE_ID +from packagedcode import 
BINARY_PACKAGE_DATAFILE_HANDLERS from packagedcode import models +from packagedcode.cache import get_cache TRACE = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) @@ -44,6 +46,7 @@ def recognize_package_data( location, application=True, system=False, + binary=False, package_only=False, ): """ @@ -56,25 +59,21 @@ def recognize_package_data( if not filetype.is_file(location): return [] - assert application or system or package_only - if package_only or (application and system): - datafile_handlers = ALL_DATAFILE_HANDLERS - elif application: - datafile_handlers = APPLICATION_PACKAGE_DATAFILE_HANDLERS - elif system: - datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS - return list(_parse( location=location, + application=application, + system=system, + binary=binary, package_only=package_only, - datafile_handlers=datafile_handlers, )) def _parse( location, + application=True, + system=False, + binary=False, package_only=False, - datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, ): """ Yield parsed PackageData objects from ``location``. Raises Exceptions on errors. 
@@ -83,6 +82,46 @@ def _parse( Default to use application packages """ + package_path = as_posixpath(location) + package_patterns = get_cache() + + has_patterns = application or system or package_only + assert has_patterns or binary + if package_only or (application and system): + package_matcher = package_patterns.all_package_matcher + elif application: + package_matcher = package_patterns.application_package_matcher + elif system: + package_matcher = package_patterns.system_package_matcher + + matched_patterns = [] + if has_patterns: + matched_patterns = package_matcher.match(package_path) + + all_handler_ids = [] + for matched_pattern in matched_patterns: + regex, _match = matched_pattern + handler_ids = package_patterns.handler_by_regex.get(regex.pattern) + if TRACE: + logger_debug(f'_parse:.handler_ids: {handler_ids}') + + all_handler_ids.extend([ + handler_id + for handler_id in handler_ids + if handler_id not in all_handler_ids + ]) + + datafile_handlers = [ + HANDLER_BY_DATASOURCE_ID.get(handler_id) + for handler_id in all_handler_ids + ] + + if not datafile_handlers: + if binary: + datafile_handlers.extend(BINARY_PACKAGE_DATAFILE_HANDLERS) + elif TRACE: + logger_debug(f'_parse: no package datafile detected at {package_path}') + for handler in datafile_handlers: if TRACE: logger_debug(f'_parse:.is_datafile: {handler}') diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py index 9cbbf6d7553..e80295c48af 100644 --- a/src/packagedcode/rubygems.py +++ b/src/packagedcode/rubygems.py @@ -211,7 +211,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): # TODO: https://stackoverflow.com/questions/41454333/meaning-of-new-block-git-sourcegithub-in-gemfile class GemfileHandler(GemspecHandler): datasource_id = 'gemfile' - path_patterns = ('*/Gemfile', '*/*.gemfile', '*/Gemfile-*') + path_patterns = ('*/Gemfile', '*.gemfile', '*/Gemfile-*') default_package_type = 'gem' default_primary_language = 'Ruby' description = 'RubyGems 
Bundler Gemfile' diff --git a/src/scancode/api.py b/src/scancode/api.py index 94592e20ce1..d06af7dcf45 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -256,20 +256,28 @@ def get_licenses( SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False) -def _get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def _get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): """ Return a mapping of package manifest information detected in the file at ``location``. Include ``application`` packages (such as pypi) and/or ``system`` packages. Note that all exceptions are caught if there are any errors while parsing a package manifest. """ - assert application or system or package_only + assert application or system or binary or package_only from packagedcode.recognize import recognize_package_data try: return recognize_package_data( location=location, application=application, system=system, + binary=binary, package_only=package_only, ) or [] @@ -300,7 +308,14 @@ def get_package_info(location, **kwargs): return dict(packages=[p.to_dict() for p in packages]) -def get_package_data(location, application=True, system=False, package_only=False, **kwargs): +def get_package_data( + location, + application=True, + system=False, + binary=False, + package_only=False, + **kwargs +): """ Return a mapping of package manifest information detected in the file at `location`. 
@@ -313,6 +328,7 @@ def get_package_data(location, application=True, system=False, package_only=Fals location=location, application=application, system=system, + binary=binary, package_only=package_only, **kwargs, ) or [] diff --git a/src/scancode_config.py b/src/scancode_config.py index 9b6e2b7d075..520a0af9396 100644 --- a/src/scancode_config.py +++ b/src/scancode_config.py @@ -185,7 +185,13 @@ def _create_dir(location): __env_license_cache_dir = os.getenv('SCANCODE_LICENSE_INDEX_CACHE') licensedcode_cache_dir = (__env_license_cache_dir or std_license_cache_dir) + +std_package_cache_dir = join(scancode_src_dir, 'packagedcode', 'data', 'cache') +__env_package_cache_dir = os.getenv('SCANCODE_PACKAGE_INDEX_CACHE') +packagedcode_cache_dir = (__env_package_cache_dir or std_package_cache_dir) + _create_dir(licensedcode_cache_dir) +_create_dir(packagedcode_cache_dir) _create_dir(scancode_cache_dir) # - scancode_temp_dir: for short-lived temporary files which are import- or run- diff --git a/tests/packagedcode/data/cache/.gitignore b/tests/packagedcode/data/cache/.gitignore new file mode 100644 index 00000000000..a738fbc8f7f --- /dev/null +++ b/tests/packagedcode/data/cache/.gitignore @@ -0,0 +1 @@ +/package_patterns_index/ \ No newline at end of file diff --git a/tests/packagedcode/data/plugin/plugins_list_linux.txt b/tests/packagedcode/data/plugin/plugins_list_linux.txt index e24512dfd91..eb4763d6c7e 100755 --- a/tests/packagedcode/data/plugin/plugins_list_linux.txt +++ b/tests/packagedcode/data/plugin/plugins_list_linux.txt @@ -410,7 +410,7 @@ Package type: gem documentation URL: https://bundler.io/man/gemfile.5.html primary language: Ruby description: RubyGems Bundler Gemfile - path_patterns: '*/Gemfile', '*/*.gemfile', '*/Gemfile-*' + path_patterns: '*/Gemfile', '*.gemfile', '*/Gemfile-*' -------------------------------------------- Package type: gem datasource_id: gemfile_extracted diff --git a/tests/packagedcode/test_cache.py b/tests/packagedcode/test_cache.py new 
file mode 100644 index 00000000000..98951d9fc8f --- /dev/null +++ b/tests/packagedcode/test_cache.py @@ -0,0 +1,61 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os.path + +from packagedcode import cache +from commoncode.fileutils import as_posixpath + +from packages_test_utils import PackageTester +from scancode_config import REGEN_TEST_FIXTURES +from scancode.cli_test_utils import run_scan_click +from scancode.cli_test_utils import check_json_scan + + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') + + +class TestMultiregexPatterns(PackageTester): + test_data_dir = TEST_DATA_DIR + + def test_build_mappings_and_multiregex_patterns_works(self): + from packagedcode.about import AboutFileHandler + + multiregex_patterns, handler_by_regex = cache.build_mappings_and_multiregex_patterns( + datafile_handlers=[AboutFileHandler], + ) + assert multiregex_patterns == [('(?s:.*\\.ABOUT)\\Z', ['.about'])] + assert handler_by_regex == {'(?s:.*\\.ABOUT)\\Z': ['about_file']} + + def test_build_package_cache_works(self): + from packagedcode.about import AboutFileHandler + from packagedcode.bower import BowerJsonHandler + + package_cache_dir = self.get_test_loc('cache/') + package_cache = cache.PkgManifestPatternsCache.load_or_build( + packagedcode_cache_dir=package_cache_dir, + application_package_datafile_handlers=[AboutFileHandler], + system_package_datafile_handlers=[BowerJsonHandler], + force=True, + ) + test_path = "scancode-toolkit.ABOUT" + + assert not package_cache.system_package_matcher.match(test_path) + assert package_cache.application_package_matcher.match(test_path) + + regex, _match = 
package_cache.all_package_matcher.match(test_path).pop() + assert package_cache.handler_by_regex.get(regex.pattern).pop() == AboutFileHandler.datasource_id + + def check_empty_file_scan_works(self): + + test_file = self.get_test_loc('cache/.gitignore') + package_path = as_posixpath(test_file) + package_matcher = cache.get_cache() + + assert not package_matcher.match(package_path) diff --git a/tests/packagedcode/test_cargo.py b/tests/packagedcode/test_cargo.py index b71634aa8a1..5b22b69e193 100644 --- a/tests/packagedcode/test_cargo.py +++ b/tests/packagedcode/test_cargo.py @@ -159,7 +159,7 @@ def test_scan_works_on_rust_binary_with_inspector(self): test_file = self.get_test_loc('cargo/binary/cargo_dependencies') expected_file = self.get_test_loc('cargo/binary/cargo-binary.expected.json') result_file = self.get_temp_file('results.json') - run_scan_click(['--package', test_file, '--json', result_file]) + run_scan_click(['--binary-package', test_file, '--json', result_file]) check_json_scan( expected_file, result_file, remove_uuid=True, regen=REGEN_TEST_FIXTURES ) diff --git a/tests/packagedcode/test_recognize.py b/tests/packagedcode/test_recognize.py index f7736aeeb61..98a50164321 100644 --- a/tests/packagedcode/test_recognize.py +++ b/tests/packagedcode/test_recognize.py @@ -202,3 +202,8 @@ def test_recognize_rpmdb_sqlite(self): packages = recognize_package_data(test_file, system=True) assert packages assert isinstance(packages[0], models.PackageData) + + def test_recognize_non_package_manifest_file(self): + test_file = self.get_test_loc('cache/.gitignore') + packages = recognize_package_data(test_file) + assert not packages diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index 8a486871b5d..2c45a354b31 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -8,13 +8,16 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. 
- -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. + --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. other scans: -i, --info Scan for file information (size, checksums, etc). diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 9ca1d26d68a..5d7b1dfed92 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -8,15 +8,18 @@ Usage: scancode [OPTIONS] ... Options: primary scans: - -l, --license Scan for licenses. - -p, --package Scan for application package and dependency - manifests, lockfiles and related data. - --system-package Scan for installed system package databases. - --package-only Scan for system and application package data and skip - license/copyright detection and top-level package creation. - -c, --copyright Scan for copyrights. - --go-symbol Collect Go symbols. - --rust-symbol Collect Rust symbols from rust binaries. + -l, --license Scan for licenses. + -p, --package Scan for application package and dependency + manifests, lockfiles and related data. + --system-package Scan for installed system package databases. + -b, --binary-package Scan for package and dependency related data in + binaries. Currently supported binaries: Go, Rust. 
+ --package-only Scan for system and application package data and skip + license/copyright detection and top-level package + creation. + -c, --copyright Scan for copyrights. + --go-symbol Collect Go symbols. + --rust-symbol Collect Rust symbols from rust binaries. other scans: -i, --info Scan for file information (size, checksums, etc).