From 0bc940d625e9fc68ce7b1a0d03c0dc60969559de Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Sat, 22 May 2021 13:41:21 +0200 Subject: [PATCH 1/7] Add Github issues and PRs scanner --- kibble/cli/commands/scanners_command.py | 29 ++++++++-- kibble/configuration/__init__.py | 16 +++++ kibble/configuration/yaml_config.py | 35 +++++++++++ kibble/exceptions.py | 28 +++++++++ kibble/kibble.yaml | 8 +++ kibble/scanners/__init__.py | 44 ++++++++++++++ kibble/scanners/base.py | 31 ++++++++++ kibble/scanners/github.py | 77 +++++++++++++++++++++++++ kibble/secrets/__init__.py | 16 +++++ kibble/secrets/env_variable.py | 28 +++++++++ pylintrc | 4 +- 11 files changed, 308 insertions(+), 8 deletions(-) create mode 100644 kibble/configuration/__init__.py create mode 100644 kibble/configuration/yaml_config.py create mode 100644 kibble/exceptions.py create mode 100644 kibble/kibble.yaml create mode 100644 kibble/scanners/base.py create mode 100644 kibble/scanners/github.py create mode 100644 kibble/secrets/__init__.py create mode 100644 kibble/secrets/env_variable.py diff --git a/kibble/cli/commands/scanners_command.py b/kibble/cli/commands/scanners_command.py index 2499512..9bb50ed 100644 --- a/kibble/cli/commands/scanners_command.py +++ b/kibble/cli/commands/scanners_command.py @@ -17,8 +17,13 @@ __all__ = ["scanners_group"] +from typing import Optional + import click +from kibble.configuration.yaml_config import kconfig +from kibble.scanners import get_scanner, get_scanners_classes + @click.group(name="scanners") def scanners_group(): @@ -32,15 +37,27 @@ def add(): @scanners_group.command(name="list") -def list_scanners(): +@click.option("-ds", "--data-source") +def list_scanners(data_source: Optional[str] = None): """List all available scanners""" - scanners_list = ["AbcScanner", "XyzeScanner"] - for scanner in scanners_list: - click.echo(f"- {scanner}") + all_scanners = get_scanners_classes(data_source) + for scanner in sorted(all_scanners, key=lambda cls: cls.__name__): 
+ click.echo(f"{scanner.__name__}") @scanners_group.command() -@click.argument("scanner_name") +@click.option("-s", "--scanner-name", required=True) def run(scanner_name: str): """Trigger a scanning process for given scanner""" - click.echo(f"Running {scanner_name}") + for data_source in kconfig.get("data_sources", []): + if scanner_name not in data_source["enabled"]: + continue + organizations = data_source.get("organizations", []) + if not organizations: + click.echo(f"No organizations to scan in {data_source} data source.") + continue + + scanner = get_scanner(scanner_name=scanner_name) + for org in organizations: + click.echo(f"Running {scanner.__name__} for {org}") + scanner(**org).scan() diff --git a/kibble/configuration/__init__.py b/kibble/configuration/__init__.py new file mode 100644 index 0000000..13a8339 --- /dev/null +++ b/kibble/configuration/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/kibble/configuration/yaml_config.py b/kibble/configuration/yaml_config.py new file mode 100644 index 0000000..e48fe80 --- /dev/null +++ b/kibble/configuration/yaml_config.py @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +from typing import Dict + +import yaml + +KIBBLE_YAML = "kibble.yaml" + + +def parse_kibble_yaml() -> Dict: + """Reads kibble.yaml config file""" + kibble_base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir) + config_path = os.path.join(kibble_base_path, KIBBLE_YAML) + with open(config_path, "r") as stream: + config = yaml.safe_load(stream) + return config + + +kconfig = parse_kibble_yaml() diff --git a/kibble/exceptions.py b/kibble/exceptions.py new file mode 100644 index 0000000..09f5756 --- /dev/null +++ b/kibble/exceptions.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +class KibbleException(Exception): + """Generic Kibble expression""" + + +class SecretNotFound(Exception): + """Exception raised when secret value could not be found.""" + + def __init__(self, secret: str, secret_type: str): + self.message = f"Secret {secret} could not be found in {secret_type}" + super().__init__(self.message) diff --git a/kibble/kibble.yaml b/kibble/kibble.yaml new file mode 100644 index 0000000..971601b --- /dev/null +++ b/kibble/kibble.yaml @@ -0,0 +1,8 @@ +--- +data_sources: + - name: github + organizations: + - repo_owner: apache + repo_name: kibble + enabled: + - github_issues diff --git a/kibble/scanners/__init__.py b/kibble/scanners/__init__.py index 13a8339..5a14626 100644 --- a/kibble/scanners/__init__.py +++ b/kibble/scanners/__init__.py @@ -14,3 +14,47 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import os +from typing import Any, Dict, List, Optional + +from kibble.exceptions import KibbleException + + +def get_scanners( + data_source: Optional[str] = None, +) -> Dict[str, Any]: + """ + Automatically discover all scanners. Scanner class is discoverable + only if it has ``scan`` method. Returns a dictionary where keys are + scanner types. + + :param data_source: if provided then only scanners for this data source are returned. 
+ """ + scanner_classes: Dict[str, Any] = {} + path = os.path.dirname(os.path.abspath(__file__)) + for file in sorted(os.listdir(path)): + if not file.endswith(".py") or file in ("__init__.py", "base.py"): + continue + py_file = file[:-3] + mod = __import__(".".join([__name__, py_file]), fromlist=[py_file]) + classes = [cls for x in dir(mod) if isinstance(cls := getattr(mod, x), type) and hasattr(cls, "scan")] + if data_source: + classes = [cls for cls in classes if cls.data_source == data_source] # type: ignore + for cls in classes: + scanner_ds = cls.data_source + scanner_classes[scanner_ds] = scanner_classes.get(scanner_ds, []) + [cls] + + return scanner_classes + + +def get_scanners_classes(data_source: Optional[str] = None) -> List[Any]: + """Returns all scanner classes""" + return sum(get_scanners(data_source).values(), []) + + +def get_scanner(scanner_name: str): + """Returns scanner by name""" + scanners_with_name = [cls for cls in get_scanners_classes() if cls.scanner_name == scanner_name] + if not scanners_with_name: + raise KibbleException(f"Scanner with name '{scanner_name}' is undefined") + return scanners_with_name[0] diff --git a/kibble/scanners/base.py b/kibble/scanners/base.py new file mode 100644 index 0000000..d2d7de4 --- /dev/null +++ b/kibble/scanners/base.py @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import logging +from typing import Any + + +class BaseScanner: + """Abstract, base class for all scanners""" + + # pylint: disable=too-few-public-methods + def __init__(self, **kwargs): + self.log = logging.getLogger(__name__) + + def _persist(self, payload: Any): # pylint: disable=no-self-use + """Persists data to database. Should be implemented per scanner.""" + raise NotImplementedError() diff --git a/kibble/scanners/github.py b/kibble/scanners/github.py new file mode 100644 index 0000000..dff5fd7 --- /dev/null +++ b/kibble/scanners/github.py @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Any, Dict, List, Optional, Union +from urllib.parse import urlencode, urljoin + +import requests + +from kibble.scanners.base import BaseScanner +from kibble.secrets.env_variable import get_secret_from_env + + +class GithubBaseScanner(BaseScanner): + """Github base scanner class""" + + # pylint: disable=too-few-public-methods + data_source = "github" + + def __init__(self, *, repo_owner: str, repo_name: str, api_key: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + self.repo_owner = repo_owner + self.repo_name = repo_name + self.api_key = api_key or get_secret_from_env("GH_API_KEY") + self.repo_full_name = f"{self.repo_owner}/{self.repo_name}" + + self.base_url = "https://api.github.com" + self.headers = {"Accept": "application/vnd.github.v3+json"} + if api_key: + self.headers["Authorization"] = f"token {api_key}" + + def _send_request(self, endpoint: str, query: Optional[Dict] = None) -> Union[List, Dict]: + url = urljoin(self.base_url, endpoint) + url = f"{url}?{urlencode(query)}" if query else url + response = requests.get(url, headers=self.headers) + response.raise_for_status() + return response.json() + + def _persist(self, payload: Any): + pass + + +class GithubIssuesScanner(GithubBaseScanner): + """Github issues and pull requests scanner""" + + scanner_name = "github_issues" + + def scan(self): + endpoint = f"/repos/{self.repo_owner}/{self.repo_name}/issues" + query = {"per_page": 100, "page": 1} + + issues: List[Dict] = [] + prs: List[Dict] = [] + self.log.info("Collecting Github issues and PRs from %s", self.repo_full_name) + while new_issues := self._send_request(endpoint, query): + for issue_pr in new_issues: + if "pull_request" in issue_pr: + prs.append(issue_pr) + else: + issues.append(issue_pr) + query["page"] += 1 + + self.log.info("Collected %d issues and %d PRs from %s", len(issues), len(prs), self.repo_full_name) + return issues, prs diff --git a/kibble/secrets/__init__.py b/kibble/secrets/__init__.py new file 
mode 100644 index 0000000..13a8339 --- /dev/null +++ b/kibble/secrets/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/kibble/secrets/env_variable.py b/kibble/secrets/env_variable.py new file mode 100644 index 0000000..95856de --- /dev/null +++ b/kibble/secrets/env_variable.py @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os + +from kibble.exceptions import SecretNotFound + + +def get_secret_from_env(key: str): + """Retrieves value from KIBBLE_SECRET_{key}""" + env_key = f"KIBBLE_SECRET_{key.upper()}" + secret = os.environ.get(env_key) + if not secret: + raise SecretNotFound(secret=env_key, secret_type="environment variables") diff --git a/pylintrc b/pylintrc index 5e6d939..9f98da3 100644 --- a/pylintrc +++ b/pylintrc @@ -459,7 +459,7 @@ name-group= # Regular expression which should only match function or class names that do # not require a docstring. -no-docstring-rgx=^_ +no-docstring-rgx=^_|^scan$ # List of decorators that produce properties, such as abc.abstractproperty. Add # to this list to register other decorators that produce valid properties. @@ -578,7 +578,7 @@ max-returns=6 max-statements=50 # Minimum number of public methods for a class (see R0903). -min-public-methods=2 +min-public-methods=1 [EXCEPTIONS] From 5fdca3ba656055999e804b1e887976a838dacfdb Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Sat, 22 May 2021 14:06:20 +0200 Subject: [PATCH 2/7] fixup! Add Github issues and PRs scanner --- .pre-commit-config.yaml | 1 + kibble/configuration/yaml_config.py | 8 +++++--- kibble/scanners/__init__.py | 11 ++++++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd27c19..59188fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -135,6 +135,7 @@ repos: - id: pylint name: Pylint on all sources entry: pylint + rev: 'v2.7.4' language: system types: [python] diff --git a/kibble/configuration/yaml_config.py b/kibble/configuration/yaml_config.py index e48fe80..0887b5a 100644 --- a/kibble/configuration/yaml_config.py +++ b/kibble/configuration/yaml_config.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-import os +from pathlib import Path from typing import Dict import yaml @@ -25,11 +25,13 @@ def parse_kibble_yaml() -> Dict: """Reads kibble.yaml config file""" - kibble_base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir) - config_path = os.path.join(kibble_base_path, KIBBLE_YAML) + config_path = Path(__file__).parent.parent.joinpath(KIBBLE_YAML) with open(config_path, "r") as stream: config = yaml.safe_load(stream) return config kconfig = parse_kibble_yaml() + +if __name__ == "__main__": + parse_kibble_yaml() diff --git a/kibble/scanners/__init__.py b/kibble/scanners/__init__.py index 5a14626..9051824 100644 --- a/kibble/scanners/__init__.py +++ b/kibble/scanners/__init__.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import os + +from pathlib import Path from typing import Any, Dict, List, Optional from kibble.exceptions import KibbleException @@ -31,11 +32,11 @@ def get_scanners( :param data_source: if provided then only scanners for this data source are returned. """ scanner_classes: Dict[str, Any] = {} - path = os.path.dirname(os.path.abspath(__file__)) - for file in sorted(os.listdir(path)): - if not file.endswith(".py") or file in ("__init__.py", "base.py"): + path = Path(__file__).parent + for file in path.iterdir(): + if file.suffix != ".py" or file.name in ("__init__.py", "base.py"): continue - py_file = file[:-3] + py_file = file.stem mod = __import__(".".join([__name__, py_file]), fromlist=[py_file]) classes = [cls for x in dir(mod) if isinstance(cls := getattr(mod, x), type) and hasattr(cls, "scan")] if data_source: From 6b2228d8e91aab96cee9197af81807846501f8ba Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Sat, 22 May 2021 14:22:40 +0200 Subject: [PATCH 3/7] fixup! fixup! 
Add Github issues and PRs scanner --- tests/cli/commands/test_scanners_command.py | 44 ++++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/tests/cli/commands/test_scanners_command.py b/tests/cli/commands/test_scanners_command.py index 2b62f9e..c2681b9 100644 --- a/tests/cli/commands/test_scanners_command.py +++ b/tests/cli/commands/test_scanners_command.py @@ -14,10 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from unittest import mock from click.testing import CliRunner from kibble.cli.commands.scanners_command import scanners_group +from kibble.configuration.yaml_config import kconfig class TestScannerCommand: @@ -28,16 +30,46 @@ def test_add(self): assert result.exit_code == 0 assert result.output.strip() == "To be implemented!" - def test_list(self): + @mock.patch("kibble.cli.commands.scanners_command.get_scanners_classes") + def test_list(self, mock_get_scanners_classes): + class MockScanner: + pass + + mock_get_scanners_classes.return_value = [MockScanner] runner = CliRunner() result = runner.invoke(scanners_group, ["list"]) assert result.exit_code == 0 - assert result.output.strip() == "- AbcScanner\n- XyzeScanner" + assert result.output.strip() == "MockScanner" - def test_run(self): - runner = CliRunner() - result = runner.invoke(scanners_group, ["run", "TestScanner"]) + @mock.patch.dict( + kconfig, + { + "data_sources": [ + { + "name": "github", + "organizations": [{"repo_owner": "apache", "repo_name": "kibble"}], + "enabled": ["mock_scanner"], + } + ] + }, + ) + @mock.patch("kibble.cli.commands.scanners_command.get_scanner") + def test_run(self, mock_get_scanner): + class MockScanner: + scanner_name = "mock_scanner" + + def __init__(self, **kwargs): + pass + def scan(self): + pass + + mock_get_scanner.return_value = MockScanner + + runner = CliRunner() + result = runner.invoke(scanners_group, ["run", "-s", 
"mock_scanner"]) assert result.exit_code == 0 - assert result.output.strip() == "Running TestScanner" + assert ( + result.output.strip() == "Running MockScanner for {'repo_owner': 'apache', 'repo_name': 'kibble'}" + ) From 75346a981e665d1c96bb325e6e873f52c8a85488 Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Sat, 22 May 2021 14:26:52 +0200 Subject: [PATCH 4/7] fixup! fixup! fixup! Add Github issues and PRs scanner --- .github/workflows/ci.yaml | 6 ++++-- .pre-commit-config.yaml | 1 - setup.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b9f4d1e..2a76da4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -29,7 +29,9 @@ jobs: steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 - - run: pip install '.[devel]' + with: + python-version: '3.9.4' + - run: pip install -e '.[devel]' - run: pre-commit install - run: pre-commit run --all-files run-tests: @@ -39,6 +41,6 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: - python-version: '3.8' + python-version: '3.9.4' - run: pip install '.[devel]' - run: pytest tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59188fd..bd27c19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -135,7 +135,6 @@ repos: - id: pylint name: Pylint on all sources entry: pylint - rev: 'v2.7.4' language: system types: [python] diff --git a/setup.py b/setup.py index a165584..882dcf6 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ DEVEL_REQUIREMENTS = [ "black==20.8b1", "pre-commit==2.7.1", - "pylint==2.6.2", + "pylint>=2.7.4", "pytest==6.1.1", ] From 454233ed3b80cf42d6993cd81c1cccba7772fcf8 Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Fri, 28 May 2021 22:12:04 +0200 Subject: [PATCH 5/7] fixup! fixup! fixup! fixup! 
Add Github issues and PRs scanner --- kibble/cli/commands/scanners_command.py | 50 ++++++--------- kibble/data_sources/__init__.py | 16 +++++ kibble/data_sources/base/__init__.py | 16 +++++ kibble/data_sources/base/base_data_source.py | 38 ++++++++++++ .../base/base_data_type.py} | 19 ++++-- .../data_sources/base/data_source_config.py | 43 +++++++++++++ kibble/data_sources/base/module_loading.py | 38 ++++++++++++ kibble/data_sources/github/__init__.py | 39 ++++++++++++ .../github/data_types/__init__.py | 16 +++++ kibble/data_sources/github/data_types/base.py | 59 ++++++++++++++++++ .../github/data_types/pr_issues.py} | 51 ++++------------ kibble/kibble.yaml | 13 ++-- kibble/scanners/__init__.py | 61 ------------------- kibble/secrets/env_variable.py | 1 + setup.py | 2 +- 15 files changed, 319 insertions(+), 143 deletions(-) create mode 100644 kibble/data_sources/__init__.py create mode 100644 kibble/data_sources/base/__init__.py create mode 100644 kibble/data_sources/base/base_data_source.py rename kibble/{scanners/base.py => data_sources/base/base_data_type.py} (68%) create mode 100644 kibble/data_sources/base/data_source_config.py create mode 100644 kibble/data_sources/base/module_loading.py create mode 100644 kibble/data_sources/github/__init__.py create mode 100644 kibble/data_sources/github/data_types/__init__.py create mode 100644 kibble/data_sources/github/data_types/base.py rename kibble/{scanners/github.py => data_sources/github/data_types/pr_issues.py} (50%) delete mode 100644 kibble/scanners/__init__.py diff --git a/kibble/cli/commands/scanners_command.py b/kibble/cli/commands/scanners_command.py index 9bb50ed..0c23081 100644 --- a/kibble/cli/commands/scanners_command.py +++ b/kibble/cli/commands/scanners_command.py @@ -17,12 +17,10 @@ __all__ = ["scanners_group"] -from typing import Optional - import click from kibble.configuration.yaml_config import kconfig -from kibble.scanners import get_scanner, get_scanners_classes +from 
kibble.data_sources.base.data_source_config import DataSourceConfig @click.group(name="scanners") @@ -31,33 +29,19 @@ def scanners_group(): @scanners_group.command() -def add(): - """Add new scanner configuration""" - click.echo("To be implemented!") - - -@scanners_group.command(name="list") -@click.option("-ds", "--data-source") -def list_scanners(data_source: Optional[str] = None): - """List all available scanners""" - all_scanners = get_scanners_classes(data_source) - for scanner in sorted(all_scanners, key=lambda cls: cls.__name__): - click.echo(f"{scanner.__name__}") - - -@scanners_group.command() -@click.option("-s", "--scanner-name", required=True) -def run(scanner_name: str): - """Trigger a scanning process for given scanner""" - for data_source in kconfig.get("data_sources", []): - if scanner_name not in data_source["enabled"]: - continue - organizations = data_source.get("organizations", []) - if not organizations: - click.echo(f"No organizations to scan in {data_source} data source.") - continue - - scanner = get_scanner(scanner_name=scanner_name) - for org in organizations: - click.echo(f"Running {scanner.__name__} for {org}") - scanner(**org).scan() +@click.option("-s", "--data-source", "data_source_name", required=True) +def run(data_source_name: str): + """Trigger a scanning process for given data source""" + data_source_config = None + for ds_in_config in kconfig.get("data_sources", []): + if ds_in_config["name"] == data_source_name: + data_source_config = DataSourceConfig.from_dict(ds_in_config) + break + + if not data_source_config: + click.echo(f"Data source {data_source_name} not configured") + return + + data_source = data_source_config.get_object() + click.echo(f"Scanning {data_source_name}") + data_source.scan() diff --git a/kibble/data_sources/__init__.py b/kibble/data_sources/__init__.py new file mode 100644 index 0000000..13a8339 --- /dev/null +++ b/kibble/data_sources/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software 
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/kibble/data_sources/base/__init__.py b/kibble/data_sources/base/__init__.py new file mode 100644 index 0000000..13a8339 --- /dev/null +++ b/kibble/data_sources/base/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/kibble/data_sources/base/base_data_source.py b/kibble/data_sources/base/base_data_source.py new file mode 100644 index 0000000..4d44876 --- /dev/null +++ b/kibble/data_sources/base/base_data_source.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Dict, List, Optional + +from kibble.data_sources.base.module_loading import import_string + + +class BaseDataSource: + """Base class for all data sources""" + + data_types_classes: Dict[str, str] = {} + + def __init__(self, *, enabled_data_types: Optional[List[str]] = None): + self.enabled_data_types = enabled_data_types + + def scan(self): + """Collect data for configured data types""" + for data_type_name, klass in self.data_types_classes.items(): + if self.enabled_data_types and data_type_name not in self.enabled_data_types: + continue + data_type_class = import_string(klass) + data_type = data_type_class(data_source=self) + data_type.scan() diff --git a/kibble/scanners/base.py b/kibble/data_sources/base/base_data_type.py similarity index 68% rename from kibble/scanners/base.py rename to kibble/data_sources/base/base_data_type.py index d2d7de4..2d042b1 100644 --- a/kibble/scanners/base.py +++ b/kibble/data_sources/base/base_data_type.py @@ -19,13 +19,24 @@ from typing import Any -class BaseScanner: - """Abstract, base class for all scanners""" +class BaseDataType: + """Abstract, base class for all data types""" + + name: str # pylint: disable=too-few-public-methods def __init__(self, **kwargs): self.log = logging.getLogger(__name__) - def _persist(self, payload: Any): # pylint: disable=no-self-use - """Persists data to database. 
Should be implemented per scanner.""" + def fetch_data(self): # pylint: disable=no-self-use + """Fetch data from data source""" + raise NotImplementedError() + + def persist(self, payload: Any): # pylint: disable=no-self-use + """Persist collected data""" raise NotImplementedError() + + def scan(self): + """Persists data to database""" + payload = self.fetch_data() + self.persist(payload) diff --git a/kibble/data_sources/base/data_source_config.py b/kibble/data_sources/base/data_source_config.py new file mode 100644 index 0000000..1b7d154 --- /dev/null +++ b/kibble/data_sources/base/data_source_config.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Any, Dict, NamedTuple + +from kibble.data_sources.base.base_data_source import BaseDataSource +from kibble.data_sources.base.module_loading import import_string + + +class DataSourceConfig(NamedTuple): + """Data source configuration""" + + name: str + klass: str + config: Dict[str, Any] + + @classmethod + def from_dict(cls, dictionary: Dict): + """Make DataSourceConfig from a dictionary""" + return cls( + name=dictionary["name"], + klass=dictionary["class"], + config=dictionary["config"], + ) + + def get_object(self) -> BaseDataSource: + """Return data source object defined by this config""" + ds_class = import_string(self.klass) + return ds_class(**self.config) diff --git a/kibble/data_sources/base/module_loading.py b/kibble/data_sources/base/module_loading.py new file mode 100644 index 0000000..97763f0 --- /dev/null +++ b/kibble/data_sources/base/module_loading.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from importlib import import_module + + +def import_string(dotted_path: str): + """ + Import a dotted module path and return the attribute/class designated by the + last name in the path. Raise ImportError if the import failed. 
+ """ + try: + module_path, class_name = dotted_path.rsplit(".", 1) + except ValueError: + # pylint: disable =raise-missing-from + raise ImportError(f"{dotted_path} doesn't look like a module path") + + module = import_module(module_path) + + try: + return getattr(module, class_name) + except AttributeError: + # pylint: disable =raise-missing-from + raise ImportError(f'Module "{module_path}" does not define a "{class_name}" attribute/class') diff --git a/kibble/data_sources/github/__init__.py b/kibble/data_sources/github/__init__.py new file mode 100644 index 0000000..8b77d9a --- /dev/null +++ b/kibble/data_sources/github/__init__.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Any, Dict, List, Optional, Union + +from kibble.data_sources.base.base_data_source import BaseDataSource +from kibble.exceptions import KibbleException +from kibble.secrets.env_variable import get_secret_from_env + + +class GithubDataSource(BaseDataSource): + """Github datasource class""" + + name = "github" + data_types_classes = { + "pr_issues": "kibble.data_sources.github.data_types.pr_issues.GithubPrAndIssuesDataType" + } + + def __init__(self, *, repo_owner: str, repo_name: str, api_key: Optional[str] = None, **kwargs): + super().__init__(**kwargs) + self.repo_owner = repo_owner + self.repo_name = repo_name + self.api_key = api_key or get_secret_from_env("GH_API_KEY") + if not self.api_key: + raise KibbleException("No Github API_KEY") diff --git a/kibble/data_sources/github/data_types/__init__.py b/kibble/data_sources/github/data_types/__init__.py new file mode 100644 index 0000000..13a8339 --- /dev/null +++ b/kibble/data_sources/github/data_types/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
diff --git a/kibble/data_sources/github/data_types/base.py b/kibble/data_sources/github/data_types/base.py new file mode 100644 index 0000000..f2a8500 --- /dev/null +++ b/kibble/data_sources/github/data_types/base.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Any, Dict, List, Optional, Union +from urllib.parse import urlencode, urljoin + +import requests + +from kibble.data_sources.base.base_data_type import BaseDataType +from kibble.data_sources.github import GithubDataSource + + +class GithubBaseDataType(BaseDataType): + """Base data type class for Github""" + + def __init__(self, *, data_source: GithubDataSource, **kwargs): + super().__init__(**kwargs) + + self.repo_owner = data_source.repo_owner + self.repo_name = data_source.repo_name + self.repo_full_name = f"{self.repo_owner}/{self.repo_name}" + + self.base_url = "https://api.github.com" + self.headers = { + "Accept": "application/vnd.github.v3+json", + "Authorization": f"token {data_source.api_key}", + } + + def _send_request(self, endpoint: str, query: Optional[Dict] = None) -> Union[List, Dict]: + url = urljoin(self.base_url, endpoint) + url = f"{url}?{urlencode(query)}" if query else url + response = requests.get(url, headers=self.headers) + response.raise_for_status() + return response.json() + + def _persist(self, payload: Any): + print(f"Collected {len(payload)} from {self.repo_full_name}") + + def fetch_data(self): # pylint: disable=no-self-use + """Fetch data from data source""" + raise NotImplementedError() + + def persist(self, payload: Any): # pylint: disable=no-self-use + """Persist collected data""" + raise NotImplementedError() diff --git a/kibble/scanners/github.py b/kibble/data_sources/github/data_types/pr_issues.py similarity index 50% rename from kibble/scanners/github.py rename to kibble/data_sources/github/data_types/pr_issues.py index dff5fd7..001638d 100644 --- a/kibble/scanners/github.py +++ b/kibble/data_sources/github/data_types/pr_issues.py @@ -15,50 +15,20 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Any, Dict, List, Optional, Union -from urllib.parse import urlencode, urljoin +from typing import Any, Dict, List, Tuple -import requests +from kibble.data_sources.github.data_types.base import GithubBaseDataType -from kibble.scanners.base import BaseScanner -from kibble.secrets.env_variable import get_secret_from_env +Issue = Dict[str, Any] +PR = Dict[str, Any] -class GithubBaseScanner(BaseScanner): - """Github base scanner class""" +class GithubPrAndIssuesDataType(GithubBaseDataType): + """Github issues and pull requests""" - # pylint: disable=too-few-public-methods - data_source = "github" + name = "pr_and_issues" - def __init__(self, *, repo_owner: str, repo_name: str, api_key: Optional[str] = None, **kwargs): - super().__init__(**kwargs) - self.repo_owner = repo_owner - self.repo_name = repo_name - self.api_key = api_key or get_secret_from_env("GH_API_KEY") - self.repo_full_name = f"{self.repo_owner}/{self.repo_name}" - - self.base_url = "https://api.github.com" - self.headers = {"Accept": "application/vnd.github.v3+json"} - if api_key: - self.headers["Authorization"] = f"token {api_key}" - - def _send_request(self, endpoint: str, query: Optional[Dict] = None) -> Union[List, Dict]: - url = urljoin(self.base_url, endpoint) - url = f"{url}?{urlencode(query)}" if query else url - response = requests.get(url, headers=self.headers) - response.raise_for_status() - return response.json() - - def _persist(self, payload: Any): - pass - - -class GithubIssuesScanner(GithubBaseScanner): - """Github issues and pull requests scanner""" - - scanner_name = "github_issues" - - def scan(self): + def fetch_data(self): endpoint = f"/repos/{self.repo_owner}/{self.repo_name}/issues" query = {"per_page": 100, "page": 1} @@ -75,3 +45,8 @@ def scan(self): self.log.info("Collected %d issues and %d PRs from %s", len(issues), len(prs), self.repo_full_name) return issues, prs + + def persist(self, payload: Tuple[List[Issue], List[PR]]): + issues, prs = payload + 
self._persist(issues) + self._persist(prs) diff --git a/kibble/kibble.yaml b/kibble/kibble.yaml index 971601b..a1ac2ae 100644 --- a/kibble/kibble.yaml +++ b/kibble/kibble.yaml @@ -1,8 +1,9 @@ --- data_sources: - - name: github - organizations: - - repo_owner: apache - repo_name: kibble - enabled: - - github_issues + - name: github_kibble + class: kibble.data_sources.github.GithubDataSource + config: + repo_owner: apache + repo_name: kibble + enabled_data_types: + - pr_issues diff --git a/kibble/scanners/__init__.py b/kibble/scanners/__init__.py deleted file mode 100644 index 9051824..0000000 --- a/kibble/scanners/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from pathlib import Path -from typing import Any, Dict, List, Optional - -from kibble.exceptions import KibbleException - - -def get_scanners( - data_source: Optional[str] = None, -) -> Dict[str, Any]: - """ - Automatically discover all scanners. Scanner class is discoverable - only if it has ``scan`` method. Returns a dictionary where keys are - scanner types. - - :param data_source: if provided then only scanners for this data source are returned. 
- """ - scanner_classes: Dict[str, Any] = {} - path = Path(__file__).parent - for file in path.iterdir(): - if file.suffix != ".py" or file.name in ("__init__.py", "base.py"): - continue - py_file = file.stem - mod = __import__(".".join([__name__, py_file]), fromlist=[py_file]) - classes = [cls for x in dir(mod) if isinstance(cls := getattr(mod, x), type) and hasattr(cls, "scan")] - if data_source: - classes = [cls for cls in classes if cls.data_source == data_source] # type: ignore - for cls in classes: - scanner_ds = cls.data_source - scanner_classes[scanner_ds] = scanner_classes.get(scanner_ds, []) + [cls] - - return scanner_classes - - -def get_scanners_classes(data_source: Optional[str] = None) -> List[Any]: - """Returns all scanner classes""" - return sum(get_scanners(data_source).values(), []) - - -def get_scanner(scanner_name: str): - """Returns scanner by name""" - scanners_with_name = [cls for cls in get_scanners_classes() if cls.scanner_name == scanner_name] - if not scanners_with_name: - raise KibbleException(f"Scanner with name '{scanner_name}' is undefined") - return scanners_with_name[0] diff --git a/kibble/secrets/env_variable.py b/kibble/secrets/env_variable.py index 95856de..c3c36b1 100644 --- a/kibble/secrets/env_variable.py +++ b/kibble/secrets/env_variable.py @@ -26,3 +26,4 @@ def get_secret_from_env(key: str): secret = os.environ.get(env_key) if not secret: raise SecretNotFound(secret=env_key, secret_type="environment variables") + return secret diff --git a/setup.py b/setup.py index 882dcf6..fe3609d 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ "pytest==6.1.1", ] -INSTALL_REQUIREMENTS = ["requests>=2.25.1"] +INSTALL_REQUIREMENTS = ["requests>=2.25.1", "click>=8.0.1", "PyYAML>=5.4.1"] EXTRAS_REQUIREMENTS = {"devel": DEVEL_REQUIREMENTS} From b0359a036750d2f1e1f5530ce2e9c87a6b6b091e Mon Sep 17 00:00:00 2001 From: Tomasz Urbaszek Date: Fri, 11 Jun 2021 21:10:35 +0200 Subject: [PATCH 6/7] Add Elasticsearch --- docker-compose.dev.yaml | 37 
+++++ docs/architecture.rst | 126 ++++++++++++++++++ docs/index.rst | 16 +++ docs/installation.rst | 36 +++++ kibble/cli/commands/scanners_command.py | 2 +- kibble/data_sources/base/base_data_source.py | 65 ++++++++- kibble/data_sources/base/base_data_type.py | 44 ++++-- kibble/data_sources/github/__init__.py | 5 +- kibble/data_sources/github/data_types/base.py | 16 +-- .../data_types/{pr_issues.py => issues.py} | 23 +--- .../connection.py} | 28 +--- kibble/kibble.yaml | 5 +- setup.py | 5 +- 13 files changed, 333 insertions(+), 75 deletions(-) create mode 100644 docker-compose.dev.yaml create mode 100644 docs/architecture.rst create mode 100644 docs/index.rst create mode 100644 docs/installation.rst rename kibble/data_sources/github/data_types/{pr_issues.py => issues.py} (67%) rename kibble/{data_sources/base/data_source_config.py => database/connection.py} (51%) diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml new file mode 100644 index 0000000..52ac507 --- /dev/null +++ b/docker-compose.dev.yaml @@ -0,0 +1,37 @@ +--- +version: '3' + +services: + # Elasticsearch node required as a database for Apache Kibble + elasticsearch: + image: elasticsearch:7.13.1 + ports: + - 9200:9200 + - 9300:9300 + environment: + node.name: es01 + discovery.seed_hosts: es02 + cluster.initial_master_nodes: es01 + cluster.name: kibble + ES_JAVA_OPTS: -Xms256m -Xmx256m + ulimits: + memlock: + soft: -1 + hard: -1 + volumes: + - "kibble-es-data:/usr/share/elasticsearch/data" + + # Kibana to view and manage Elasticsearch + kibana: + image: kibana:7.13.1 + ports: + - 5601:5601 + depends_on: + - elasticsearch + environment: + ELASTICSEARCH_URL: http://elasticsearch:9200 + ELASTICSEARCH_HOSTS: http://elasticsearch:9200 + +volumes: + # named volumes can be managed easier using docker-compose + kibble-es-data: diff --git a/docs/architecture.rst b/docs/architecture.rst new file mode 100644 index 0000000..ce73621 --- /dev/null +++ b/docs/architecture.rst @@ -0,0 +1,126 @@ + .. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Apache Kibble Overview +====================== + +Kibble configuration +-------------------- + +Currently Apache Kibble is configured using `kibble.yaml` configuration file. + +Database configuration +...................... + +.. code-block:: + + elasticsearch: + hosts: + - http://localhost:9200 + +Data sources configuration +.......................... + +Multiple data sources can be configured. Each data source is defined by a python class. Additionally to that users +have to pass ``name`` and ``config`` which is a configuration specific for a given data source. + +.. code-block:: + + data_sources: + - name: name + class: path.to.a.Class + config: + # Data source specific configuration + +Data source +----------- + +Data source represents external source of information (for example Github, JIRA, mailing list etc). Each data source +is a python package. In this way users can easily build their own data sources and use them with Kibble. + +Data source package has to have the following structure: + +.. code-block:: + + data_source_name/ + | __init__.py + | ... + | data_types + | | __init__.py + | | type1.py + | | type2.py + | | ... 
+ +The ``data_source_name.__init__`` should include the class defining the data source but the class can be place in other +file in top leve directory of the package. + +Data types +.......... + +Data type represent single type of data within a data source. For example if Github is a data source then issues and +comments will be two different data types. A data type is a class that has to implement ``fetch_data`` method that is +used to fetch and persist data. + +Data types are automatically determined using data source class path. + +Each data type is an index in Kibble elasticsearch instance. The data should be stored "as is" so users can leverage existing +documentation. + +Next to persisting data, a data type should also define metrics that can be calculated on retrieved data. + +Configuring a data source +......................... + +As described previous a data sources can be configured in ``kibble.yaml`` config file. For example: + +.. code-block:: + + data_sources: + - name: kibble_github + class: kibble.data_sources.github.GithubDataSource + config: + repo_owner: apache + repo_name: kibble + enabled_data_types: + - issues + - discussions + + - name: pulsar_github + class: kibble.data_sources.github.GithubDataSource + config: + repo_owner: apache + repo_name: pulsar + enabled_data_types: + - issues + - comments + + - name: pulsar_dev_list + class: kibble.data_sources.pony.PonyDataSource + config: + list_name: dev@pulsar.apache.org + enabled_data_types: + - threads + +In the above example we can see that: + +* We configured two different data sources based on ``GithubDataSource``: apache/pulsar and apache/kibble Github repositories. + For both sources we fetch different information. For Kibble we fetch issues and discussions data while for Apache + Pulsar we fetch issues and comments data. +* There's also a third data source using ``PonyDataSource`` configured for Apache Pulsar dev list. 
+ +Thanks to this design users gain a big granularity on configuring the data they want to fetch. This also creates a big +chance for configuring different authorization option for each data source in future. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..106592b --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,16 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..375662d --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,36 @@ + .. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + .. http://www.apache.org/licenses/LICENSE-2.0 + + .. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. 
See the License for the + specific language governing permissions and limitations + under the License. + +Installation steps +================== + +To install Apache Kibble run: + +.. code-block:: + + pip install -e ".[devel]" + +You will also need an Elasticsearch instance up and running. You can set one up using docker-compose + +.. code-block:: + + docker-compose -f docker-compose.dev.yaml up + +Once ES is running you can scan configured data sources: + +.. code-block:: + kibble scanners run -s github_kibble diff --git a/kibble/cli/commands/scanners_command.py b/kibble/cli/commands/scanners_command.py index 0c23081..77c25f7 100644 --- a/kibble/cli/commands/scanners_command.py +++ b/kibble/cli/commands/scanners_command.py @@ -20,7 +20,7 @@ import click from kibble.configuration.yaml_config import kconfig -from kibble.data_sources.base.data_source_config import DataSourceConfig +from kibble.data_sources.base.base_data_source import DataSourceConfig @click.group(name="scanners") diff --git a/kibble/data_sources/base/base_data_source.py b/kibble/data_sources/base/base_data_source.py index 4d44876..fbc130d 100644 --- a/kibble/data_sources/base/base_data_source.py +++ b/kibble/data_sources/base/base_data_source.py @@ -15,24 +15,77 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Dict, List, Optional +import inspect +from functools import cached_property +from pathlib import Path +from typing import Any, Dict, List, NamedTuple, Optional +from kibble.data_sources.base.base_data_type import BaseDataType from kibble.data_sources.base.module_loading import import_string class BaseDataSource: """Base class for all data sources""" - data_types_classes: Dict[str, str] = {} + _data_types_folder = "data_types" + _excluded_files = {"base"} def __init__(self, *, enabled_data_types: Optional[List[str]] = None): self.enabled_data_types = enabled_data_types + def _get_data_type_classes(self) -> Dict[str, str]: + data_source_path = Path(inspect.getfile(self.__class__)) + data_types_dir = data_source_path.parent.joinpath(self._data_types_folder) + data_type_classes = {} + + for file in data_types_dir.iterdir(): + if file.stem in self._excluded_files or file.stem.startswith("_"): + continue + data_type_classes[file.stem] = f"{self.__module__}.{self._data_types_folder}.{file.stem}.DataType" + return data_type_classes + + @cached_property + def data_types_classes(self) -> Dict[str, str]: + """Returns data types defined in this data source""" + return self._get_data_type_classes() + def scan(self): """Collect data for configured data types""" - for data_type_name, klass in self.data_types_classes.items(): - if self.enabled_data_types and data_type_name not in self.enabled_data_types: + unscanned = [] + if not self.enabled_data_types: + print("No data types enabled") + return + + for data_type_name in self.enabled_data_types: + klass = self.data_types_classes.get(data_type_name) + if not klass: + unscanned.append(data_type_name) continue data_type_class = import_string(klass) - data_type = data_type_class(data_source=self) - data_type.scan() + data_type: BaseDataType = data_type_class(data_source=self) + data_type.fetch_data() + + if unscanned: + print(f"Found no data types for following configurations {unscanned}") + + +class 
DataSourceConfig(NamedTuple): + """Data source configuration""" + + name: str + klass: str + config: Dict[str, Any] + + @classmethod + def from_dict(cls, dictionary: Dict): + """Make DataSourceConfig from a dictionary""" + return cls( + name=dictionary["name"], + klass=dictionary["class"], + config=dictionary["config"], + ) + + def get_object(self) -> BaseDataSource: + """Return data source object defined by this config""" + ds_class = import_string(self.klass) + return ds_class(**self.config) diff --git a/kibble/data_sources/base/base_data_type.py b/kibble/data_sources/base/base_data_type.py index 2d042b1..9446314 100644 --- a/kibble/data_sources/base/base_data_type.py +++ b/kibble/data_sources/base/base_data_type.py @@ -16,15 +16,19 @@ # under the License. import logging -from typing import Any +from typing import Any, Dict, List, Optional + +from elasticsearch import RequestError + +from kibble.database.connection import es +from kibble.exceptions import KibbleException class BaseDataType: """Abstract, base class for all data types""" - name: str + _index: Optional[str] - # pylint: disable=too-few-public-methods def __init__(self, **kwargs): self.log = logging.getLogger(__name__) @@ -32,11 +36,31 @@ def fetch_data(self): # pylint: disable=no-self-use """Fetch data from data source""" raise NotImplementedError() - def persist(self, payload: Any): # pylint: disable=no-self-use - """Persist collected data""" - raise NotImplementedError() + def persist(self, payload: List[Any], doc_type: str, id_mapper): + """ + Persists the payload in data type index + + :param payload: List of documents to be persisted in ES + :param doc_type: Name of the document to be used + :param id_mapper: Function that takes a single document and retrieves its id that will + be used as document ID in ES. 
+ """ + if not self._index: + raise KibbleException(f"Data type {self.__class__.__name__} has no index defined") + + if not id_mapper: + raise KibbleException("id_mapper has to be specified to created id for document") + + try: + es.indices.create(index=self._index) + except RequestError as err: + if err.error != "resource_already_exists_exception": + raise + + for document in payload: + es.index(index=self._index, doc_type=doc_type, body=document, id=id_mapper(document)) - def scan(self): - """Persists data to database""" - payload = self.fetch_data() - self.persist(payload) + def read(self, query: Optional[Dict[str, Any]] = None): + """Read data from data type index""" + query = query or {"match_all": {}} + return es.search(index=self._index, body={"query": query}) diff --git a/kibble/data_sources/github/__init__.py b/kibble/data_sources/github/__init__.py index 8b77d9a..063d070 100644 --- a/kibble/data_sources/github/__init__.py +++ b/kibble/data_sources/github/__init__.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from typing import Any, Dict, List, Optional, Union +from typing import Optional from kibble.data_sources.base.base_data_source import BaseDataSource from kibble.exceptions import KibbleException @@ -26,9 +26,6 @@ class GithubDataSource(BaseDataSource): """Github datasource class""" name = "github" - data_types_classes = { - "pr_issues": "kibble.data_sources.github.data_types.pr_issues.GithubPrAndIssuesDataType" - } def __init__(self, *, repo_owner: str, repo_name: str, api_key: Optional[str] = None, **kwargs): super().__init__(**kwargs) diff --git a/kibble/data_sources/github/data_types/base.py b/kibble/data_sources/github/data_types/base.py index f2a8500..d7cb0a3 100644 --- a/kibble/data_sources/github/data_types/base.py +++ b/kibble/data_sources/github/data_types/base.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Any, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union from urllib.parse import urlencode, urljoin import requests @@ -24,9 +24,12 @@ from kibble.data_sources.github import GithubDataSource +# pylint: disable=abstract-method class GithubBaseDataType(BaseDataType): """Base data type class for Github""" + _index = "github" + def __init__(self, *, data_source: GithubDataSource, **kwargs): super().__init__(**kwargs) @@ -46,14 +49,3 @@ def _send_request(self, endpoint: str, query: Optional[Dict] = None) -> Union[Li response = requests.get(url, headers=self.headers) response.raise_for_status() return response.json() - - def _persist(self, payload: Any): - print(f"Collected {len(payload)} from {self.repo_full_name}") - - def fetch_data(self): # pylint: disable=no-self-use - """Fetch data from data source""" - raise NotImplementedError() - - def persist(self, payload: Any): # pylint: disable=no-self-use - """Persist collected data""" - raise NotImplementedError() diff --git a/kibble/data_sources/github/data_types/pr_issues.py b/kibble/data_sources/github/data_types/issues.py similarity index 67% rename from kibble/data_sources/github/data_types/pr_issues.py rename to kibble/data_sources/github/data_types/issues.py index 001638d..8e2ddbc 100644 --- a/kibble/data_sources/github/data_types/pr_issues.py +++ b/kibble/data_sources/github/data_types/issues.py @@ -15,38 +15,25 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Any, Dict, List, Tuple +from typing import Dict, List from kibble.data_sources.github.data_types.base import GithubBaseDataType -Issue = Dict[str, Any] -PR = Dict[str, Any] - -class GithubPrAndIssuesDataType(GithubBaseDataType): +class DataType(GithubBaseDataType): """Github issues and pull requests""" - name = "pr_and_issues" + _doc_type = "issue" def fetch_data(self): endpoint = f"/repos/{self.repo_owner}/{self.repo_name}/issues" query = {"per_page": 100, "page": 1} issues: List[Dict] = [] - prs: List[Dict] = [] self.log.info("Collecting Github issues and PRs from %s", self.repo_full_name) while new_issues := self._send_request(endpoint, query): for issue_pr in new_issues: - if "pull_request" in issue_pr: - prs.append(issue_pr) - else: - issues.append(issue_pr) + issues.append(issue_pr) query["page"] += 1 - self.log.info("Collected %d issues and %d PRs from %s", len(issues), len(prs), self.repo_full_name) - return issues, prs - - def persist(self, payload: Tuple[List[Issue], List[PR]]): - issues, prs = payload - self._persist(issues) - self._persist(prs) + self.persist(issues, doc_type=self._doc_type, id_mapper=lambda r: r["id"]) diff --git a/kibble/data_sources/base/data_source_config.py b/kibble/database/connection.py similarity index 51% rename from kibble/data_sources/base/data_source_config.py rename to kibble/database/connection.py index 1b7d154..099b763 100644 --- a/kibble/data_sources/base/data_source_config.py +++ b/kibble/database/connection.py @@ -15,29 +15,15 @@ # specific language governing permissions and limitations # under the License. 
-from typing import Any, Dict, NamedTuple +from elasticsearch import Elasticsearch -from kibble.data_sources.base.base_data_source import BaseDataSource -from kibble.data_sources.base.module_loading import import_string +from kibble.configuration.yaml_config import kconfig -class DataSourceConfig(NamedTuple): - """Data source configuration""" +def create_es() -> Elasticsearch: + """Creates ES instance connected to Kibble database""" + es_hosts = kconfig["elasticsearch"]["hosts"] + return Elasticsearch(es_hosts) - name: str - klass: str - config: Dict[str, Any] - @classmethod - def from_dict(cls, dictionary: Dict): - """Make DataSourceConfig from a dictionary""" - return cls( - name=dictionary["name"], - klass=dictionary["class"], - config=dictionary["config"], - ) - - def get_object(self) -> BaseDataSource: - """Return data source object defined by this config""" - ds_class = import_string(self.klass) - return ds_class(**self.config) +es = create_es() diff --git a/kibble/kibble.yaml b/kibble/kibble.yaml index a1ac2ae..f318d03 100644 --- a/kibble/kibble.yaml +++ b/kibble/kibble.yaml @@ -1,4 +1,7 @@ --- +elasticsearch: + hosts: + - http://localhost:9200 data_sources: - name: github_kibble class: kibble.data_sources.github.GithubDataSource @@ -6,4 +9,4 @@ data_sources: repo_owner: apache repo_name: kibble enabled_data_types: - - pr_issues + - issues diff --git a/setup.py b/setup.py index fe3609d..02c696c 100644 --- a/setup.py +++ b/setup.py @@ -16,12 +16,13 @@ # under the License. 
import os +from pathlib import Path from setuptools import find_packages, setup VERSION = "2.0.0dev" -BASE_PATH = os.path.dirname(os.path.realpath(__file__)) +BASE_PATH = Path(__file__).parent DEVEL_REQUIREMENTS = [ "black==20.8b1", @@ -30,7 +31,7 @@ "pytest==6.1.1", ] -INSTALL_REQUIREMENTS = ["requests>=2.25.1", "click>=8.0.1", "PyYAML>=5.4.1"] +INSTALL_REQUIREMENTS = ["elasticsearch==7.13.1", "requests>=2.25.1", "click>=8.0.1", "PyYAML>=5.4.1"] EXTRAS_REQUIREMENTS = {"devel": DEVEL_REQUIREMENTS} From 0fb16e6db7cb9847601c20680e93a5c56725028d Mon Sep 17 00:00:00 2001 From: sharanf Date: Sun, 13 Jun 2021 17:41:40 +0200 Subject: [PATCH 7/7] Update architecture.rst Minor text changes --- docs/architecture.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/architecture.rst b/docs/architecture.rst index ce73621..daec7a2 100644 --- a/docs/architecture.rst +++ b/docs/architecture.rst @@ -49,7 +49,7 @@ have to pass ``name`` and ``config`` which is a configuration specific for a giv Data source ----------- -Data source represents external source of information (for example Github, JIRA, mailing list etc). Each data source +Data source represents an external source of information (for example Github, JIRA, mailing list etc). Each data source is a python package. In this way users can easily build their own data sources and use them with Kibble. Data source package has to have the following structure: @@ -65,13 +65,13 @@ Data source package has to have the following structure: | | type2.py | | ... -The ``data_source_name.__init__`` should include the class defining the data source but the class can be place in other +The ``data_source_name.__init__`` should include the class defining the data source but the class can be placed in another file in top leve directory of the package. Data types .......... -Data type represent single type of data within a data source. 
For example if Github is a data source then issues and +Data type represents a single type of data within a data source. For example if Github is a data source then issues and comments will be two different data types. A data type is a class that has to implement ``fetch_data`` method that is used to fetch and persist data. @@ -85,7 +85,7 @@ Next to persisting data, a data type should also define metrics that can be calc Configuring a data source ......................... -As described previous a data sources can be configured in ``kibble.yaml`` config file. For example: +As described previously, data sources can be configured in the ``kibble.yaml`` config file. For example: .. code-block:: @@ -122,5 +122,5 @@ In the above example we can see that: Pulsar we fetch issues and comments data. * There's also a third data source using ``PonyDataSource`` configured for Apache Pulsar dev list. -Thanks to this design users gain a big granularity on configuring the data they want to fetch. This also creates a big -chance for configuring different authorization option for each data source in future. +Thanks to this design, users gain more granularity to configure the data they want to fetch. This also creates a big +opportunity for configuring different authorization options for each data source in the future.