From 5f7754a0eba37c5b27baae57f59fc73aec234dd0 Mon Sep 17 00:00:00 2001 From: kompotkot Date: Fri, 14 May 2021 10:43:02 +0000 Subject: [PATCH 1/3] Minor fixes in structure --- mirror/__init__.py | 2 +- mirror/github/allrepos.py | 20 ++++++-------------- mirror/github/clone_repos.py | 9 ++------- mirror/github/commits.py | 20 ++++---------------- mirror/github/data.py | 6 ++---- mirror/github/generate_snippets.py | 6 ++---- mirror/github/licenses.py | 7 +++---- mirror/github/search.py | 19 +++++-------------- mirror/github/sync.py | 5 ++--- mirror/github/utils.py | 7 +++---- mirror/settings.py | 16 ++++++++++++---- 11 files changed, 42 insertions(+), 75 deletions(-) diff --git a/mirror/__init__.py b/mirror/__init__.py index ae889c6..ad0bf0b 100644 --- a/mirror/__init__.py +++ b/mirror/__init__.py @@ -7,7 +7,7 @@ __email__ = "engineering@bugout.dev" __license__ = "MIT" -__version__ = "0.2.6" +__version__ = "0.2.7" __all__ = ( "__author__", diff --git a/mirror/github/allrepos.py b/mirror/github/allrepos.py index 336272f..f9dc051 100644 --- a/mirror/github/allrepos.py +++ b/mirror/github/allrepos.py @@ -3,9 +3,6 @@ Support checkpointing against a small state object - the integer ID of the last repository seen. """ - -import argparse -import csv import json import glob import multiprocessing @@ -13,21 +10,17 @@ import random import sys import time -from typing import Any, Callable, Dict, Iterator, List, Optional, TextIO, Tuple +from typing import Any, Dict, Iterator, List, Tuple import click import requests from tqdm import tqdm # type: ignore -from ..populate import populate_cli -from ..settings import GITHUB_TOKEN +from .. import settings subcommand = "allrepos" -REPOSITORIES_URL = "https://api.github.com/repositories" -REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" - def crawl( start_id: int, max_id: int, interval: float, min_rate_limit: int @@ -60,15 +53,14 @@ def crawl( "Accept": "application/vnd.github.v3+json", "User-Agent": "simiotics mirror", } - github_token = GITHUB_TOKEN - if github_token is not None and github_token != "": - headers["Authorization"] = f"token {github_token}" + if settings.GITHUB_TOKEN is not None and settings.GITHUB_TOKEN != "": + headers["Authorization"] = f"token {settings.GITHUB_TOKEN}" since = start_id curr_rate_limit = min_rate_limit + 10 while since is not None and since < max_id and curr_rate_limit > min_rate_limit: time.sleep(interval) - r = requests.get(REPOSITORIES_URL, params={"since": since}, headers=headers) + r = requests.get(settings.REPOSITORIES_URL, params={"since": since}, headers=headers) response_body = r.json() if not response_body: break @@ -76,7 +68,7 @@ def crawl( result["data"].extend(response_body) # type: ignore since = response_body[-1].get("id") - curr_rate_limit_raw = r.headers.get(REMAINING_RATELIMIT_HEADER) + curr_rate_limit_raw = r.headers.get(settings.REMAINING_RATELIMIT_HEADER) try: curr_rate_limit = -1 if curr_rate_limit_raw is not None: diff --git a/mirror/github/clone_repos.py b/mirror/github/clone_repos.py index d8c1588..9a60532 100644 --- a/mirror/github/clone_repos.py +++ b/mirror/github/clone_repos.py @@ -1,19 +1,14 @@ import os import json -import time import traceback import subprocess from typing import Optional import click -import requests -from ..settings import module_version +from .. import settings from .utils import get_nearest_value, read_command_type, forward_languages_config -DATETIME_HEADER = "Date" - - class CommandNotExistError(Exception): """Raised when coomand is not exist.""" @@ -59,7 +54,7 @@ def create_dir_meta_if_not_exists(lang_path: str, meta_file: str, lang: str): "language": lang, "repos": [], "crawled_at": None, - "mirror version": module_version, + "mirror version": settings.MIRROR_VERSION, }, meta, ) diff --git a/mirror/github/commits.py b/mirror/github/commits.py index 973e0f0..5280946 100644 --- a/mirror/github/commits.py +++ b/mirror/github/commits.py @@ -1,29 +1,17 @@ -import re import os import csv -import sys import json -import time -import glob import zipfile -import string -import traceback -from pathlib import Path from typing import Optional from .utils import flatten_json, get_nearest_value -import requests import click -from ..settings import GITHUB_TOKEN +from .. import settings from .utils import write_with_size, read_command_type, request_with_limit from .data import CommitPublic - -DATETIME_HEADER = "Date" - - validate_models = {"CommitPublic": CommitPublic} @@ -204,13 +192,13 @@ def commits( os.makedirs(crawldir) if not token: - token = GITHUB_TOKEN + token = settings.GITHUB_TOKEN headers = { "accept": "application/vnd.github.v3+json", } - if GITHUB_TOKEN is not None: + if settings.GITHUB_TOKEN is not None: headers["Authorization"] = f"token {token}" else: click.echo(f"start with low rate limit") @@ -264,7 +252,7 @@ def commits( license = repo["license"] # date of creating that commits file - date = commits_responce.headers.get(DATETIME_HEADER) + date = commits_responce.headers.get(settings.DATETIME_HEADER) # Indexing writer.writerow( diff --git a/mirror/github/data.py b/mirror/github/data.py index c97a537..8b93b3c 100644 --- a/mirror/github/data.py +++ b/mirror/github/data.py @@ -1,10 +1,8 @@ # pylint: disable=no-name-in-module # pylint: disable=no-self-argument +from typing import Optional -from datetime import datetime -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field +from pydantic import BaseModel class MyBaseModel(BaseModel): diff --git a/mirror/github/generate_snippets.py b/mirror/github/generate_snippets.py index 93e651c..20e81fe 100644 --- a/mirror/github/generate_snippets.py +++ b/mirror/github/generate_snippets.py @@ -222,9 +222,7 @@ def generate_datasets( rows_step = chunksize if not clone_dir: - clone_dir = os.environ.get("CLONE_DIR") - if not clone_dir: - raise ReadReposDirectoryError("CLONE_DIR not set.") + clone_dir = settings.CLONE_DIR # Read languages config file try: @@ -335,7 +333,7 @@ def generate_datasets( json.dump( { - "mirror version": settings.module_version, + "mirror version": settings.MIRROR_VERSION, "date": f"{datetime.now()}", "languages init config": language_to_extensions, "chunksize": chunksize, diff --git a/mirror/github/licenses.py b/mirror/github/licenses.py index 521887c..b68667a 100644 --- a/mirror/github/licenses.py +++ b/mirror/github/licenses.py @@ -1,8 +1,6 @@ """ Collect license information for a repository or a list of repositories """ - -import argparse import json import os import sys @@ -11,7 +9,8 @@ from typing import Any, Dict, List import requests -from tqdm import tqdm # type: ignore + +from .. import settings subcommand = "licenses" @@ -33,7 +32,7 @@ def get_license(repo_api_url: str) -> Dict[str, Any]: "Accept": "application/vnd.github.v3+json", "User-Agent": "simiotics mirror", } - github_token = os.environ.get("GITHUB_TOKEN") + github_token = settings.GITHUB_TOKEN if github_token is not None and github_token != "": headers["Authorization"] = f"token {github_token}" diff --git a/mirror/github/search.py b/mirror/github/search.py index b67625d..d26db80 100644 --- a/mirror/github/search.py +++ b/mirror/github/search.py @@ -1,24 +1,15 @@ import os -import csv import json -import time import string import traceback import urllib.parse -from pathlib import Path -from typing import Optional, Tuple - +from typing import Optional import click -import requests -from ..settings import * +from .. import settings from .utils import forward_languages_config, request_with_limit - -DATETIME_HEADER = "Date" - - class Error(Exception): """Base class for exceptions in this module.""" @@ -128,13 +119,13 @@ def popular_repos( """ if not token: - token = GITHUB_TOKEN + token = settings.GITHUB_TOKEN headers = { "accept": "application/vnd.github.v3+json", } - if GITHUB_TOKEN is not None: + if settings.GITHUB_TOKEN is not None: headers["Authorization"] = f"token {token}" else: click.echo(f"start with low rate limit") @@ -207,7 +198,7 @@ def popular_repos( write_repos( data, alredy_parsed, - search_response.headers.get(DATETIME_HEADER), + search_response.headers.get(settings.DATETIME_HEADER), files_counter, crawldir, language, diff --git a/mirror/github/sync.py b/mirror/github/sync.py index ceef18d..5c0797d 100644 --- a/mirror/github/sync.py +++ b/mirror/github/sync.py @@ -1,16 +1,15 @@ """ Synchronize repository metadata into a SQLite database """ - -import argparse from datetime import datetime, timezone import json import sqlite3 import sys from typing import Any, Dict, Iterator, List, Optional, Tuple -from tqdm import tqdm # type: ignore import click +from tqdm import tqdm # type: ignore + from .allrepos import ordered_crawl diff --git a/mirror/github/utils.py b/mirror/github/utils.py index 89e0560..54434cb 100644 --- a/mirror/github/utils.py +++ b/mirror/github/utils.py @@ -7,8 +7,7 @@ import click import requests -REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" -X_RATELIMIT_RESET = "X-RateLimit-Reset" +from .. import settings def request_with_limit(url, headers, min_rate_limit): @@ -17,12 +16,12 @@ def request_with_limit(url, headers, min_rate_limit): response = requests.get(url, headers=headers) - rate_limit_raw = response.headers.get(REMAINING_RATELIMIT_HEADER) + rate_limit_raw = response.headers.get(settings.REMAINING_RATELIMIT_HEADER) if rate_limit_raw is not None: current_rate_limit = int(rate_limit_raw) if current_rate_limit <= min_rate_limit: - reset_time = response.headers.get(X_RATELIMIT_RESET) + reset_time = response.headers.get(settings.RESET_RATELIMIT_HEADER) time.sleep(abs(int(reset_time) - int(time.time())) + 1) else: break diff --git a/mirror/settings.py b/mirror/settings.py index 48ae83c..a374bf6 100644 --- a/mirror/settings.py +++ b/mirror/settings.py @@ -1,15 +1,23 @@ import os -import uuid -from typing import Optional from . import __version__ -MODULE_NAME = "mirror" +MIRROR_VERSION = __version__ -module_version = __version__ +DATETIME_HEADER = "Date" + +REPOSITORIES_URL = "https://api.github.com/repositories" +REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" +RESET_RATELIMIT_HEADER = "X-RateLimit-Reset" GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") +if GITHUB_TOKEN is None: + raise ValueError("GITHUB_TOKEN environment variable must be set") + CLONE_DIR = os.environ.get("CLONE_DIR") +if CLONE_DIR is None: + raise ValueError("CLONE_DIR environment variable must be set") + MIRROR_CRAWL_INTERVAL_SECONDS = os.environ.get("MIRROR_CRAWL_INTERVAL_SECONDS") MIRROR_CRAWL_MIN_RATE_LIMIT = os.environ.get("MIRROR_CRAWL_MIN_RATE_LIMIT") MIRROR_CRAWL_BATCH_SIZE = os.environ.get("MIRROR_CRAWL_BATCH_SIZE") From aba1af452b5c3ae8488cd4c814c694ee4296fc89 Mon Sep 17 00:00:00 2001 From: kompotkot Date: Fri, 14 May 2021 16:21:38 +0000 Subject: [PATCH 2/3] Added forks handler --- mirror/cli.py | 3 +- mirror/db/__init__.py | 0 mirror/{github/db_tool.py => db/db.py} | 0 mirror/github/allrepos.py | 12 +++--- mirror/github/calls.py | 42 +++++++++++++++++++++ mirror/github/clone_repos.py | 8 +++- mirror/github/commits.py | 3 ++ mirror/github/data.py | 10 +++++ mirror/github/forks.py | 52 ++++++++++++++++++++++++++ mirror/github/generate_snippets.py | 16 +++++--- mirror/github/licenses.py | 2 +- mirror/github/search.py | 4 ++ mirror/github/sync.py | 2 +- mirror/settings.py | 12 +----- 14 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 mirror/db/__init__.py rename mirror/{github/db_tool.py => db/db.py} (100%) create mode 100644 mirror/github/calls.py create mode 100644 mirror/github/forks.py diff --git a/mirror/cli.py b/mirror/cli.py index 45922d2..611fa36 100644 --- a/mirror/cli.py +++ b/mirror/cli.py @@ -1,4 +1,5 @@ import click + from . import __version__ from .github.allrepos import crawl_handler as crawl_populator from .github.allrepos import nextid_handler as nextid_populator @@ -8,8 +9,6 @@ from .github.search import popular_repos from .github.clone_repos import clone_repos from .github.generate_snippets import generate_datasets -from .github.sync import handler as sync_populator -from .github.licenses import licenses_handler as licenses_populator @click.group() diff --git a/mirror/db/__init__.py b/mirror/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mirror/github/db_tool.py b/mirror/db/db.py similarity index 100% rename from mirror/github/db_tool.py rename to mirror/db/db.py diff --git a/mirror/github/allrepos.py b/mirror/github/allrepos.py index f9dc051..407f478 100644 --- a/mirror/github/allrepos.py +++ b/mirror/github/allrepos.py @@ -17,7 +17,7 @@ import requests from tqdm import tqdm # type: ignore -from .. import settings +from ..settings import GITHUB_API_URL, GITHUB_TOKEN, REMAINING_RATELIMIT_HEADER subcommand = "allrepos" @@ -53,14 +53,16 @@ def crawl( "Accept": "application/vnd.github.v3+json", "User-Agent": "simiotics mirror", } - if settings.GITHUB_TOKEN is not None and settings.GITHUB_TOKEN != "": - headers["Authorization"] = f"token {settings.GITHUB_TOKEN}" + if GITHUB_TOKEN is not None and GITHUB_TOKEN != "": + headers["Authorization"] = f"token {GITHUB_TOKEN}" since = start_id curr_rate_limit = min_rate_limit + 10 while since is not None and since < max_id and curr_rate_limit > min_rate_limit: time.sleep(interval) - r = requests.get(settings.REPOSITORIES_URL, params={"since": since}, headers=headers) + r = requests.get( + f"{GITHUB_API_URL}/repositories", params={"since": since}, headers=headers + ) response_body = r.json() if not response_body: break @@ -68,7 +70,7 @@ def crawl( result["data"].extend(response_body) # type: ignore since = response_body[-1].get("id") - curr_rate_limit_raw = r.headers.get(settings.REMAINING_RATELIMIT_HEADER) + curr_rate_limit_raw = r.headers.get(REMAINING_RATELIMIT_HEADER) try: curr_rate_limit = -1 if curr_rate_limit_raw is not None: diff --git a/mirror/github/calls.py b/mirror/github/calls.py new file mode 100644 index 0000000..ff0adc8 --- /dev/null +++ b/mirror/github/calls.py @@ -0,0 +1,42 @@ +""" +Processing requests to GitHub API. +""" +import logging +from typing import Any, Dict, List + +import requests + +from ..settings import GITHUB_API_URL, GITHUB_API_REQUEST_TIMEOUT + +logger = logging.getLogger(__name__) + + +class GitHubApiCallFailed(Exception): + """ + Raised on actions that involve calls to GitHub API which are failed. + """ + + +def fetch_repository_forks( + owner: str, + repo: str, + sort: str = "newest", + per_page: int = 100, + page: int = 1, +) -> List[Dict[str, Any]]: + """ + Fetch forks for provided repository from GitHub. + """ + url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/forks" + headers = {"Accept": "application/vnd.github.v3+json"} + params = {"sort": sort, "per_page": per_page, "page": page} + try: + r = requests.get( + url, headers=headers, params=params, timeout=GITHUB_API_REQUEST_TIMEOUT + ) + r.raise_for_status() + response = r.json() + except Exception as e: + logger.error(repr(e)) + raise GitHubApiCallFailed("An error occurred due fetching forks via GitHub API") + return response diff --git a/mirror/github/clone_repos.py b/mirror/github/clone_repos.py index 9a60532..67ead8c 100644 --- a/mirror/github/clone_repos.py +++ b/mirror/github/clone_repos.py @@ -1,3 +1,6 @@ +""" +Cloning repository workflow. +""" import os import json import traceback @@ -6,9 +9,10 @@ import click -from .. import settings +from .. import __version__ from .utils import get_nearest_value, read_command_type, forward_languages_config + class CommandNotExistError(Exception): """Raised when coomand is not exist.""" @@ -54,7 +58,7 @@ def create_dir_meta_if_not_exists(lang_path: str, meta_file: str, lang: str): "language": lang, "repos": [], "crawled_at": None, - "mirror version": settings.MIRROR_VERSION, + "mirror version": __version__, }, meta, ) diff --git a/mirror/github/commits.py b/mirror/github/commits.py index 5280946..2e626d1 100644 --- a/mirror/github/commits.py +++ b/mirror/github/commits.py @@ -1,3 +1,6 @@ +""" +Process commits for repository. +""" import os import csv import json diff --git a/mirror/github/data.py b/mirror/github/data.py index 8b93b3c..9ce35ee 100644 --- a/mirror/github/data.py +++ b/mirror/github/data.py @@ -17,3 +17,13 @@ class CommitPublic(MyBaseModel): html_url: Optional[str] = None author_html_url: Optional[str] = None committer_html_url: Optional[str] = None + + +class RepositoryFork(BaseModel): + name: Optional[str] = None + full_name: Optional[str] = None + owner: Optional[str] = None + html_url: Optional[str] = None + forks_count: Optional[str] = None + created_at: Optional[str] = None + updated_at: Optional[str] = None diff --git a/mirror/github/forks.py b/mirror/github/forks.py new file mode 100644 index 0000000..53f2387 --- /dev/null +++ b/mirror/github/forks.py @@ -0,0 +1,52 @@ +import logging +import time +from typing import List + +from . import calls +from .data import RepositoryFork + +logger = logging.getLogger(__name__) + +interval = 1 + + +def get_repository_forks(owner: str, repo: str) -> List[RepositoryFork]: + """ + Parse repository forks and return organized pydantic data. + """ + forks: List[RepositoryFork] = [] + + page = 1 + while True: + try: + time.sleep(interval) + forks_raw = calls.fetch_repository_forks( + owner=owner, repo=repo, per_page=100, page=page + ) + for fork_raw in forks_raw: + forks.append( + RepositoryFork( + name=fork_raw.get("name"), + full_name=fork_raw.get("full_name"), + owner=fork_raw.get("owner").get("login") + if fork_raw.get("owner") is not None + else None, + html_url=fork_raw.get("html_url"), + forks_count=fork_raw.get("created_at"), + created_at=fork_raw.get("updated_at"), + updated_at=fork_raw.get("forks_count"), + ) + ) + if len(forks_raw) == 0: + logger.info( + f"Parsing of repository forks finished, total number of forks: {len(forks)}" + ) + break + except Exception: + logger.error( + f"Unexpected error occurred due parsing repository forks" + ) + break + page += 1 + + return forks diff --git a/mirror/github/generate_snippets.py b/mirror/github/generate_snippets.py index 20e81fe..afdd9de 100644 --- a/mirror/github/generate_snippets.py +++ b/mirror/github/generate_snippets.py @@ -1,3 +1,6 @@ +""" +Snippets generator. +""" import base64 from collections import defaultdict from datetime import datetime @@ -10,7 +13,8 @@ import click -from . import db_tool +from ..db import db +from .. import __version__ from .. import settings @@ -223,6 +227,8 @@ def generate_datasets( if not clone_dir: clone_dir = settings.CLONE_DIR + if clone_dir is None: + raise ReadReposDirectoryError("CLONE_DIR environment variable must be set") # Read languages config file try: @@ -255,8 +261,8 @@ def generate_datasets( os.makedirs(snippets_dir) # Create connection - conn = db_tool.create_connection(os.path.join(snippets_dir, "snippets.db")) - db_tool.create_snippets_table(conn) + conn = db.create_connection(os.path.join(snippets_dir, "snippets.db")) + db.create_snippets_table(conn) crawled_repos: Dict[str, Dict[str, Union[str, None]]] = {} @@ -315,7 +321,7 @@ def generate_datasets( for chunk_data in chunks ] - db_tool.write_snippet_to_db(conn, batch) + db.write_snippet_to_db(conn, batch) if not chunks: break @@ -333,7 +339,7 @@ def generate_datasets( json.dump( { - "mirror version": settings.MIRROR_VERSION, + "mirror version": __version__, "date": f"{datetime.now()}", "languages init config": language_to_extensions, "chunksize": chunksize, diff --git a/mirror/github/licenses.py b/mirror/github/licenses.py index b68667a..ccd10d9 100644 --- a/mirror/github/licenses.py +++ b/mirror/github/licenses.py @@ -1,5 +1,5 @@ """ -Collect license information for a repository or a list of repositories +Collect license information for a repository or a list of repositories. """ import json import os diff --git a/mirror/github/search.py b/mirror/github/search.py index d26db80..e9762a4 100644 --- a/mirror/github/search.py +++ b/mirror/github/search.py @@ -1,3 +1,6 @@ +""" +Popular repositories search engine. +""" import os import json import string @@ -10,6 +13,7 @@ from .. import settings from .utils import forward_languages_config, request_with_limit + class Error(Exception): """Base class for exceptions in this module.""" diff --git a/mirror/github/sync.py b/mirror/github/sync.py index 5c0797d..74cae82 100644 --- a/mirror/github/sync.py +++ b/mirror/github/sync.py @@ -1,5 +1,5 @@ """ -Synchronize repository metadata into a SQLite database +Synchronize repository metadata with local SQLite database. """ from datetime import datetime, timezone import json diff --git a/mirror/settings.py b/mirror/settings.py index a374bf6..654aa82 100644 --- a/mirror/settings.py +++ b/mirror/settings.py @@ -1,22 +1,14 @@ import os -from . import __version__ - -MIRROR_VERSION = __version__ - DATETIME_HEADER = "Date" -REPOSITORIES_URL = "https://api.github.com/repositories" +GITHUB_API_URL = "https://api.github.com" REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" RESET_RATELIMIT_HEADER = "X-RateLimit-Reset" +GITHUB_API_REQUEST_TIMEOUT = 2 GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") -if GITHUB_TOKEN is None: - raise ValueError("GITHUB_TOKEN environment variable must be set") - CLONE_DIR = os.environ.get("CLONE_DIR") -if CLONE_DIR is None: - raise ValueError("CLONE_DIR environment variable must be set") MIRROR_CRAWL_INTERVAL_SECONDS = os.environ.get("MIRROR_CRAWL_INTERVAL_SECONDS") MIRROR_CRAWL_MIN_RATE_LIMIT = os.environ.get("MIRROR_CRAWL_MIN_RATE_LIMIT") From 029ea848cbe371b7c04afaac37bd20efbbefc65b Mon Sep 17 00:00:00 2001 From: kompotkot Date: Mon, 17 May 2021 09:31:04 +0000 Subject: [PATCH 3/3] Cli for forks, new entry point for argparse cli --- mirror/__init__.py | 2 +- mirror/{populate.py => cli_argp.py} | 20 ++++++++++++ mirror/github/calls.py | 8 +++-- mirror/github/data.py | 10 ++++-- mirror/github/forks.py | 47 ++++++++++++++++++++--------- mirror/settings.py | 2 +- setup.py | 9 ++++-- 7 files changed, 75 insertions(+), 23 deletions(-) rename mirror/{populate.py => cli_argp.py} (69%) diff --git a/mirror/__init__.py b/mirror/__init__.py index ad0bf0b..6f2c04a 100644 --- a/mirror/__init__.py +++ b/mirror/__init__.py @@ -3,7 +3,7 @@ """ __author__ = "Bugout" __maintainer__ = __author__ -__description__ = "Tools for software project analysis" +__description__ = "Tools for GitHub software project analysis" __email__ = "engineering@bugout.dev" __license__ = "MIT" diff --git a/mirror/populate.py b/mirror/cli_argp.py similarity index 69% rename from mirror/populate.py rename to mirror/cli_argp.py index 8b80ac3..1567939 100644 --- a/mirror/populate.py +++ b/mirror/cli_argp.py @@ -6,6 +6,9 @@ import argparse from typing import Callable, Dict +from .github import forks +from . import __version__ + def populate_cli( parser: argparse.ArgumentParser, @@ -30,3 +33,20 @@ def populate_cli( for subcommand, populator in subcommand_populators.items(): subparser = subcommand_parsers.add_parser(subcommand) populator(subparser) + + +def main(): + parser = argparse.ArgumentParser( + description="Mirror: Tools for GitHub software project analysis", + epilog=f"Version {__version__}", + ) + subcommand = parser.add_subparsers(description="Mirror commands") + + forks.mutate_argparser(subcommand) + + args = parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/mirror/github/calls.py b/mirror/github/calls.py index ff0adc8..2fd085f 100644 --- a/mirror/github/calls.py +++ b/mirror/github/calls.py @@ -2,7 +2,7 @@ Processing requests to GitHub API. """ import logging -from typing import Any, Dict, List +from typing import Any, Dict, List, Union import requests @@ -29,7 +29,11 @@ def fetch_repository_forks( """ url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/forks" headers = {"Accept": "application/vnd.github.v3+json"} - params = {"sort": sort, "per_page": per_page, "page": page} + params: Dict[str, Union[str, int]] = { + "sort": sort, + "per_page": per_page, + "page": page, + } try: r = requests.get( url, headers=headers, params=params, timeout=GITHUB_API_REQUEST_TIMEOUT diff --git a/mirror/github/data.py b/mirror/github/data.py index 9ce35ee..d76c4ce 100644 --- a/mirror/github/data.py +++ b/mirror/github/data.py @@ -1,8 +1,8 @@ # pylint: disable=no-name-in-module # pylint: disable=no-self-argument -from typing import Optional +from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field class MyBaseModel(BaseModel): @@ -27,3 +27,9 @@ class RepositoryFork(BaseModel): forks_count: Optional[str] = None created_at: Optional[str] = None updated_at: Optional[str] = None + + +class RepositoryForksList(BaseModel): + owner: str + repo: str + forks: List[RepositoryFork] = Field(default_factory=set) diff --git a/mirror/github/forks.py b/mirror/github/forks.py index 53f2387..47322ca 100644 --- a/mirror/github/forks.py +++ b/mirror/github/forks.py @@ -1,16 +1,17 @@ +import argparse import logging import time from typing import List from . import calls -from .data import RepositoryFork +from .data import RepositoryFork, RepositoryForksList logger = logging.getLogger(__name__) -interval = 1 - -def get_repository_forks(owner: str, repo: str) -> List[RepositoryFork]: +def get_repository_forks( + owner: str, repo: str, sleep_interval: int = 1 +) -> RepositoryForksList: """ Parse repository forks and return organized pydantic data. """ @@ -19,22 +20,23 @@ def get_repository_forks(owner: str, repo: str) -> List[RepositoryFork]: page = 1 while True: try: - time.sleep(interval) + time.sleep(sleep_interval) forks_raw = calls.fetch_repository_forks( owner=owner, repo=repo, per_page=100, page=page ) for fork_raw in forks_raw: + owner_dict = fork_raw.get("owner") forks.append( RepositoryFork( name=fork_raw.get("name"), full_name=fork_raw.get("full_name"), - owner=fork_raw.get("owner").get("login") - if fork_raw.get("owner") is not None + owner=owner_dict.get("login") + if owner_dict is not None else None, html_url=fork_raw.get("html_url"), - forks_count=fork_raw.get("created_at"), - created_at=fork_raw.get("updated_at"), - updated_at=fork_raw.get("forks_count"), + forks_count=fork_raw.get("forks_count"), + created_at=fork_raw.get("created_at"), + updated_at=fork_raw.get("updated_at"), ) ) if len(forks_raw) == 0: @@ -43,10 +45,27 @@ def get_repository_forks(owner: str, repo: str) -> List[RepositoryFork]: ) break except Exception: - logger.error( - f"Unexpected error occurred due parsing repository forks" - ) + logger.error(f"Unexpected error occurred due parsing repository forks") break page += 1 - return forks + return RepositoryForksList(owner=owner, repo=repo, forks=forks) + + +def cli_forks_handler(args: argparse.Namespace) -> None: + forks = get_repository_forks(args.owner, args.repo) + print(forks.json()) + + +def mutate_argparser(subcommand) -> None: + """ + Mutates the provided parser with GitHub Forks functionality. + """ + parser_forks = subcommand.add_parser("forks", description="Mirror forks") + parser_forks.add_argument( + "-o", "--owner", required=True, help="GitHub username or organization name" + ) + parser_forks.add_argument( + "-r", "--repo", required=True, help="GitHub repository name" + ) + parser_forks.set_defaults(func=cli_forks_handler) diff --git a/mirror/settings.py b/mirror/settings.py index 654aa82..9c748ab 100644 --- a/mirror/settings.py +++ b/mirror/settings.py @@ -5,7 +5,7 @@ GITHUB_API_URL = "https://api.github.com" REMAINING_RATELIMIT_HEADER = "X-RateLimit-Remaining" RESET_RATELIMIT_HEADER = "X-RateLimit-Reset" -GITHUB_API_REQUEST_TIMEOUT = 2 +GITHUB_API_REQUEST_TIMEOUT = 10 GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") CLONE_DIR = os.environ.get("CLONE_DIR") diff --git a/setup.py b/setup.py index 4fe1256..9c8ede5 100644 --- a/setup.py +++ b/setup.py @@ -45,8 +45,11 @@ "requests", "tqdm", ], - extras_require={ - "dev": ["black", "mypy", "jupyter"] + extras_require={"dev": ["black", "mypy", "jupyter"]}, + entry_points={ + "console_scripts": [ + "{0} = {0}.cli:cli".format(MODULE_NAME), + "{0}-cli = {0}.cli_argp:main".format(MODULE_NAME) + ] }, - entry_points={"console_scripts": ["{0} = {0}.cli:cli".format(MODULE_NAME)]}, )