Skip to content

Commit 3ce8e7e

Browse files
fix: handling of branch names with slashes (#131)
* Fix handling of branch names that contain nested slashes * Add fallback logic if the git command fails
1 parent 0e74d67 commit 3ce8e7e

File tree

3 files changed

+139
-20
lines changed

3 files changed

+139
-20
lines changed

src/gitingest/query_parser.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
import re
55
import string
66
import uuid
7+
import warnings
78
from pathlib import Path
89
from typing import Any
910
from urllib.parse import unquote, urlparse
1011

1112
from config import TMP_BASE_PATH
1213
from gitingest.exceptions import InvalidPatternError
1314
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
14-
from gitingest.repository_clone import _check_repo_exists
15+
from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list
1516

1617
HEX_DIGITS: set[str] = set(string.hexdigits)
1718

@@ -169,19 +170,48 @@ async def _parse_repo_source(source: str) -> dict[str, Any]:
169170
parsed["type"] = possible_type
170171

171172
# Commit or branch
172-
commit_or_branch = remaining_parts.pop(0)
173+
commit_or_branch = remaining_parts[0]
173174
if _is_valid_git_commit_hash(commit_or_branch):
174175
parsed["commit"] = commit_or_branch
176+
parsed["subpath"] += "/".join(remaining_parts[1:])
175177
else:
176-
parsed["branch"] = commit_or_branch
177-
178-
# Subpath if anything left
179-
if remaining_parts:
178+
parsed["branch"] = await _configure_branch_and_subpath(remaining_parts, url)
180179
parsed["subpath"] += "/".join(remaining_parts)
181-
182180
return parsed
183181

184182

183+
async def _configure_branch_and_subpath(remaining_parts: list[str], url: str) -> str | None:
    """
    Resolve the branch name from the leading URL path segments.

    Branch names may themselves contain slashes, so progressively longer prefixes of
    ``remaining_parts`` are joined with "/" and compared against the branches reported
    by the remote. The segments consumed while searching are removed from
    ``remaining_parts`` in place, so whatever is left afterwards is the subpath.

    Parameters
    ----------
    remaining_parts : list[str]
        The URL path segments that follow the "tree"/"blob" component. Mutated in place.
    url : str
        The URL of the repository whose branches are listed.

    Returns
    -------
    str | None
        The matching branch name. If the branch list cannot be fetched, the first
        segment is returned as a best-effort guess. If no prefix matches a known
        branch, every segment is consumed and None is returned.
    """
    try:
        known_branches: list[str] = await fetch_remote_branch_list(url)
    except RuntimeError as e:
        # Without the remote branch list, fall back to treating the first segment as the branch.
        warnings.warn(f"Warning: Failed to fetch branch list: {str(e)}")
        return remaining_parts.pop(0)

    for depth in range(1, len(remaining_parts) + 1):
        candidate = "/".join(remaining_parts[:depth])
        if candidate in known_branches:
            # Drop the segments that form the branch name; the rest is the subpath.
            del remaining_parts[:depth]
            return candidate

    # No prefix matched a known branch: all segments have been consumed by the search.
    remaining_parts.clear()
    return None
213+
214+
185215
def _is_valid_git_commit_hash(commit: str) -> bool:
186216
"""
187217
Validate if the provided string is a valid Git commit hash.

src/gitingest/repository_clone.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from gitingest.utils import async_timeout
77

8-
CLONE_TIMEOUT: int = 20
8+
TIMEOUT: int = 20
99

1010

1111
@dataclass
@@ -34,7 +34,7 @@ class CloneConfig:
3434
branch: str | None = None
3535

3636

37-
@async_timeout(CLONE_TIMEOUT)
37+
@async_timeout(TIMEOUT)
3838
async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]:
3939
"""
4040
Clone a repository to a local path based on the provided configuration.
@@ -141,6 +141,30 @@ async def _check_repo_exists(url: str) -> bool:
141141
raise RuntimeError(f"Unexpected status code: {status_code}")
142142

143143

144+
@async_timeout(TIMEOUT)
async def fetch_remote_branch_list(url: str) -> list[str]:
    """
    List the branch names advertised by a remote Git repository.

    Runs ``git ls-remote --heads <url>`` and extracts, from every output line that
    contains ``refs/heads/``, the text following the first occurrence of that prefix.

    Parameters
    ----------
    url : str
        The URL of the Git repository to query.

    Returns
    -------
    list[str]
        The branch names, in the order they appear in the command output.
    """
    ref_prefix = "refs/heads/"
    raw_output, _ = await _run_git_command("git", "ls-remote", "--heads", url)

    branch_names: list[str] = []
    for entry in raw_output.decode().splitlines():
        # An empty separator means the line carries no branch ref (this also covers blank lines).
        _, separator, name = entry.partition(ref_prefix)
        if separator:
            branch_names.append(name)

    return branch_names
166+
167+
144168
async def _run_git_command(*args: str) -> tuple[bytes, bytes]:
145169
"""
146170
Execute a Git command asynchronously and captures its output.

tests/query_parser/test_query_parser.py

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
""" Tests for the query_parser module. """
22

33
from pathlib import Path
4+
from unittest.mock import AsyncMock, patch
45

56
import pytest
67

@@ -109,11 +110,17 @@ async def test_parse_url_with_subpaths() -> None:
109110
Verifies that user name, repository name, branch, and subpath are correctly extracted.
110111
"""
111112
url = "https://github.com/user/repo/tree/main/subdir/file"
112-
result = await _parse_repo_source(url)
113-
assert result["user_name"] == "user"
114-
assert result["repo_name"] == "repo"
115-
assert result["branch"] == "main"
116-
assert result["subpath"] == "/subdir/file"
113+
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
114+
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
115+
with patch(
116+
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
117+
) as mock_fetch_branches:
118+
mock_fetch_branches.return_value = ["main", "dev", "feature-branch"]
119+
result = await _parse_repo_source(url)
120+
assert result["user_name"] == "user"
121+
assert result["repo_name"] == "repo"
122+
assert result["branch"] == "main"
123+
assert result["subpath"] == "/subdir/file"
117124

118125

119126
async def test_parse_url_invalid_repo_structure() -> None:
@@ -228,14 +235,20 @@ async def test_parse_url_branch_and_commit_distinction() -> None:
228235
url_branch = "https://github.com/user/repo/tree/main"
229236
url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234"
230237

231-
result_branch = await _parse_repo_source(url_branch)
232-
result_commit = await _parse_repo_source(url_commit)
238+
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
239+
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
240+
with patch(
241+
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
242+
) as mock_fetch_branches:
243+
mock_fetch_branches.return_value = ["main", "dev", "feature-branch"]
233244

234-
assert result_branch["branch"] == "main"
235-
assert result_branch["commit"] is None
245+
result_branch = await _parse_repo_source(url_branch)
246+
result_commit = await _parse_repo_source(url_commit)
247+
assert result_branch["branch"] == "main"
248+
assert result_branch["commit"] is None
236249

237-
assert result_commit["branch"] is None
238-
assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234"
250+
assert result_commit["branch"] is None
251+
assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234"
239252

240253

241254
async def test_parse_query_uuid_uniqueness() -> None:
@@ -280,3 +293,55 @@ async def test_parse_query_with_branch() -> None:
280293
assert result["branch"] == "2.2.x"
281294
assert result["commit"] is None
282295
assert result["type"] == "blob"
296+
297+
298+
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "url, expected_branch, expected_subpath",
    [
        ("https://github.com/user/repo/tree/main/src", "main", "/src"),
        ("https://github.com/user/repo/tree/fix1", "fix1", "/"),
        ("https://github.com/user/repo/tree/nonexistent-branch/src", "nonexistent-branch", "/src"),
    ],
)
async def test_parse_repo_source_with_failed_git_command(url, expected_branch, expected_subpath):
    """
    Test `_parse_repo_source` when the branch list cannot be fetched.

    Verifies that the parser warns and falls back to using the first path component
    as the branch, leaving the remaining components as the subpath.
    """
    # Patch the name where it is looked up: `query_parser` binds `fetch_remote_branch_list`
    # via `from ... import`, so patching `gitingest.repository_clone` would have no effect.
    with patch("gitingest.query_parser.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches:
        # The fallback path only handles RuntimeError; a bare Exception would propagate.
        mock_fetch_branches.side_effect = RuntimeError("Failed to fetch branch list")

        with pytest.warns(UserWarning, match="Failed to fetch branch list"):
            result = await _parse_repo_source(url)

        assert result["branch"] == expected_branch
        assert result["subpath"] == expected_subpath
319+
320+
321+
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "url, expected_branch, expected_subpath",
    [
        ("https://github.com/user/repo/tree/feature/fix1/src", "feature/fix1", "/src"),
        ("https://github.com/user/repo/tree/main/src", "main", "/src"),
        ("https://github.com/user/repo", None, "/"),  # No branch in the URL
        ("https://github.com/user/repo/tree/nonexistent-branch/src", None, "/"),  # Non-existent branch
        ("https://github.com/user/repo/tree/fix", "fix", "/"),
        ("https://github.com/user/repo/blob/fix/page.html", "fix", "/page.html"),
    ],
)
async def test_parse_repo_source_with_various_url_patterns(url, expected_branch, expected_subpath):
    """
    Test `_parse_repo_source` against a known list of remote branches.

    Covers branch names containing slashes, URLs without a branch, branches that do not
    exist on the remote, and both "tree" and "blob" URL types.
    """
    # Patch the name where it is looked up: `query_parser` binds `fetch_remote_branch_list`
    # via `from ... import`, so patching `gitingest.repository_clone` would have no effect.
    with patch("gitingest.query_parser.fetch_remote_branch_list", new_callable=AsyncMock) as mock_fetch_branches:
        # "fix" must be listed here because two of the cases above expect it to resolve.
        mock_fetch_branches.return_value = ["feature/fix1", "main", "feature-branch", "fix"]

        result = await _parse_repo_source(url)

        assert result["branch"] == expected_branch
        assert result["subpath"] == expected_subpath

0 commit comments

Comments
 (0)