Skip to content

Commit a1d1ab6

Browse files
authored
Merge pull request #85 from gethari/crawling-progress
#feat: Report progress of the crawling #64
2 parents d0f4304 + 0b2078b commit a1d1ab6

File tree

2 files changed

+68
-63
lines changed

2 files changed

+68
-63
lines changed

nodes.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,12 +61,23 @@ def exec(self, prep_res):
6161
)
6262
else:
6363
print(f"Crawling directory: {prep_res['local_dir']}...")
64+
65+
def progress_callback(processed, total):
66+
percentage = (processed / total) * 100 if total > 0 else 0
67+
rounded_percentage = int(percentage)
68+
if rounded_percentage > progress_callback.last_reported:
69+
progress_callback.last_reported = rounded_percentage
70+
print(f"\033[92mProgress: {processed}/{total} files ({rounded_percentage}%)\033[0m")
71+
72+
progress_callback.last_reported = -1
73+
6474
result = crawl_local_files(
6575
directory=prep_res["local_dir"],
6676
include_patterns=prep_res["include_patterns"],
6777
exclude_patterns=prep_res["exclude_patterns"],
6878
max_file_size=prep_res["max_file_size"],
6979
use_relative_paths=prep_res["use_relative_paths"],
80+
progress_callback=progress_callback
7081
)
7182

7283
# Convert dict to list of tuples: [(path, content), ...]

utils/crawl_local_files.py

Lines changed: 57 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,13 @@
22
import fnmatch
33
import pathspec
44

5-
65
def crawl_local_files(
76
directory,
87
include_patterns=None,
98
exclude_patterns=None,
109
max_file_size=None,
1110
use_relative_paths=True,
11+
progress_callback=None,
1212
):
1313
"""
1414
Crawl files in a local directory with similar interface as crawl_github_files.
@@ -18,6 +18,7 @@ def crawl_local_files(
1818
exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
1919
max_file_size (int): Maximum file size in bytes
2020
use_relative_paths (bool): Whether to use paths relative to directory
21+
progress_callback (callable): Function to report progress, takes (processed, total) as arguments
2122
2223
Returns:
2324
dict: {"files": {filepath: content}}
@@ -34,91 +35,84 @@ def crawl_local_files(
3435
try:
3536
with open(gitignore_path, "r", encoding="utf-8") as f:
3637
gitignore_patterns = f.readlines()
37-
gitignore_spec = pathspec.PathSpec.from_lines(
38-
"gitwildmatch", gitignore_patterns
39-
)
38+
gitignore_spec = pathspec.PathSpec.from_lines("gitwildmatch", gitignore_patterns)
4039
print(f"Loaded .gitignore patterns from {gitignore_path}")
4140
except Exception as e:
42-
print(
43-
f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}"
44-
)
45-
# --- End Load .gitignore ---
41+
print(f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}")
4642

43+
all_files = []
4744
for root, dirs, files in os.walk(directory):
48-
# Filter directories using .gitignore and exclude_patterns early to avoid descending
49-
# Need to process dirs list *in place* for os.walk to respect it
45+
# Filter directories using .gitignore and exclude_patterns early
5046
excluded_dirs = set()
5147
for d in dirs:
5248
dirpath_rel = os.path.relpath(os.path.join(root, d), directory)
5349

54-
# Check against .gitignore (important for directories)
5550
if gitignore_spec and gitignore_spec.match_file(dirpath_rel):
5651
excluded_dirs.add(d)
57-
continue # Skip further checks if gitignored
52+
continue
5853

59-
# Check against standard exclude_patterns
6054
if exclude_patterns:
6155
for pattern in exclude_patterns:
62-
# Match pattern against full relative path or directory name itself
63-
if fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(
64-
d, pattern
65-
):
56+
if fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(d, pattern):
6657
excluded_dirs.add(d)
6758
break
6859

69-
# Modify dirs in-place: remove excluded ones
70-
# Iterate over a copy (.copy()) because we are modifying the list during iteration
7160
for d in dirs.copy():
7261
if d in excluded_dirs:
7362
dirs.remove(d)
7463

75-
# Now process files in the non-excluded directories
7664
for filename in files:
7765
filepath = os.path.join(root, filename)
66+
all_files.append(filepath)
67+
68+
total_files = len(all_files)
69+
processed_files = 0
70+
71+
for filepath in all_files:
72+
relpath = os.path.relpath(filepath, directory) if use_relative_paths else filepath
73+
74+
# --- Exclusion check ---
75+
excluded = False
76+
if gitignore_spec and gitignore_spec.match_file(relpath):
77+
excluded = True
78+
79+
if not excluded and exclude_patterns:
80+
for pattern in exclude_patterns:
81+
if fnmatch.fnmatch(relpath, pattern):
82+
excluded = True
83+
break
84+
85+
included = False
86+
if include_patterns:
87+
for pattern in include_patterns:
88+
if fnmatch.fnmatch(relpath, pattern):
89+
included = True
90+
break
91+
else:
92+
included = True
93+
94+
if not included or excluded:
95+
processed_files += 1
96+
if progress_callback:
97+
progress_callback(processed_files, total_files)
98+
continue
99+
100+
if max_file_size and os.path.getsize(filepath) > max_file_size:
101+
processed_files += 1
102+
if progress_callback:
103+
progress_callback(processed_files, total_files)
104+
continue
78105

79-
# Get path relative to directory if requested
80-
if use_relative_paths:
81-
relpath = os.path.relpath(filepath, directory)
82-
else:
83-
relpath = filepath
84-
85-
# --- Exclusion check ---
86-
excluded = False
87-
# 1. Check .gitignore first
88-
if gitignore_spec and gitignore_spec.match_file(relpath):
89-
excluded = True
90-
91-
# 2. Check standard exclude_patterns if not already excluded by .gitignore
92-
if not excluded and exclude_patterns:
93-
for pattern in exclude_patterns:
94-
if fnmatch.fnmatch(relpath, pattern):
95-
excluded = True
96-
break
97-
98-
included = False
99-
if include_patterns:
100-
for pattern in include_patterns:
101-
if fnmatch.fnmatch(relpath, pattern):
102-
included = True
103-
break
104-
else:
105-
# If no include patterns, include everything *not excluded*
106-
included = True
107-
108-
# Skip if not included or if excluded (by either method)
109-
if not included or excluded:
110-
continue
111-
112-
# Check file size
113-
if max_file_size and os.path.getsize(filepath) > max_file_size:
114-
continue
106+
try:
107+
with open(filepath, "r", encoding="utf-8") as f:
108+
content = f.read()
109+
files_dict[relpath] = content
110+
except Exception as e:
111+
print(f"Warning: Could not read file {filepath}: {e}")
115112

116-
try:
117-
with open(filepath, "r", encoding="utf-8") as f:
118-
content = f.read()
119-
files_dict[relpath] = content
120-
except Exception as e:
121-
print(f"Warning: Could not read file {filepath}: {e}")
113+
processed_files += 1
114+
if progress_callback:
115+
progress_callback(processed_files, total_files)
122116

123117
return {"files": files_dict}
124118

@@ -138,4 +132,4 @@ def crawl_local_files(
138132
)
139133
print(f"Found {len(files_data['files'])} files:")
140134
for path in files_data["files"]:
141-
print(f" {path}")
135+
print(f" {path}")

0 commit comments

Comments
 (0)