Skip to content

Commit a5991a6

Browse files
authored
Merge pull request #67 from robertguss/feat-ignore-gitignore-by-default
Feat: Add contents of `.gitignore` to ignore files by default
2 parents 7478f1d + 0a75de0 commit a5991a6

File tree

2 files changed

+67
-20
lines changed

2 files changed

+67
-20
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ gitpython>=3.1.0
55
google-cloud-aiplatform>=1.25.0
66
google-genai>=1.9.0
77
python-dotenv>=1.0.0
8+
pathspec>=0.11.0

utils/crawl_local_files.py

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,113 @@
11
import os
22
import fnmatch
3+
import pathspec
34

45
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    If ``directory`` contains a top-level ``.gitignore``, its patterns are
    applied in addition to ``exclude_patterns``. Ignored or excluded
    directories are pruned from the walk so their contents are never visited.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"}).
            When empty or None, every non-excluded file is included.
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes; larger files are skipped
        use_relative_paths (bool): Whether to use paths relative to directory
            as the keys of the result (and as the text matched by the
            include/exclude patterns)

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    # --- Load .gitignore ---
    gitignore_path = os.path.join(directory, '.gitignore')
    gitignore_spec = None
    if os.path.exists(gitignore_path):
        # Best effort: an unreadable or unparsable .gitignore only produces a
        # warning and the crawl continues without it.
        try:
            with open(gitignore_path, 'r', encoding='utf-8') as f:
                gitignore_patterns = f.readlines()
            gitignore_spec = pathspec.PathSpec.from_lines('gitwildmatch', gitignore_patterns)
            print(f"Loaded .gitignore patterns from {gitignore_path}")
        except Exception as e:
            print(f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}")
    # --- End Load .gitignore ---

    for root, dirs, files in os.walk(directory):
        # Prune excluded directories. os.walk only honours the pruning when
        # the *same* list object is modified, hence the slice assignment below.
        kept_dirs = []
        for d in dirs:
            dirpath_rel = os.path.relpath(os.path.join(root, d), directory)

            # Directory-only patterns such as "build/" match only when the
            # candidate path ends with a slash, so test both spellings.
            if gitignore_spec and (
                gitignore_spec.match_file(dirpath_rel)
                or gitignore_spec.match_file(dirpath_rel + "/")
            ):
                continue

            # Match pattern against full relative path or directory name itself
            if exclude_patterns and any(
                fnmatch.fnmatch(dirpath_rel, pattern) or fnmatch.fnmatch(d, pattern)
                for pattern in exclude_patterns
            ):
                continue

            kept_dirs.append(d)
        dirs[:] = kept_dirs

        # Now process files in the non-excluded directories
        for filename in files:
            filepath = os.path.join(root, filename)

            # .gitignore patterns are relative to the directory that holds the
            # .gitignore, so this path is needed even when the caller asked
            # for non-relative result keys.
            rel_to_root = os.path.relpath(filepath, directory)
            relpath = rel_to_root if use_relative_paths else filepath

            # 1. Check .gitignore first
            if gitignore_spec and gitignore_spec.match_file(rel_to_root):
                continue

            # 2. Check standard exclude_patterns
            if exclude_patterns and any(
                fnmatch.fnmatch(relpath, pattern) for pattern in exclude_patterns
            ):
                continue

            # 3. With include patterns given, the file must match at least one
            if include_patterns and not any(
                fnmatch.fnmatch(relpath, pattern) for pattern in include_patterns
            ):
                continue

            try:
                # The size check lives inside the try so that a dangling
                # symlink or a file removed mid-walk is reported and skipped
                # instead of aborting the whole crawl.
                if max_file_size and os.path.getsize(filepath) > max_file_size:
                    continue

                with open(filepath, 'r', encoding='utf-8') as f:
                    files_dict[relpath] = f.read()
            except (OSError, ValueError) as e:
                # ValueError covers UnicodeDecodeError raised for non-UTF-8 files.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
66112

67113
if __name__ == "__main__":

0 commit comments

Comments
 (0)