|
1 | 1 | import os
|
2 | 2 | import fnmatch
|
| 3 | +import pathspec |
3 | 4 |
|
def _matches_any(path, patterns):
    """Return True if *path* matches at least one fnmatch-style pattern."""
    return any(fnmatch.fnmatch(path, pattern) for pattern in patterns)


def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Crawl files in a local directory with similar interface as crawl_github_files.

    Files and directories matched by a ``.gitignore`` located directly in
    ``directory`` are skipped, as are those matched by ``exclude_patterns``.
    Unreadable files (I/O errors, non-UTF-8 content) are reported with a
    warning and skipped rather than aborting the crawl.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
        max_file_size (int): Maximum file size in bytes
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    # --- Load .gitignore ---
    gitignore_path = os.path.join(directory, '.gitignore')
    gitignore_spec = None
    if os.path.exists(gitignore_path):
        # Best-effort: a broken .gitignore must not prevent the crawl.
        try:
            with open(gitignore_path, 'r', encoding='utf-8') as f:
                gitignore_patterns = f.readlines()
            gitignore_spec = pathspec.PathSpec.from_lines('gitwildmatch', gitignore_patterns)
            print(f"Loaded .gitignore patterns from {gitignore_path}")
        except Exception as e:
            print(f"Warning: Could not read or parse .gitignore file {gitignore_path}: {e}")
    # --- End Load .gitignore ---

    for root, dirs, files in os.walk(directory):
        # Prune excluded directories early so os.walk never descends into them.
        kept_dirs = []
        for d in dirs:
            dirpath_rel = os.path.relpath(os.path.join(root, d), directory)

            # Directory-only gitignore patterns (e.g. "build/") only match a
            # path that ends with a slash, so test that form as well.
            if gitignore_spec and (
                gitignore_spec.match_file(dirpath_rel)
                or gitignore_spec.match_file(dirpath_rel + '/')
            ):
                continue

            # Match each pattern against the relative path or the bare name.
            if exclude_patterns and (
                _matches_any(dirpath_rel, exclude_patterns)
                or _matches_any(d, exclude_patterns)
            ):
                continue

            kept_dirs.append(d)

        # Slice assignment mutates the list os.walk holds, which is what
        # makes it skip the removed directories.
        dirs[:] = kept_dirs

        for filename in files:
            filepath = os.path.join(root, filename)

            # .gitignore patterns are relative to `directory`, so they are
            # always matched against the in-tree path, even when the caller
            # asked for full paths as result keys.
            path_in_tree = os.path.relpath(filepath, directory)
            relpath = path_in_tree if use_relative_paths else filepath

            if gitignore_spec and gitignore_spec.match_file(path_in_tree):
                continue

            if exclude_patterns and _matches_any(relpath, exclude_patterns):
                continue

            # Without include patterns, everything not excluded is included.
            if include_patterns and not _matches_any(relpath, include_patterns):
                continue

            # Check file size; a dangling symlink or a file removed during
            # the walk is skipped with a warning instead of crashing.
            if max_file_size:
                try:
                    if os.path.getsize(filepath) > max_file_size:
                        continue
                except OSError as e:
                    print(f"Warning: Could not read file {filepath}: {e}")
                    continue

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    files_dict[relpath] = f.read()
            except (OSError, UnicodeError) as e:
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
|
66 | 112 |
|
67 | 113 | if __name__ == "__main__":
|
|
0 commit comments