@@ -12,7 +12,9 @@ All three readers will require a personal access token (which you can generate u
1212
1313## Repository Reader
1414
15- This reader will read through a repo, with options to specifically filter directories and file extensions.
15+ This reader will read through a repo, with options to filter by directory, file extension, and file path, and to apply custom processing logic to decide which files are loaded.
16+
17+ ### Basic Usage
1618
1719``` python
1820from llama_index.readers.github import GithubRepositoryReader, GithubClient
@@ -47,6 +49,187 @@ reader = GithubRepositoryReader(
4749documents = reader.load_data(branch = " main" )
4850```
4951
52+ ### Advanced Filtering Options
53+
54+ #### Filter Specific File Paths
55+
56+ ``` python
57+ # Include only specific files
58+ reader = GithubRepositoryReader(
59+ github_client = github_client,
60+ owner = " run-llama" ,
61+ repo = " llama_index" ,
62+ filter_file_paths = (
63+ [" README.md" , " src/main.py" , " docs/guide.md" ],
64+ GithubRepositoryReader.FilterType.INCLUDE ,
65+ ),
66+ )
67+
68+ # Exclude specific files
69+ reader = GithubRepositoryReader(
70+ github_client = github_client,
71+ owner = " run-llama" ,
72+ repo = " llama_index" ,
73+ filter_file_paths = (
74+ [" tests/test_file.py" , " temp/cache.txt" ],
75+ GithubRepositoryReader.FilterType.EXCLUDE ,
76+ ),
77+ )
78+ ```
79+
80+ #### Custom File Processing Callback
81+
82+ ``` python
83+ def process_file_callback (file_path : str , file_size : int ) -> tuple[bool , str ]:
84+ """ Custom logic to determine if a file should be processed.
85+
86+ Args:
87+ file_path: The full path to the file
88+ file_size: The size of the file in bytes
89+
90+ Returns:
91+ Tuple of (should_process: bool, reason: str)
92+ """
93+ # Skip large files
94+ if file_size > 1024 * 1024 : # 1MB
95+ return False , f " File too large: { file_size} bytes "
96+
97+ # Skip test files
98+ if " test" in file_path.lower():
99+ return False , " Skipping test files"
100+
101+ # Skip binary files by extension
102+ binary_extensions = [" .exe" , " .bin" , " .so" , " .dylib" ]
103+ if any (file_path.endswith(ext) for ext in binary_extensions):
104+ return False , " Skipping binary files"
105+
106+ return True , " "
107+
108+
109+ reader = GithubRepositoryReader(
110+ github_client = github_client,
111+ owner = " run-llama" ,
112+ repo = " llama_index" ,
113+ process_file_callback = process_file_callback,
114+ fail_on_error = False , # Continue processing if callback fails
115+ )
116+ ```
117+
118+ #### Custom Parsers and Custom Folder for Temporary Files
119+
120+ ``` python
121+ from llama_index.core.readers.base import BaseReader
122+
123+
124+ # Custom parser for specific file types
125+ class CustomMarkdownParser (BaseReader ):
126+ def load_data (self , file_path , extra_info = None ):
127+ # Custom parsing logic here
128+ pass
129+
130+
131+ reader = GithubRepositoryReader(
132+ github_client = github_client,
133+ owner = " run-llama" ,
134+ repo = " llama_index" ,
135+ use_parser = True ,
136+ custom_parsers = {" .md" : CustomMarkdownParser()},
137+ custom_folder = " /tmp/github_processing" , # Custom temp directory
138+ )
139+ ```
140+
141+ ### Event System Integration
142+
143+ The reader integrates with LlamaIndex's instrumentation system to provide detailed events during processing:
144+
145+ ``` python
146+ from llama_index.core.instrumentation import get_dispatcher
147+ from llama_index.core.instrumentation.event_handlers import BaseEventHandler
148+ from llama_index.readers.github.repository.event import (
149+ GitHubFileProcessedEvent,
150+ GitHubFileSkippedEvent,
151+ GitHubFileFailedEvent,
152+ GitHubRepositoryProcessingStartedEvent,
153+ GitHubRepositoryProcessingCompletedEvent,
154+ )
155+
156+
157+ class GitHubEventHandler (BaseEventHandler ):
158+ def handle (self , event ):
159+ if isinstance (event, GitHubRepositoryProcessingStartedEvent):
160+ print (f " Started processing repository: { event.repository_name} " )
161+ elif isinstance (event, GitHubFileProcessedEvent):
162+ print (
163+ f " Processed file: { event.file_path} ( { event.file_size} bytes) "
164+ )
165+ elif isinstance (event, GitHubFileSkippedEvent):
166+ print (f " Skipped file: { event.file_path} - { event.reason} " )
167+ elif isinstance (event, GitHubFileFailedEvent):
168+ print (f " Failed to process file: { event.file_path} - { event.error} " )
169+ elif isinstance (event, GitHubRepositoryProcessingCompletedEvent):
170+ print (
171+ f " Completed processing. Total documents: { event.total_documents} "
172+ )
173+
174+
175+ # Register the event handler
176+ dispatcher = get_dispatcher()
177+ handler = GitHubEventHandler()
178+ dispatcher.add_event_handler(handler)
179+
180+ # Use the reader - events will be automatically dispatched
181+ reader = GithubRepositoryReader(
182+ github_client = github_client,
183+ owner = " run-llama" ,
184+ repo = " llama_index" ,
185+ )
186+ documents = reader.load_data(branch = " main" )
187+ ```
188+
189+ #### Available Events
190+
191+ The following events are dispatched during repository processing:
192+
193+ - **`GitHubRepositoryProcessingStartedEvent`**: Fired when repository processing begins
194+
195+ - ` repository_name ` : Name of the repository (owner/repo)
196+ - ` branch_or_commit ` : Branch name or commit SHA being processed
197+
198+ - **`GitHubRepositoryProcessingCompletedEvent`**: Fired when repository processing completes
199+
200+ - ` repository_name ` : Name of the repository
201+ - ` branch_or_commit ` : Branch name or commit SHA
202+ - ` total_documents ` : Number of documents created
203+
204+ - **`GitHubTotalFilesToProcessEvent`**: Fired with the total count of files to be processed
205+
206+ - ` repository_name ` : Name of the repository
207+ - ` branch_or_commit ` : Branch name or commit SHA
208+ - ` total_files ` : Total number of files found
209+
210+ - **`GitHubFileProcessingStartedEvent`**: Fired when individual file processing starts
211+
212+ - ` file_path ` : Path to the file being processed
213+ - ` file_type ` : File extension
214+
215+ - **`GitHubFileProcessedEvent`**: Fired when a file is successfully processed
216+
217+ - ` file_path ` : Path to the processed file
218+ - ` file_type ` : File extension
219+ - ` file_size ` : Size of the file in bytes
220+ - ` document ` : The created Document object
221+
222+ - **`GitHubFileSkippedEvent`**: Fired when a file is skipped
223+
224+ - ` file_path ` : Path to the skipped file
225+ - ` file_type ` : File extension
226+ - ` reason ` : Reason why the file was skipped
227+
228+ - **`GitHubFileFailedEvent`**: Fired when file processing fails
229+ - ` file_path ` : Path to the failed file
230+ - ` file_type ` : File extension
231+ - ` error ` : Error message describing the failure
232+
50233## Issues Reader
51234
52235``` python
0 commit comments