1
- import os
2
- from pathlib import Path
3
-
4
1
import pytest
5
2
from fastapi import FastAPI
6
3
from fastapi .staticfiles import StaticFiles
11
8
configure_routes ,
12
9
configure_search_types ,
13
10
)
11
+ from khoj .database .adapters import get_default_search_model
14
12
from khoj .database .models import (
15
13
Agent ,
16
14
ChatModel ,
19
17
GithubRepoConfig ,
20
18
KhojApiUser ,
21
19
KhojUser ,
22
- LocalMarkdownConfig ,
23
- LocalOrgConfig ,
24
- LocalPdfConfig ,
25
- LocalPlaintextConfig ,
26
20
)
27
21
from khoj .processor .content .org_mode .org_to_entries import OrgToEntries
28
22
from khoj .processor .content .plaintext .plaintext_to_entries import PlaintextToEntries
29
23
from khoj .processor .embeddings import CrossEncoderModel , EmbeddingsModel
30
24
from khoj .routers .api_content import configure_content
31
25
from khoj .search_type import text_search
32
- from khoj .utils import fs_syncer , state
33
- from khoj .utils .config import SearchModels
26
+ from khoj .utils import state
34
27
from khoj .utils .constants import web_directory
35
- from khoj .utils .helpers import resolve_absolute_path
36
- from khoj .utils .rawconfig import ContentConfig , SearchConfig
37
28
from tests .helpers import (
38
29
AiModelApiFactory ,
39
30
ChatModelFactory ,
43
34
UserFactory ,
44
35
get_chat_api_key ,
45
36
get_chat_provider ,
37
+ get_index_files ,
38
+ get_sample_data ,
46
39
)
47
40
48
41
@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
59
52
60
53
61
54
@pytest .fixture (scope = "session" )
62
- def search_config () -> SearchConfig :
55
+ def search_config ():
56
+ search_model = get_default_search_model ()
63
57
state .embeddings_model = dict ()
64
- state .embeddings_model ["default" ] = EmbeddingsModel ()
58
+ state .embeddings_model ["default" ] = EmbeddingsModel (
59
+ model_name = search_model .bi_encoder , model_kwargs = search_model .bi_encoder_model_config
60
+ )
65
61
state .cross_encoder_model = dict ()
66
- state .cross_encoder_model ["default" ] = CrossEncoderModel ()
67
-
68
- model_dir = resolve_absolute_path ("~/.khoj/search" )
69
- model_dir .mkdir (parents = True , exist_ok = True )
70
- search_config = SearchConfig ()
71
-
72
- return search_config
62
+ state .cross_encoder_model ["default" ] = CrossEncoderModel (
63
+ model_name = search_model .cross_encoder , model_kwargs = search_model .cross_encoder_model_config
64
+ )
73
65
74
66
75
67
@pytest .mark .django_db
@@ -201,13 +193,6 @@ def openai_agent():
201
193
)
202
194
203
195
204
- @pytest .fixture (scope = "session" )
205
- def search_models (search_config : SearchConfig ):
206
- search_models = SearchModels ()
207
-
208
- return search_models
209
-
210
-
211
196
@pytest .mark .django_db
212
197
@pytest .fixture
213
198
def default_process_lock ():
@@ -219,72 +204,23 @@ def anyio_backend():
219
204
return "asyncio"
220
205
221
206
222
- @pytest .mark .django_db
223
207
@pytest .fixture (scope = "function" )
224
- def content_config (tmp_path_factory , search_models : SearchModels , default_user : KhojUser ):
225
- content_dir = tmp_path_factory .mktemp ("content" )
226
-
227
- # Generate Image Embeddings from Test Images
228
- content_config = ContentConfig ()
229
-
230
- LocalOrgConfig .objects .create (
231
- input_files = None ,
232
- input_filter = ["tests/data/org/*.org" ],
233
- index_heading_entries = False ,
234
- user = default_user ,
235
- )
236
-
237
- text_search .setup (OrgToEntries , get_sample_data ("org" ), regenerate = False , user = default_user )
238
-
239
- if os .getenv ("GITHUB_PAT_TOKEN" ):
240
- GithubConfig .objects .create (
241
- pat_token = os .getenv ("GITHUB_PAT_TOKEN" ),
242
- user = default_user ,
243
- )
244
-
245
- GithubRepoConfig .objects .create (
246
- owner = "khoj-ai" ,
247
- name = "lantern" ,
248
- branch = "master" ,
249
- github_config = GithubConfig .objects .get (user = default_user ),
250
- )
251
-
252
- LocalPlaintextConfig .objects .create (
253
- input_files = None ,
254
- input_filter = ["tests/data/plaintext/*.txt" , "tests/data/plaintext/*.md" , "tests/data/plaintext/*.html" ],
255
- user = default_user ,
256
- )
257
-
258
- return content_config
259
-
260
-
261
- @pytest .fixture (scope = "session" )
262
- def md_content_config ():
263
- markdown_config = LocalMarkdownConfig .objects .create (
264
- input_files = None ,
265
- input_filter = ["tests/data/markdown/*.markdown" ],
266
- )
267
-
268
- return markdown_config
269
-
270
-
271
- @pytest .fixture (scope = "function" )
272
- def chat_client (search_config : SearchConfig , default_user2 : KhojUser ):
208
+ def chat_client (search_config , default_user2 : KhojUser ):
273
209
return chat_client_builder (search_config , default_user2 , require_auth = False )
274
210
275
211
276
212
@pytest .fixture (scope = "function" )
277
- def chat_client_with_auth (search_config : SearchConfig , default_user2 : KhojUser ):
213
+ def chat_client_with_auth (search_config , default_user2 : KhojUser ):
278
214
return chat_client_builder (search_config , default_user2 , require_auth = True )
279
215
280
216
281
217
@pytest .fixture (scope = "function" )
282
- def chat_client_no_background (search_config : SearchConfig , default_user2 : KhojUser ):
218
+ def chat_client_no_background (search_config , default_user2 : KhojUser ):
283
219
return chat_client_builder (search_config , default_user2 , index_content = False , require_auth = False )
284
220
285
221
286
222
@pytest .fixture (scope = "function" )
287
- def chat_client_with_large_kb (search_config : SearchConfig , default_user2 : KhojUser ):
223
+ def chat_client_with_large_kb (search_config , default_user2 : KhojUser ):
288
224
"""
289
225
Chat client fixture that creates a large knowledge base with many files
290
226
for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
298
234
state .SearchType = configure_search_types ()
299
235
300
236
if index_content :
301
- LocalMarkdownConfig .objects .create (
302
- input_files = None ,
303
- input_filter = ["tests/data/markdown/*.markdown" ],
304
- user = user ,
305
- )
237
+ file_type = "markdown"
238
+ files_to_index = {file_type : get_index_files (input_filters = [f"tests/data/{ file_type } /*.{ file_type } " ])}
306
239
307
240
# Index Markdown Content for Search
308
- all_files = fs_syncer .collect_files (user = user )
309
- configure_content (user , all_files )
241
+ configure_content (user , files_to_index )
310
242
311
243
# Initialize Processor from Config
312
244
chat_provider = get_chat_provider ()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
346
278
347
279
# Create temporary directory for large number of test files
348
280
temp_dir = tempfile .mkdtemp (prefix = "khoj_test_large_kb_" )
281
+ file_type = "markdown"
349
282
large_file_list = []
350
283
351
284
try :
352
285
# Generate 300 test files with substantial content
353
286
for i in range (300 ):
354
- file_path = os .path .join (temp_dir , f"test_file_{ i :03d} .markdown " )
287
+ file_path = os .path .join (temp_dir , f"test_file_{ i :03d} .{ file_type } " )
355
288
content = f"""
356
289
# Test File { i }
357
290
@@ -401,16 +334,9 @@ def process(self):
401
334
f .write (content )
402
335
large_file_list .append (file_path )
403
336
404
- # Create LocalMarkdownConfig with all the generated files
405
- LocalMarkdownConfig .objects .create (
406
- input_files = large_file_list ,
407
- input_filter = None ,
408
- user = user ,
409
- )
410
-
411
- # Index all the files into the user's knowledge base
412
- all_files = fs_syncer .collect_files (user = user )
413
- configure_content (user , all_files )
337
+ # Index all generated files into the user's knowledge base
338
+ files_to_index = {file_type : get_index_files (input_files = large_file_list , input_filters = None )}
339
+ configure_content (user , files_to_index )
414
340
415
341
# Verify we have a substantial knowledge base
416
342
file_count = FileObject .objects .filter (user = user , agent = None ).count ()
@@ -493,139 +419,18 @@ def client(
493
419
return TestClient (app )
494
420
495
421
496
- @pytest .fixture (scope = "function" )
497
- def new_org_file (default_user : KhojUser , content_config : ContentConfig ):
498
- # Setup
499
- org_config = LocalOrgConfig .objects .filter (user = default_user ).first ()
500
- input_filters = org_config .input_filter
501
- new_org_file = Path (input_filters [0 ]).parent / "new_file.org"
502
- new_org_file .touch ()
503
-
504
- yield new_org_file
505
-
506
- # Cleanup
507
- if new_org_file .exists ():
508
- new_org_file .unlink ()
509
-
510
-
511
- @pytest .fixture (scope = "function" )
512
- def org_config_with_only_new_file (new_org_file : Path , default_user : KhojUser ):
513
- LocalOrgConfig .objects .update (input_files = [str (new_org_file )], input_filter = None )
514
- return LocalOrgConfig .objects .filter (user = default_user ).first ()
515
-
516
-
517
422
@pytest .fixture (scope = "function" )
518
423
def pdf_configured_user1 (default_user : KhojUser ):
519
- LocalPdfConfig . objects . create (
520
- input_files = None ,
521
- input_filter = [ "tests/data/pdf/singlepage.pdf" ],
522
- user = default_user ,
523
- )
524
- # Index Markdown Content for Search
525
- all_files = fs_syncer . collect_files ( user = default_user )
526
- configure_content (default_user , all_files )
424
+ # Read data from pdf file at tests/data/pdf/singlepage.pdf
425
+ pdf_file_path = "tests/data/pdf/singlepage.pdf"
426
+ with open ( pdf_file_path , "rb" ) as pdf_file :
427
+ pdf_data = pdf_file . read ()
428
+
429
+ knowledge_base = { "pdf" : { "singlepage.pdf" : pdf_data }}
430
+ # Index Content for Search
431
+ configure_content (default_user , knowledge_base )
527
432
528
433
529
434
@pytest .fixture (scope = "function" )
530
435
def sample_org_data ():
531
436
return get_sample_data ("org" )
532
-
533
-
534
- def get_sample_data (type ):
535
- sample_data = {
536
- "org" : {
537
- "elisp.org" : """
538
- * Emacs Khoj
539
- /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
540
-
541
- ** Requirements
542
- - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
543
-
544
- ** Installation
545
- *** Direct
546
- - Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
547
- - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
548
- #+begin_src elisp
549
- ;; Khoj Package
550
- (use-package khoj
551
- :load-path "~/.emacs.d/lisp/khoj.el"
552
- :bind ("C-c s" . 'khoj))
553
- #+end_src
554
-
555
- *** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
556
- - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
557
- - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
558
- #+begin_src elisp
559
- ;; Khoj Package
560
- (use-package khoj
561
- :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
562
- :bind ("C-c s" . 'khoj))
563
- #+end_src
564
-
565
- ** Usage
566
- 1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
567
- 2. Enter Query in Natural Language
568
- e.g. "What is the meaning of life?" "What are my life goals?"
569
- 3. Wait for results
570
- *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
571
- 4. (Optional) Narrow down results further
572
- Include/Exclude specific words from results by adding to query
573
- e.g. "What is the meaning of life? -god +none"
574
-
575
- """ ,
576
- "readme.org" : """
577
- * Khoj
578
- /Allow natural language search on user content like notes, images using transformer based models/
579
-
580
- All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
581
-
582
- ** Dependencies
583
- - Python3
584
- - [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
585
-
586
- ** Install
587
- #+begin_src shell
588
- git clone https://github.com/khoj-ai/khoj && cd khoj
589
- conda env create -f environment.yml
590
- conda activate khoj
591
- #+end_src""" ,
592
- },
593
- "markdown" : {
594
- "readme.markdown" : """
595
- # Khoj
596
- Allow natural language search on user content like notes, images using transformer based models
597
-
598
- All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
599
-
600
- ## Dependencies
601
- - Python3
602
- - [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
603
-
604
- ## Install
605
- ```shell
606
- git clone
607
- conda env create -f environment.yml
608
- conda activate khoj
609
- ```
610
- """
611
- },
612
- "plaintext" : {
613
- "readme.txt" : """
614
- Khoj
615
- Allow natural language search on user content like notes, images using transformer based models
616
-
617
- All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
618
-
619
- Dependencies
620
- - Python3
621
- - Miniconda
622
-
623
- Install
624
- git clone
625
- conda env create -f environment.yml
626
- conda activate khoj
627
- """
628
- },
629
- }
630
-
631
- return sample_data [type ]
0 commit comments