Skip to content

Commit 8ff93e2

Browse files
committed
Update test setup to index test data after old indexing code removed
- Delete tests testing deprecated server side indexing flows - Delete `Local(Plaintext|Org|Markdown|Pdf)Config' methods, files and references in tests - Index test data via new helper method, `get_index_files' - It is modelled after the old `get_org_files' variants in main app - It passes the test data in required format to `configure_content'. Allows maintaining the more realistic tests from before while using new indexing mechanism (rather than the deprecated server side indexing mechanism).
1 parent 0af54d2 commit 8ff93e2

12 files changed

+294
-603
lines changed

tests/conftest.py

Lines changed: 32 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
import os
2-
from pathlib import Path
3-
41
import pytest
52
from fastapi import FastAPI
63
from fastapi.staticfiles import StaticFiles
@@ -11,6 +8,7 @@
118
configure_routes,
129
configure_search_types,
1310
)
11+
from khoj.database.adapters import get_default_search_model
1412
from khoj.database.models import (
1513
Agent,
1614
ChatModel,
@@ -19,21 +17,14 @@
1917
GithubRepoConfig,
2018
KhojApiUser,
2119
KhojUser,
22-
LocalMarkdownConfig,
23-
LocalOrgConfig,
24-
LocalPdfConfig,
25-
LocalPlaintextConfig,
2620
)
2721
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
2822
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
2923
from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
3024
from khoj.routers.api_content import configure_content
3125
from khoj.search_type import text_search
32-
from khoj.utils import fs_syncer, state
33-
from khoj.utils.config import SearchModels
26+
from khoj.utils import state
3427
from khoj.utils.constants import web_directory
35-
from khoj.utils.helpers import resolve_absolute_path
36-
from khoj.utils.rawconfig import ContentConfig, SearchConfig
3728
from tests.helpers import (
3829
AiModelApiFactory,
3930
ChatModelFactory,
@@ -43,6 +34,8 @@
4334
UserFactory,
4435
get_chat_api_key,
4536
get_chat_provider,
37+
get_index_files,
38+
get_sample_data,
4639
)
4740

4841

@@ -59,17 +52,16 @@ def django_db_setup(django_db_setup, django_db_blocker):
5952

6053

6154
@pytest.fixture(scope="session")
62-
def search_config() -> SearchConfig:
55+
def search_config():
56+
search_model = get_default_search_model()
6357
state.embeddings_model = dict()
64-
state.embeddings_model["default"] = EmbeddingsModel()
58+
state.embeddings_model["default"] = EmbeddingsModel(
59+
model_name=search_model.bi_encoder, model_kwargs=search_model.bi_encoder_model_config
60+
)
6561
state.cross_encoder_model = dict()
66-
state.cross_encoder_model["default"] = CrossEncoderModel()
67-
68-
model_dir = resolve_absolute_path("~/.khoj/search")
69-
model_dir.mkdir(parents=True, exist_ok=True)
70-
search_config = SearchConfig()
71-
72-
return search_config
62+
state.cross_encoder_model["default"] = CrossEncoderModel(
63+
model_name=search_model.cross_encoder, model_kwargs=search_model.cross_encoder_model_config
64+
)
7365

7466

7567
@pytest.mark.django_db
@@ -201,13 +193,6 @@ def openai_agent():
201193
)
202194

203195

204-
@pytest.fixture(scope="session")
205-
def search_models(search_config: SearchConfig):
206-
search_models = SearchModels()
207-
208-
return search_models
209-
210-
211196
@pytest.mark.django_db
212197
@pytest.fixture
213198
def default_process_lock():
@@ -219,72 +204,23 @@ def anyio_backend():
219204
return "asyncio"
220205

221206

222-
@pytest.mark.django_db
223207
@pytest.fixture(scope="function")
224-
def content_config(tmp_path_factory, search_models: SearchModels, default_user: KhojUser):
225-
content_dir = tmp_path_factory.mktemp("content")
226-
227-
# Generate Image Embeddings from Test Images
228-
content_config = ContentConfig()
229-
230-
LocalOrgConfig.objects.create(
231-
input_files=None,
232-
input_filter=["tests/data/org/*.org"],
233-
index_heading_entries=False,
234-
user=default_user,
235-
)
236-
237-
text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
238-
239-
if os.getenv("GITHUB_PAT_TOKEN"):
240-
GithubConfig.objects.create(
241-
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
242-
user=default_user,
243-
)
244-
245-
GithubRepoConfig.objects.create(
246-
owner="khoj-ai",
247-
name="lantern",
248-
branch="master",
249-
github_config=GithubConfig.objects.get(user=default_user),
250-
)
251-
252-
LocalPlaintextConfig.objects.create(
253-
input_files=None,
254-
input_filter=["tests/data/plaintext/*.txt", "tests/data/plaintext/*.md", "tests/data/plaintext/*.html"],
255-
user=default_user,
256-
)
257-
258-
return content_config
259-
260-
261-
@pytest.fixture(scope="session")
262-
def md_content_config():
263-
markdown_config = LocalMarkdownConfig.objects.create(
264-
input_files=None,
265-
input_filter=["tests/data/markdown/*.markdown"],
266-
)
267-
268-
return markdown_config
269-
270-
271-
@pytest.fixture(scope="function")
272-
def chat_client(search_config: SearchConfig, default_user2: KhojUser):
208+
def chat_client(search_config, default_user2: KhojUser):
273209
return chat_client_builder(search_config, default_user2, require_auth=False)
274210

275211

276212
@pytest.fixture(scope="function")
277-
def chat_client_with_auth(search_config: SearchConfig, default_user2: KhojUser):
213+
def chat_client_with_auth(search_config, default_user2: KhojUser):
278214
return chat_client_builder(search_config, default_user2, require_auth=True)
279215

280216

281217
@pytest.fixture(scope="function")
282-
def chat_client_no_background(search_config: SearchConfig, default_user2: KhojUser):
218+
def chat_client_no_background(search_config, default_user2: KhojUser):
283219
return chat_client_builder(search_config, default_user2, index_content=False, require_auth=False)
284220

285221

286222
@pytest.fixture(scope="function")
287-
def chat_client_with_large_kb(search_config: SearchConfig, default_user2: KhojUser):
223+
def chat_client_with_large_kb(search_config, default_user2: KhojUser):
288224
"""
289225
Chat client fixture that creates a large knowledge base with many files
290226
for stress testing atomic agent updates.
@@ -298,15 +234,11 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
298234
state.SearchType = configure_search_types()
299235

300236
if index_content:
301-
LocalMarkdownConfig.objects.create(
302-
input_files=None,
303-
input_filter=["tests/data/markdown/*.markdown"],
304-
user=user,
305-
)
237+
file_type = "markdown"
238+
files_to_index = {file_type: get_index_files(input_filters=[f"tests/data/{file_type}/*.{file_type}"])}
306239

307240
# Index Markdown Content for Search
308-
all_files = fs_syncer.collect_files(user=user)
309-
configure_content(user, all_files)
241+
configure_content(user, files_to_index)
310242

311243
# Initialize Processor from Config
312244
chat_provider = get_chat_provider()
@@ -346,12 +278,13 @@ def large_kb_chat_client_builder(search_config, user):
346278

347279
# Create temporary directory for large number of test files
348280
temp_dir = tempfile.mkdtemp(prefix="khoj_test_large_kb_")
281+
file_type = "markdown"
349282
large_file_list = []
350283

351284
try:
352285
# Generate 200 test files with substantial content
353286
for i in range(300):
354-
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.markdown")
287+
file_path = os.path.join(temp_dir, f"test_file_{i:03d}.{file_type}")
355288
content = f"""
356289
# Test File {i}
357290
@@ -401,16 +334,9 @@ def process(self):
401334
f.write(content)
402335
large_file_list.append(file_path)
403336

404-
# Create LocalMarkdownConfig with all the generated files
405-
LocalMarkdownConfig.objects.create(
406-
input_files=large_file_list,
407-
input_filter=None,
408-
user=user,
409-
)
410-
411-
# Index all the files into the user's knowledge base
412-
all_files = fs_syncer.collect_files(user=user)
413-
configure_content(user, all_files)
337+
# Index all generated files into the user's knowledge base
338+
files_to_index = {file_type: get_index_files(input_files=large_file_list, input_filters=None)}
339+
configure_content(user, files_to_index)
414340

415341
# Verify we have a substantial knowledge base
416342
file_count = FileObject.objects.filter(user=user, agent=None).count()
@@ -493,139 +419,18 @@ def client(
493419
return TestClient(app)
494420

495421

496-
@pytest.fixture(scope="function")
497-
def new_org_file(default_user: KhojUser, content_config: ContentConfig):
498-
# Setup
499-
org_config = LocalOrgConfig.objects.filter(user=default_user).first()
500-
input_filters = org_config.input_filter
501-
new_org_file = Path(input_filters[0]).parent / "new_file.org"
502-
new_org_file.touch()
503-
504-
yield new_org_file
505-
506-
# Cleanup
507-
if new_org_file.exists():
508-
new_org_file.unlink()
509-
510-
511-
@pytest.fixture(scope="function")
512-
def org_config_with_only_new_file(new_org_file: Path, default_user: KhojUser):
513-
LocalOrgConfig.objects.update(input_files=[str(new_org_file)], input_filter=None)
514-
return LocalOrgConfig.objects.filter(user=default_user).first()
515-
516-
517422
@pytest.fixture(scope="function")
518423
def pdf_configured_user1(default_user: KhojUser):
519-
LocalPdfConfig.objects.create(
520-
input_files=None,
521-
input_filter=["tests/data/pdf/singlepage.pdf"],
522-
user=default_user,
523-
)
524-
# Index Markdown Content for Search
525-
all_files = fs_syncer.collect_files(user=default_user)
526-
configure_content(default_user, all_files)
424+
# Read data from pdf file at tests/data/pdf/singlepage.pdf
425+
pdf_file_path = "tests/data/pdf/singlepage.pdf"
426+
with open(pdf_file_path, "rb") as pdf_file:
427+
pdf_data = pdf_file.read()
428+
429+
knowledge_base = {"pdf": {"singlepage.pdf": pdf_data}}
430+
# Index Content for Search
431+
configure_content(default_user, knowledge_base)
527432

528433

529434
@pytest.fixture(scope="function")
530435
def sample_org_data():
531436
return get_sample_data("org")
532-
533-
534-
def get_sample_data(type):
535-
sample_data = {
536-
"org": {
537-
"elisp.org": """
538-
* Emacs Khoj
539-
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
540-
541-
** Requirements
542-
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
543-
544-
** Installation
545-
*** Direct
546-
- Put ~khoj.el~ in your Emacs load path. For e.g. ~/.emacs.d/lisp
547-
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
548-
#+begin_src elisp
549-
;; Khoj Package
550-
(use-package khoj
551-
:load-path "~/.emacs.d/lisp/khoj.el"
552-
:bind ("C-c s" . 'khoj))
553-
#+end_src
554-
555-
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
556-
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
557-
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
558-
#+begin_src elisp
559-
;; Khoj Package
560-
(use-package khoj
561-
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
562-
:bind ("C-c s" . 'khoj))
563-
#+end_src
564-
565-
** Usage
566-
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
567-
2. Enter Query in Natural Language
568-
e.g. "What is the meaning of life?" "What are my life goals?"
569-
3. Wait for results
570-
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
571-
4. (Optional) Narrow down results further
572-
Include/Exclude specific words from results by adding to query
573-
e.g. "What is the meaning of life? -god +none"
574-
575-
""",
576-
"readme.org": """
577-
* Khoj
578-
/Allow natural language search on user content like notes, images using transformer based models/
579-
580-
All data is processed locally. User can interface with khoj app via [[./interface/emacs/khoj.el][Emacs]], API or Commandline
581-
582-
** Dependencies
583-
- Python3
584-
- [[https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links][Miniconda]]
585-
586-
** Install
587-
#+begin_src shell
588-
git clone https://github.com/khoj-ai/khoj && cd khoj
589-
conda env create -f environment.yml
590-
conda activate khoj
591-
#+end_src""",
592-
},
593-
"markdown": {
594-
"readme.markdown": """
595-
# Khoj
596-
Allow natural language search on user content like notes, images using transformer based models
597-
598-
All data is processed locally. User can interface with khoj app via [Emacs](./interface/emacs/khoj.el), API or Commandline
599-
600-
## Dependencies
601-
- Python3
602-
- [Miniconda](https://docs.conda.io/en/latest/miniconda.html#latest-miniconda-installer-links)
603-
604-
## Install
605-
```shell
606-
git clone
607-
conda env create -f environment.yml
608-
conda activate khoj
609-
```
610-
"""
611-
},
612-
"plaintext": {
613-
"readme.txt": """
614-
Khoj
615-
Allow natural language search on user content like notes, images using transformer based models
616-
617-
All data is processed locally. User can interface with khoj app via Emacs, API or Commandline
618-
619-
Dependencies
620-
- Python3
621-
- Miniconda
622-
623-
Install
624-
git clone
625-
conda env create -f environment.yml
626-
conda activate khoj
627-
"""
628-
},
629-
}
630-
631-
return sample_data[type]

0 commit comments

Comments
 (0)