From 8116282d6ecefbf61ac3010ca141cad59fc84bdf Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 17 Jul 2025 19:48:21 +0000 Subject: [PATCH 1/9] ci: update requirements and Dockerfile --- Dockerfile | 2 +- dev.Dockerfile | 2 +- gpu.Dockerfile | 2 +- requirements/torch-cpu-requirements.txt | 52 +++++++++++++----------- requirements/torch-cuda-requirements.txt | 52 +++++++++++++----------- 5 files changed, 61 insertions(+), 49 deletions(-) diff --git a/Dockerfile b/Dockerfile index 78d6d39..045935e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:v1.22.0-torch-cpu +FROM kernai/refinery-parent-images:parent-image-updates-torch-cpu WORKDIR /program diff --git a/dev.Dockerfile b/dev.Dockerfile index 6ff2099..ee09b36 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:v1.22.0-torch-cpu +FROM kernai/refinery-parent-images:parent-image-updates-torch-cpu WORKDIR /app diff --git a/gpu.Dockerfile b/gpu.Dockerfile index 1b3c4bf..89ff713 100644 --- a/gpu.Dockerfile +++ b/gpu.Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:v1.22.0-torch-cuda +FROM kernai/refinery-parent-images:parent-image-updates-torch-cuda WORKDIR /program diff --git a/requirements/torch-cpu-requirements.txt b/requirements/torch-cpu-requirements.txt index 3e3a4df..dccae98 100644 --- a/requirements/torch-cpu-requirements.txt +++ b/requirements/torch-cpu-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile torch-cpu-requirements.in @@ -10,22 +10,26 @@ annotated-types==0.7.0 # via pydantic anyio==4.9.0 # via starlette -boto3==1.25.0 +argon2-cffi==25.1.0 + # via minio +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +boto3==1.39.6 # via -r common-requirements.in -botocore==1.28.5 +botocore==1.39.6 # via # boto3 # s3transfer -certifi==2025.6.15 
+certifi==2025.7.14 # via # minio # requests +cffi==1.17.1 + # via argon2-cffi-bindings charset-normalizer==3.4.2 # via requests -click==8.1.8 +click==8.2.1 # via uvicorn -exceptiongroup==1.3.0 - # via anyio fastapi==0.115.2 # via -r mini-requirements.in filelock==3.18.0 @@ -33,7 +37,7 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fsspec==2025.5.1 +fsspec==2025.7.0 # via # huggingface-hub # torch @@ -41,7 +45,7 @@ h11==0.16.0 # via uvicorn hf-xet==1.1.5 # via huggingface-hub -# huggingface-hub==0.33.2 +huggingface-hub==0.33.4 # via # tokenizers # transformers @@ -61,11 +65,11 @@ joblib==1.5.1 # scikit-optimize markupsafe==3.0.2 # via jinja2 -minio==7.1.12 +minio==7.2.15 # via -r common-requirements.in mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.5 # via torch numpy==1.23.4 # via @@ -83,8 +87,12 @@ pandas==1.5.1 # via -r common-requirements.in psycopg2-binary==2.9.9 # via -r common-requirements.in -pyaml==25.5.0 +pyaml==25.7.0 # via scikit-optimize +pycparser==2.22 + # via cffi +pycryptodome==3.23.0 + # via minio pydantic==2.7.4 # via # -r mini-requirements.in @@ -104,16 +112,16 @@ pyyaml==6.0.2 # transformers regex==2024.11.6 # via transformers -requests==2.31.0 +requests==2.32.4 # via # -r mini-requirements.in # huggingface-hub # transformers -s3transfer==0.6.2 +s3transfer==0.13.0 # via boto3 safetensors==0.5.3 # via transformers -scikit-learn==1.1.2 +scikit-learn==1.5.2 # via # -r torch-cpu-requirements.in # scikit-optimize @@ -135,28 +143,26 @@ sympy==1.14.0 # via torch threadpoolctl==3.6.0 # via scikit-learn -# tokenizers==0.21.2 +tokenizers==0.21.2 # via transformers -# torch==2.7.1 +torch==2.7.1 # via -r torch-cpu-requirements.in tqdm==4.67.1 # via # huggingface-hub # transformers -# transformers==4.50.0 +transformers==4.53.2 # via -r torch-cpu-requirements.in -typing-extensions==4.14.0 +typing-extensions==4.14.1 # via # anyio - # exceptiongroup # fastapi # huggingface-hub + # minio # pydantic # pydantic-core - # starlette # torch - 
# uvicorn -urllib3==1.26.20 +urllib3==2.5.0 # via # botocore # minio diff --git a/requirements/torch-cuda-requirements.txt b/requirements/torch-cuda-requirements.txt index 32da02b..b5ba5df 100644 --- a/requirements/torch-cuda-requirements.txt +++ b/requirements/torch-cuda-requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile torch-cuda-requirements.in @@ -10,22 +10,26 @@ annotated-types==0.7.0 # via pydantic anyio==4.9.0 # via starlette -boto3==1.25.0 +argon2-cffi==25.1.0 + # via minio +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +boto3==1.39.6 # via -r common-requirements.in -botocore==1.28.5 +botocore==1.39.6 # via # boto3 # s3transfer -certifi==2025.6.15 +certifi==2025.7.14 # via # minio # requests +cffi==1.17.1 + # via argon2-cffi-bindings charset-normalizer==3.4.2 # via requests -click==8.1.8 +click==8.2.1 # via uvicorn -exceptiongroup==1.3.0 - # via anyio fastapi==0.115.2 # via -r mini-requirements.in filelock==3.18.0 @@ -33,7 +37,7 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fsspec==2025.5.1 +fsspec==2025.7.0 # via # huggingface-hub # torch @@ -41,7 +45,7 @@ h11==0.16.0 # via uvicorn hf-xet==1.1.5 # via huggingface-hub -# huggingface-hub==0.33.2 +huggingface-hub==0.33.4 # via # tokenizers # transformers @@ -61,11 +65,11 @@ joblib==1.5.1 # scikit-optimize markupsafe==3.0.2 # via jinja2 -minio==7.1.12 +minio==7.2.15 # via -r common-requirements.in mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.5 # via torch numpy==1.23.4 # via @@ -83,8 +87,12 @@ pandas==1.5.1 # via -r common-requirements.in psycopg2-binary==2.9.9 # via -r common-requirements.in -pyaml==25.5.0 +pyaml==25.7.0 # via scikit-optimize +pycparser==2.22 + # via cffi +pycryptodome==3.23.0 + # via minio pydantic==2.7.4 # via # -r mini-requirements.in @@ -104,16 +112,16 @@ pyyaml==6.0.2 # transformers regex==2024.11.6 # via 
transformers -requests==2.31.0 +requests==2.32.4 # via # -r mini-requirements.in # huggingface-hub # transformers -s3transfer==0.6.2 +s3transfer==0.13.0 # via boto3 safetensors==0.5.3 # via transformers -scikit-learn==1.1.2 +scikit-learn==1.5.2 # via # -r torch-cuda-requirements.in # scikit-optimize @@ -135,28 +143,26 @@ sympy==1.14.0 # via torch threadpoolctl==3.6.0 # via scikit-learn -# tokenizers==0.21.2 +tokenizers==0.21.2 # via transformers -# torch==2.7.1 +torch==2.7.1 # via -r torch-cuda-requirements.in tqdm==4.67.1 # via # huggingface-hub # transformers -# transformers==4.50.0 +transformers==4.53.2 # via -r torch-cuda-requirements.in -typing-extensions==4.14.0 +typing-extensions==4.14.1 # via # anyio - # exceptiongroup # fastapi # huggingface-hub + # minio # pydantic # pydantic-core - # starlette # torch - # uvicorn -urllib3==1.26.20 +urllib3==2.5.0 # via # botocore # minio From 50426fb3f6b7d479e675d4f46767efab98362b5c Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 17 Jul 2025 21:49:34 +0200 Subject: [PATCH 2/9] fix(ci): update requirements --- requirements/gpu-requirements.in | 9 ++++----- requirements/requirements.in | 8 +++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index ce66ebf..da3b804 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -1,9 +1,8 @@ --r torch-cuda-requirements.txt -embedders==0.1.8 +-r torch-cpu-requirements.txt +embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade spacy==3.7.5 -pydantic==2.7.4 -torchvision==0.17.0 # define version for torchvision to avoid dependency conflict +torchvision==0.19.1 # define version for torchvision to avoid dependency conflict +sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change openai>=0.27.6,<1.0 # define version for openai to avoid dependency 
conflict for embedder lib -huggingface-hub==0.25.2 # define version for huggingface-hub to avoid dependency conflict for embedder lib httpx==0.28.1 httpcore==1.0.9 \ No newline at end of file diff --git a/requirements/requirements.in b/requirements/requirements.in index 1415a3c..da3b804 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,10 +1,8 @@ -r torch-cpu-requirements.txt -embedders==0.1.8 +embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade spacy==3.7.5 -pydantic==2.7.4 -torchvision==0.17.0 # define version for torchvision to avoid dependency conflict -sentence-transformers~=2.2.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change +torchvision==0.19.1 # define version for torchvision to avoid dependency conflict +sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib -huggingface-hub==0.25.2 # define version for huggingface-hub to avoid dependency conflict for embedder lib httpx==0.28.1 httpcore==1.0.9 \ No newline at end of file From 9d90035d33131d572e10fa02e723200658aed370 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 17 Jul 2025 21:55:16 +0200 Subject: [PATCH 3/9] fix(ci): requirements --- requirements/gpu-requirements.in | 6 ++---- requirements/requirements.in | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index da3b804..c93be56 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -1,8 +1,6 @@ -r torch-cpu-requirements.txt embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade spacy==3.7.5 -torchvision==0.19.1 # define version for torchvision to avoid dependency conflict 
+torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib -httpx==0.28.1 -httpcore==1.0.9 \ No newline at end of file +openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file diff --git a/requirements/requirements.in b/requirements/requirements.in index da3b804..c93be56 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,8 +1,6 @@ -r torch-cpu-requirements.txt embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade spacy==3.7.5 -torchvision==0.19.1 # define version for torchvision to avoid dependency conflict +torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change -openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib -httpx==0.28.1 -httpcore==1.0.9 \ No newline at end of file +openai>=0.27.6,<1.0 # define version for openai to avoid dependency conflict for embedder lib \ No newline at end of file From 12b042c5d8df29e834a8d9baaf2aa0e57eeffc84 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 17 Jul 2025 19:56:16 +0000 Subject: [PATCH 4/9] ci: update requirements.txt --- gpu-requirements.txt | 251 ++++++++++++++++++++++------------------- requirements.txt | 261 ++++++++++++++++++++++--------------------- 2 files changed, 272 insertions(+), 240 deletions(-) diff --git a/gpu-requirements.txt b/gpu-requirements.txt index b76f9c4..0ca0cd0 100644 --- a/gpu-requirements.txt +++ b/gpu-requirements.txt @@ -1,35 +1,43 @@ # -# This file is autogenerated by pip-compile 
with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=gpu-requirements.txt requirements/gpu-requirements.in # ---extra-index-url https://download.pytorch.org/whl/cu113 +--extra-index-url https://download.pytorch.org/whl/cpu -aiohttp==3.9.5 +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 # via openai -aiosignal==1.3.1 +aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # pydantic anyio==4.9.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # httpx # starlette -async-timeout==4.0.3 - # via aiohttp -attrs==23.2.0 +argon2-cffi==25.1.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # minio +argon2-cffi-bindings==21.2.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # argon2-cffi +attrs==25.3.0 # via aiohttp blis==0.7.11 # via thinc -boto3==1.25.0 - # via -r requirements/torch-cuda-requirements.txt -botocore==1.28.5 +boto3==1.39.6 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +botocore==1.39.6 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -37,148 +45,147 @@ catalogue==2.0.10 # spacy # srsly # thinc 
-certifi==2025.6.15 +certifi==2025.7.14 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # httpcore # httpx # minio # requests +cffi==1.17.1 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # requests -click==8.1.8 +click==8.2.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # typer # uvicorn -cloudpathlib==0.18.1 +cloudpathlib==0.21.1 # via weasel -cohere==5.3.5 +cohere==5.16.1 # via embedders confection==0.1.5 # via # thinc # weasel -cymem==2.0.8 +cymem==2.0.11 # via # preshed # spacy # thinc -embedders==0.1.8 +embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade # via -r requirements/gpu-requirements.in -exceptiongroup==1.3.0 - # via - # -r requirements/torch-cuda-requirements.txt - # anyio fastapi==0.115.2 - # via -r requirements/torch-cuda-requirements.txt -fastavro==1.9.4 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +fastavro==1.11.1 # via cohere filelock==3.18.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch # transformers -frozenlist==1.4.1 +frozenlist==1.7.0 # via # aiohttp # aiosignal 
-fsspec==2025.5.1 +fsspec==2025.7.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch +greenlet==3.2.3 + # via sqlalchemy h11==0.16.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # httpcore # uvicorn hf-xet==1.1.5 - # via -r requirements/torch-cuda-requirements.txt -httpcore==1.0.9 # via - # -r requirements/gpu-requirements.in - # httpx + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # huggingface-hub +httpcore==1.0.9 + # via httpx httpx==0.28.1 - # via - # -r requirements/gpu-requirements.in - # cohere + # via cohere httpx-sse==0.4.0 # via cohere -huggingface-hub==0.25.2 +huggingface-hub==0.33.4 # via - # -r requirements/gpu-requirements.in + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # sentence-transformers # tokenizers # transformers idna==3.10 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio # httpx # requests # yarl jinja2==3.1.6 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # spacy # torch jmespath==1.0.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 
# botocore joblib==1.5.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize -langcodes==3.4.0 +langcodes==3.5.0 # via spacy -language-data==1.2.0 +language-data==1.3.0 # via langcodes -marisa-trie==1.2.0 +marisa-trie==1.2.1 # via language-data markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # jinja2 mdurl==0.1.2 # via markdown-it-py -minio==7.1.12 - # via -r requirements/torch-cuda-requirements.txt +minio==7.2.15 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt mpmath==1.3.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # sympy -multidict==6.0.5 +multidict==6.6.3 # via # aiohttp # yarl -murmurhash==1.0.10 +murmurhash==1.0.13 # via # preshed # spacy # thinc -networkx==3.2.1 +networkx==3.5 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # torch numpy==1.23.4 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # blis # embedders # pandas # scikit-learn # scikit-optimize # scipy - # sentence-transformers # spacy # thinc # torchvision @@ -189,32 +196,43 @@ openai==0.28.1 # embedders packaging==25.0 # via - # -r 
requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # spacy # thinc # transformers # weasel pandas==1.5.1 - # via -r requirements/torch-cuda-requirements.txt -pillow==10.3.0 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +pillow==11.3.0 # via # sentence-transformers # torchvision -preshed==3.0.9 +preshed==3.0.10 # via # spacy # thinc +propcache==0.3.2 + # via + # aiohttp + # yarl psycopg2-binary==2.9.9 - # via -r requirements/torch-cuda-requirements.txt -pyaml==25.5.0 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +pyaml==25.7.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-optimize +pycparser==2.22 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # cffi +pycryptodome==3.23.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # minio pydantic==2.7.4 # via - # -r requirements/gpu-requirements.in - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # confection # fastapi @@ -223,76 +241,78 @@ pydantic==2.7.4 # weasel pydantic-core==2.18.4 # via - # -r requirements/torch-cuda-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # cohere # pydantic -pygments==2.18.0 +pygments==2.19.2 # via rich python-dateutil==2.9.0.post0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # botocore # pandas pytz==2025.2 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # pandas pyyaml==6.0.2 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # pyaml # transformers regex==2024.11.6 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers -requests==2.31.0 +requests==2.32.4 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # huggingface-hub # openai # spacy - # torchvision # transformers # weasel -rich==13.7.1 +rich==14.0.0 # via typer -s3transfer==0.6.2 +s3transfer==0.13.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 safetensors==0.5.3 # via - # -r requirements/torch-cuda-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers -scikit-learn==1.1.2 +scikit-learn==1.5.2 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 - # via -r requirements/torch-cuda-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt scipy==1.13.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize # sentence-transformers -sentence-transformers==3.0.1 - # via embedders +sentence-transformers==5.0.0 + # via + # -r requirements/gpu-requirements.in + # embedders shellingham==1.5.4 # via typer six==1.17.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # python-dateutil -smart-open==7.0.4 +smart-open==7.3.0.post1 # via weasel sniffio==1.3.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio spacy==3.7.5 # via @@ -303,8 +323,8 @@ spacy-legacy==3.0.12 spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 - # via -r requirements/torch-cuda-requirements.txt -srsly==2.4.8 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +srsly==2.5.1 # via # 
confection # spacy @@ -312,73 +332,74 @@ srsly==2.4.8 # weasel starlette==0.40.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # fastapi sympy==1.14.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # torch thinc==8.2.5 # via spacy threadpoolctl==3.6.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn -tokenizers==0.19.1 +tokenizers==0.21.2 # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # transformers -torch==2.2.0 +torch==2.7.1+cpu # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # sentence-transformers # torchvision -torchvision==0.17.0 +torchvision==0.22.1+cpu # via -r requirements/gpu-requirements.in tqdm==4.67.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # huggingface-hub # openai # sentence-transformers # spacy # transformers -transformers==4.41.2 +transformers==4.53.2 # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # sentence-transformers -typer==0.12.3 +typer==0.16.0 # via # spacy # weasel -types-requests==2.31.0.6 +types-requests==2.32.4.20250611 # via cohere 
-types-urllib3==1.26.25.14 - # via types-requests -typing-extensions==4.14.0 +typing-extensions==4.14.1 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # aiosignal # anyio - # cloudpathlib # cohere - # exceptiongroup # fastapi # huggingface-hub + # minio # pydantic # pydantic-core - # starlette + # sentence-transformers # torch # typer - # uvicorn -urllib3==1.26.20 +urllib3==2.5.0 # via - # -r requirements/torch-cuda-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # botocore # minio # requests + # types-requests uvicorn==0.35.0 - # via -r requirements/torch-cuda-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt wasabi==1.1.3 # via # spacy @@ -386,9 +407,9 @@ wasabi==1.1.3 # weasel weasel==0.4.1 # via spacy -wrapt==1.16.0 +wrapt==1.17.2 # via smart-open -yarl==1.9.4 +yarl==1.20.1 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements.txt b/requirements.txt index 2254e91..15034a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,35 +1,43 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # # pip-compile --output-file=requirements.txt requirements/requirements.in # --extra-index-url https://download.pytorch.org/whl/cpu -aiohttp==3.9.5 +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 # via openai -aiosignal==1.3.1 +aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # pydantic anyio==4.9.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # httpx # starlette -async-timeout==4.0.3 - # via aiohttp -attrs==23.2.0 +argon2-cffi==25.1.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # minio +argon2-cffi-bindings==21.2.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # argon2-cffi +attrs==25.3.0 # via aiohttp blis==0.7.11 # via thinc -boto3==1.25.0 - # via -r requirements/torch-cpu-requirements.txt -botocore==1.28.5 +boto3==1.39.6 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +botocore==1.39.6 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -37,152 +45,147 @@ catalogue==2.0.10 # spacy # srsly # thinc -certifi==2025.6.15 +certifi==2025.7.14 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # httpcore # httpx # minio # requests +cffi==1.17.1 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r 
requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # requests -click==8.1.8 +click==8.2.1 # via - # -r requirements/torch-cpu-requirements.txt - # nltk + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # typer # uvicorn -cloudpathlib==0.18.1 +cloudpathlib==0.21.1 # via weasel -cohere==5.3.5 +cohere==5.16.1 # via embedders confection==0.1.5 # via # thinc # weasel -cymem==2.0.8 +cymem==2.0.11 # via # preshed # spacy # thinc -embedders==0.1.8 +embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade # via -r requirements/requirements.in -exceptiongroup==1.3.0 - # via - # -r requirements/torch-cpu-requirements.txt - # anyio fastapi==0.115.2 - # via -r requirements/torch-cpu-requirements.txt -fastavro==1.9.4 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +fastavro==1.11.1 # via cohere filelock==3.18.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch # transformers -frozenlist==1.4.1 +frozenlist==1.7.0 # via # aiohttp # aiosignal -fsspec==2025.5.1 +fsspec==2025.7.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # torch +greenlet==3.2.3 + # via sqlalchemy h11==0.16.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # 
httpcore # uvicorn hf-xet==1.1.5 - # via -r requirements/torch-cpu-requirements.txt -httpcore==1.0.9 # via - # -r requirements/requirements.in - # httpx + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # huggingface-hub +httpcore==1.0.9 + # via httpx httpx==0.28.1 - # via - # -r requirements/requirements.in - # cohere + # via cohere httpx-sse==0.4.0 # via cohere -huggingface-hub==0.25.2 +huggingface-hub==0.33.4 # via - # -r requirements/requirements.in + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # sentence-transformers # tokenizers # transformers idna==3.10 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio # httpx # requests # yarl jinja2==3.1.6 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # spacy # torch jmespath==1.0.1 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 # botocore joblib==1.5.1 # via - # -r requirements/torch-cpu-requirements.txt - # nltk + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize -langcodes==3.4.0 +langcodes==3.5.0 # via spacy -language-data==1.2.0 +language-data==1.3.0 # via langcodes -marisa-trie==1.2.0 +marisa-trie==1.2.1 # via language-data markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 # via - # -r 
requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # jinja2 mdurl==0.1.2 # via markdown-it-py -minio==7.1.12 - # via -r requirements/torch-cpu-requirements.txt +minio==7.2.15 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt mpmath==1.3.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # sympy -multidict==6.0.5 +multidict==6.6.3 # via # aiohttp # yarl -murmurhash==1.0.10 +murmurhash==1.0.13 # via # preshed # spacy # thinc -networkx==3.2.1 +networkx==3.5 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # torch -nltk==3.8.1 - # via sentence-transformers numpy==1.23.4 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # blis # embedders # pandas # scikit-learn # scikit-optimize # scipy - # sentence-transformers # spacy # thinc # torchvision @@ -193,30 +196,43 @@ openai==0.28.1 # embedders packaging==25.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # spacy # thinc # transformers # weasel pandas==1.5.1 - # via -r requirements/torch-cpu-requirements.txt -pillow==10.3.0 - # via torchvision -preshed==3.0.9 + # via -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +pillow==11.3.0 + # via + # sentence-transformers + # torchvision +preshed==3.0.10 # via # spacy # thinc +propcache==0.3.2 + # via + # aiohttp + # yarl psycopg2-binary==2.9.9 - # via -r requirements/torch-cpu-requirements.txt -pyaml==25.5.0 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +pyaml==25.7.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-optimize +pycparser==2.22 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # cffi +pycryptodome==3.23.0 + # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # minio pydantic==2.7.4 # via - # -r requirements/requirements.in - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # confection # fastapi @@ -225,81 +241,78 @@ pydantic==2.7.4 # weasel pydantic-core==2.18.4 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # cohere # pydantic -pygments==2.18.0 +pygments==2.19.2 # via rich python-dateutil==2.9.0.post0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt 
# botocore # pandas pytz==2025.2 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # pandas pyyaml==6.0.2 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub # pyaml # transformers regex==2024.11.6 # via - # -r requirements/torch-cpu-requirements.txt - # nltk + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers -requests==2.31.0 +requests==2.32.4 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # huggingface-hub # openai # spacy - # torchvision # transformers # weasel -rich==13.7.1 +rich==14.0.0 # via typer -s3transfer==0.6.2 +s3transfer==0.13.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # boto3 safetensors==0.5.3 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # transformers -scikit-learn==1.1.2 +scikit-learn==1.5.2 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 - # via -r requirements/torch-cpu-requirements.txt + # via -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt scipy==1.13.1 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn # scikit-optimize # sentence-transformers -sentence-transformers==2.2.2 +sentence-transformers==5.0.0 # via # -r requirements/requirements.in # embedders -sentencepiece==0.2.0 - # via sentence-transformers shellingham==1.5.4 # via typer six==1.17.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # python-dateutil -smart-open==7.0.4 +smart-open==7.3.0.post1 # via weasel sniffio==1.3.1 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio spacy==3.7.5 # via @@ -310,8 +323,8 @@ spacy-legacy==3.0.12 spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 - # via -r requirements/torch-cpu-requirements.txt -srsly==2.4.8 + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt +srsly==2.5.1 # via # confection # spacy @@ -319,76 +332,74 @@ srsly==2.4.8 # weasel starlette==0.40.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # fastapi sympy==1.14.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # torch 
thinc==8.2.5 # via spacy threadpoolctl==3.6.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # scikit-learn -tokenizers==0.19.1 +tokenizers==0.21.2 # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # transformers -torch==2.2.0 +torch==2.7.1+cpu # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # sentence-transformers # torchvision -torchvision==0.17.0 - # via - # -r requirements/requirements.in - # sentence-transformers +torchvision==0.22.1+cpu + # via -r requirements/requirements.in tqdm==4.67.1 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # huggingface-hub - # nltk # openai # sentence-transformers # spacy # transformers -transformers==4.41.2 +transformers==4.53.2 # via + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # sentence-transformers -typer==0.12.3 +typer==0.16.0 # via # spacy # weasel -types-requests==2.31.0.6 +types-requests==2.32.4.20250611 # via cohere -types-urllib3==1.26.25.14 - # via types-requests -typing-extensions==4.14.0 +typing-extensions==4.14.1 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # aiosignal # anyio - # cloudpathlib # cohere - # exceptiongroup # fastapi # huggingface-hub + # minio # pydantic # pydantic-core - # starlette + 
# sentence-transformers # torch # typer - # uvicorn -urllib3==1.26.20 +urllib3==2.5.0 # via - # -r requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # botocore # minio # requests + # types-requests uvicorn==0.35.0 - # via -r requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt wasabi==1.1.3 # via # spacy @@ -396,9 +407,9 @@ wasabi==1.1.3 # weasel weasel==0.4.1 # via spacy -wrapt==1.16.0 +wrapt==1.17.2 # via smart-open -yarl==1.9.4 +yarl==1.20.1 # via aiohttp # The following packages are considered to be unsafe in a requirements file: From 6e58a940b879b9ade639daec700ee5e0491a315f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 18 Jul 2025 08:22:37 +0200 Subject: [PATCH 5/9] test: include git in dockerfile --- Dockerfile | 2 ++ dev.Dockerfile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Dockerfile b/Dockerfile index 045935e..111ba05 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,8 @@ WORKDIR /program COPY requirements.txt . +RUN apt-get update && apt-get install -y git --no-install-recommends + RUN pip3 install --no-cache-dir -r requirements.txt COPY / . diff --git a/dev.Dockerfile b/dev.Dockerfile index ee09b36..d9d3332 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -6,6 +6,8 @@ VOLUME ["/app"] COPY requirements*.txt . +RUN apt-get update && apt-get install -y git --no-install-recommends + RUN pip3 install --no-cache-dir -r requirements-dev.txt COPY / . 
From b59cfc67ac112931dcc7e2dbecc3d2745ea094bd Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Fri, 18 Jul 2025 14:06:55 +0200 Subject: [PATCH 6/9] perf: integrate embedders --- .gitignore | 3 - Dockerfile | 2 - app.py | 8 +- controller.py | 44 +-- dev.Dockerfile | 2 - requirements.txt | 4 +- requirements/gpu-requirements.in | 3 +- requirements/requirements.in | 1 - {data => src}/__init__.py | 0 {util => src/data}/__init__.py | 0 {data => src/data}/data_type.py | 0 {data => src/data}/s3.py | 0 src/embedders/__init__.py | 193 +++++++++++ src/embedders/classification/__init__.py | 7 + src/embedders/classification/contextual.py | 193 +++++++++++ src/embedders/classification/count_based.py | 63 ++++ src/embedders/classification/reduce.py | 52 +++ src/embedders/enums.py | 6 + src/embedders/extraction/__init__.py | 23 ++ src/embedders/extraction/contextual.py | 349 ++++++++++++++++++++ src/embedders/extraction/count_based.py | 47 +++ src/embedders/extraction/reduce.py | 64 ++++ src/embedders/samples/__init__.py | 0 src/embedders/samples/clickbait.py | 106 ++++++ src/embedders/util.py | 12 + src/util/__init__.py | 0 {util => src/util}/config_handler.py | 2 +- {util => src/util}/daemon.py | 0 {util => src/util}/decorator.py | 0 {util => src/util}/embedders.py | 14 +- {util => src/util}/notification.py | 2 +- {util => src/util}/request_util.py | 0 32 files changed, 1155 insertions(+), 45 deletions(-) rename {data => src}/__init__.py (100%) rename {util => src/data}/__init__.py (100%) rename {data => src/data}/data_type.py (100%) rename {data => src/data}/s3.py (100%) create mode 100644 src/embedders/__init__.py create mode 100644 src/embedders/classification/__init__.py create mode 100644 src/embedders/classification/contextual.py create mode 100644 src/embedders/classification/count_based.py create mode 100644 src/embedders/classification/reduce.py create mode 100644 src/embedders/enums.py create mode 100644 src/embedders/extraction/__init__.py create mode 100644 
src/embedders/extraction/contextual.py create mode 100644 src/embedders/extraction/count_based.py create mode 100644 src/embedders/extraction/reduce.py create mode 100644 src/embedders/samples/__init__.py create mode 100644 src/embedders/samples/clickbait.py create mode 100644 src/embedders/util.py create mode 100644 src/util/__init__.py rename {util => src/util}/config_handler.py (97%) rename {util => src/util}/daemon.py (100%) rename {util => src/util}/decorator.py (100%) rename {util => src/util}/embedders.py (88%) rename {util => src/util}/notification.py (96%) rename {util => src/util}/request_util.py (100%) diff --git a/.gitignore b/.gitignore index d7e5aaa..ff81e70 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,3 @@ dmypy.json # Pyre type checker .pyre/ .DS_Store - -# embedders package -embedders/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 111ba05..045935e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,6 @@ WORKDIR /program COPY requirements.txt . -RUN apt-get update && apt-get install -y git --no-install-recommends - RUN pip3 install --no-cache-dir -r requirements.txt COPY / . 
diff --git a/app.py b/app.py index dd485ac..9d98f6f 100644 --- a/app.py +++ b/app.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- from fastapi import FastAPI, responses, status, Request -import controller -from data import data_type from typing import Union + import torch +from src.util import request_util +from src.data import data_type +import controller + from submodules.model.business_objects import general -from util import request_util from submodules.model import session app = FastAPI() diff --git a/controller.py b/controller.py index 5b2087a..260406e 100644 --- a/controller.py +++ b/controller.py @@ -1,4 +1,27 @@ # -*- coding: utf-8 -*- +from typing import Any, Dict, Iterator, List, Optional +from fastapi import status +from spacy.tokens import DocBin, Doc +from spacy.vocab import Vocab + +import pickle +import torch +import traceback +import logging +import time +import zlib +import gc +import os +import openai +import pandas as pd + +from src.embedders import Transformer +from src.util import daemon, request_util +from src.util.decorator import param_throttle +from src.util.embedders import get_embedder +from src.util.notification import send_project_update, embedding_warning_templates + +from submodules.s3 import controller as s3 from submodules.model import enums from submodules.model.business_objects import ( attribute, @@ -10,27 +33,6 @@ notification, organization, ) -from fastapi import status -import pickle -import torch -import traceback -import logging -import time -import zlib -from spacy.tokens import DocBin, Doc -from spacy.vocab import Vocab -from embedders import Transformer -from typing import Any, Dict, Iterator, List, Optional - -from util import daemon, request_util -from util.decorator import param_throttle -from util.embedders import get_embedder -from util.notification import send_project_update, embedding_warning_templates -import os -import pandas as pd -from submodules.s3 import controller as s3 -import openai -import gc 
logging.basicConfig(level=logging.INFO) diff --git a/dev.Dockerfile b/dev.Dockerfile index d9d3332..ee09b36 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -6,8 +6,6 @@ VOLUME ["/app"] COPY requirements*.txt . -RUN apt-get update && apt-get install -y git --no-install-recommends - RUN pip3 install --no-cache-dir -r requirements-dev.txt COPY / . diff --git a/requirements.txt b/requirements.txt index 15034a5..826e6b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -349,13 +349,13 @@ tokenizers==0.21.2 # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # cohere # transformers -torch==2.7.1+cpu +torch==2.7.1 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # embedders # sentence-transformers # torchvision -torchvision==0.22.1+cpu +torchvision==0.22.1 # via -r requirements/requirements.in tqdm==4.67.1 # via diff --git a/requirements/gpu-requirements.in b/requirements/gpu-requirements.in index c93be56..ee09aa1 100644 --- a/requirements/gpu-requirements.in +++ b/requirements/gpu-requirements.in @@ -1,5 +1,4 @@ --r torch-cpu-requirements.txt -embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade +-r torch-cuda-requirements.txt spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change diff --git a/requirements/requirements.in b/requirements/requirements.in index c93be56..42b8d96 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,5 +1,4 @@ -r torch-cpu-requirements.txt -embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade spacy==3.7.5 torchvision==0.22.1 # define version for torchvision to avoid dependency 
conflict sentence-transformers==5.0.0 # last version with default_prompt_name & pooling_mode_weightedmean_tokens # higher only possible with embedders/gates change diff --git a/data/__init__.py b/src/__init__.py similarity index 100% rename from data/__init__.py rename to src/__init__.py diff --git a/util/__init__.py b/src/data/__init__.py similarity index 100% rename from util/__init__.py rename to src/data/__init__.py diff --git a/data/data_type.py b/src/data/data_type.py similarity index 100% rename from data/data_type.py rename to src/data/data_type.py diff --git a/data/s3.py b/src/data/s3.py similarity index 100% rename from data/s3.py rename to src/data/s3.py diff --git a/src/embedders/__init__.py b/src/embedders/__init__.py new file mode 100644 index 0000000..ab1b0e1 --- /dev/null +++ b/src/embedders/__init__.py @@ -0,0 +1,193 @@ +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Generator, Optional, Union +from spacy.tokens.doc import Doc +from sklearn.decomposition import PCA +from tqdm import tqdm +from embedders import util +from joblib import dump, load + + +class Transformer(metaclass=ABCMeta): + def __init__(self): + self._warnings = {} + + @abstractmethod + def fit_transform( + self, documents: List[Union[str, Doc]], as_generator: bool + ) -> Union[List, Generator]: + """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. + + Args: + documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. + as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. + + Returns: + Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 
+ """ + pass + + @abstractmethod + def transform( + self, documents: List[Union[str, Doc]], as_generator: bool + ) -> Union[List, Generator]: + """Uses the trained algorithm to embed textual documents into semantic vector-spacy representations. + + Args: + documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. + as_generator (bool): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values. + + Returns: + Union[List, Generator]: List with all embeddings or generator that yields the embeddings. + """ + pass + + @abstractmethod + def get_warnings(self) -> Dict: + """Collects all warnings reported during the embedding creation or PCA. + + Returns: + List: List with all warnings + """ + pass + + +class Embedder(Transformer, metaclass=ABCMeta): + def __init__(self): + super().__init__() + + @abstractmethod + def _encode(self, documents: List[Union[str, Doc]], fit_model: bool) -> Generator: + pass + + def _encode_batch( + self, + documents: List[Union[str, Doc]], + as_generator: bool, + fit_model: bool, + show_progress: Optional[bool] = True, + ) -> Union[List, Generator]: + if as_generator: + return self._encode(documents, fit_model) + else: + embeddings = [] + if show_progress: + num_batches = util.num_batches(documents, self.batch_size) + print("Initializing model, might take some time...") + for embedding_batch in tqdm( + self._encode(documents, fit_model), + total=num_batches, + desc="Encoding batches ...", + ): + embeddings.extend(embedding_batch) + else: + for embedding_batch in self._encode(documents, fit_model): + embeddings.extend(embedding_batch) + return embeddings + + def fit_transform( + self, documents: List[Union[str, Doc]], as_generator: bool = False + ) -> Union[List, Generator]: + return self._encode_batch(documents, as_generator, True) + + def transform( + self, documents: List[Union[str, Doc]], as_generator: bool = False + ) -> Union[List, 
Generator]: + return self._encode_batch(documents, as_generator, False) + + def get_warnings(self) -> Dict: + return self._warnings + + +class PCAReducer(Transformer, metaclass=ABCMeta): + """Wraps embedder into a principial component analysis to reduce the dimensionality. + + Args: + embedder (Embedder): Algorithm to embed the documents. + n_components (int, optional): Number of principal components to keep. Defaults to 8. + autocorrect_n_components (bool, optional): If there are less data samples than specified components, this will automatically reduce the number of principial components. Defaults to True. + """ + + def __init__( + self, + embedder: Embedder, + n_components: int = 8, + autocorrect_n_components: bool = True, + **kwargs + ): + super().__init__() + self.embedder = embedder + self.reducer = PCA(n_components=n_components, **kwargs) + self.batch_size = self.embedder.batch_size + self.autocorrect_n_components = autocorrect_n_components + + def store_pca_weights(self, file_name: str): + """Stores the PCA weights to a file. + + Args: + file_name (str): Path to the file without any file endings. + """ + dump(self.reducer, f'{file_name}.joblib') + + def load_pca_weights(self, file_name: str): + """Loads the PCA weights from a file. + + Args: + file_name (str): Path to the file without any file endings. 
+ """ + self.reducer = load(f'{file_name}.joblib') + + @abstractmethod + def _reduce( + self, + documents: List[Union[str, Doc]], + fit_model: bool, + fit_after_n_batches: int, + ): + pass + + def _reduce_batch( + self, + documents: List[Union[str, Doc]], + as_generator: bool, + fit_model: bool, + fit_after_n_batches: int, + ) -> Union[List, Generator]: + if as_generator: + return self._reduce(documents, fit_model, fit_after_n_batches) + else: + embeddings = [] + for embedding_batch in self._reduce( + documents, fit_model, fit_after_n_batches + ): + embeddings.extend(embedding_batch) + return embeddings + + def fit_transform( + self, + documents: List[Union[str, Doc]], + as_generator: bool = False, + fit_after_n_batches: int = 5, + ) -> Union[List, Generator]: + """Trains the given algorithm to embed textual documents into semantic vector-spacy representations. + + Args: + documents (List[Union[str, Doc]]): List of plain strings or spaCy documents. + as_generator (bool, optional): Embeddings are calculated batch-wise. If this is set to False, the results will be summarized in one list, else a generator will yield the values.. Defaults to False. + fit_after_n_batches (int, optional): Maximal batch iteration, after which the PCA is fitted. Defaults to 5. + + Returns: + Union[List, Generator]: List with all embeddings or generator that yields the embeddings. 
+ """ + + return self._reduce_batch( + documents, + as_generator, + True, + fit_after_n_batches, + ) + + def transform(self, documents, as_generator=False) -> Union[List, Generator]: + return self._reduce_batch(documents, as_generator, False, 0) + + def get_warnings(self) -> Dict: + return {**self._warnings, **self.embedder.get_warnings()} diff --git a/src/embedders/classification/__init__.py b/src/embedders/classification/__init__.py new file mode 100644 index 0000000..821d087 --- /dev/null +++ b/src/embedders/classification/__init__.py @@ -0,0 +1,7 @@ +from embedders import Embedder + + +class SentenceEmbedder(Embedder): + def __init__(self, batch_size: int = 128): + super().__init__() + self.batch_size = batch_size diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py new file mode 100644 index 0000000..ed98a1e --- /dev/null +++ b/src/embedders/classification/contextual.py @@ -0,0 +1,193 @@ +from typing import List, Optional, Union, Generator +from sentence_transformers import SentenceTransformer +from embedders import util +from embedders.classification import SentenceEmbedder +from spacy.tokens.doc import Doc +import torch +import openai +from openai import error as openai_error +import cohere +import time + + +class TransformerSentenceEmbedder(SentenceEmbedder): + """Embeds documents using large, pre-trained transformers from https://huggingface.co + + Args: + config_string (str): Name of the model listed on https://huggingface.co/models + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. 
+ """ + + def __init__(self, config_string: str, batch_size: int = 128): + super().__init__(batch_size) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model = SentenceTransformer(config_string).to(self.device) + + def _encode( + self, documents: List[Union[str, Doc]], fit_model: bool + ) -> Generator[List[List[float]], None, None]: + for documents_batch in util.batch(documents, self.batch_size): + yield self.model.encode(documents_batch, show_progress_bar=False).tolist() + + +class HuggingFaceSentenceEmbedder(TransformerSentenceEmbedder): + def __init__(self, config_string: str, batch_size: int = 128): + super().__init__(config_string, batch_size) + + +class OpenAISentenceEmbedder(SentenceEmbedder): + def __init__( + self, + openai_api_key: str, + model_name: str, + batch_size: int = 128, + api_base: Optional[str] = None, + api_type: Optional[str] = None, + api_version: Optional[str] = None, + ): + """ + Embeds documents using large language models from https://openai.com or https://azure.microsoft.com + + Args: + openai_api_key (str): API key from OpenAI or Azure + model_name (str): Name of the embedding model from OpenAI (e.g. text-embedding-ada-002) or the name of your Azure endpoint + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + api_base (str, optional): If you use Azure, you need to provide the base URL of your Azure endpoint (e.g. 'https://azureopenkernai.openai.azure.com/'). Defaults to None. + api_type (str, optional): If you use Azure, you need to provide the type of your Azure endpoint (e.g. 'azure'). Defaults to None. + api_version (str, optional): If you use Azure, you need to provide the version of your Azure endpoint (e.g. '2023-05-15'). Defaults to None. + + Raises: + Exception: If you use Azure, you need to provide api_type, api_version and api_base. 
+ + Examples: + >>> from embedders.classification.contextual import OpenAISentenceEmbedder + >>> embedder_openai = OpenAISentenceEmbedder( + ... "my-key-from-openai", + ... "text-embedding-ada-002", + ... ) + >>> embeddings = embedder_openai.transform(["This is a test", "This is another test"]) + >>> print(embeddings) + [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] + + >>> from embedders.classification.contextual import OpenAISentenceEmbedder + >>> embedder_azure = OpenAISentenceEmbedder( + ... "my-key-from-azure", + ... "my-endpoint-name", + ... api_base="https://azureopenkernai.openai.azure.com/", + ... api_type="azure", + ... api_version="2023-05-15", + ... ) + >>> embeddings = embedder_azure.transform(["This is a test", "This is another test"]) + >>> print(embeddings) + [[-0.0001, 0.0002, ...], [-0.0001, 0.0002, ...]] + + """ + super().__init__(batch_size) + self.model_name = model_name + self.openai_api_key = openai_api_key + openai.api_key = self.openai_api_key + self.api_base = api_base + self.api_type = api_type + self.api_version = api_version + + self.use_azure = any( + [ + api_base is not None, + api_type is not None, + api_version is not None, + ] + ) + if self.use_azure: + assert ( + api_type is not None + and api_version is not None + and api_base is not None + ), "If you want to use Azure, you need to provide api_type, api_version and api_base." 
+ + openai.api_base = api_base + openai.api_type = api_type + openai.api_version = api_version + + def __getstate__(self): + state = self.__dict__.copy() + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.model_name = state["model_name"] + self.openai_api_key = state["openai_api_key"] + openai.api_key = self.openai_api_key + self.use_azure = state.get("use_azure") + if self.use_azure: + self.api_base = state["api_base"] + self.api_type = state["api_type"] + self.api_version = state["api_version"] + openai.api_base = self.api_base + openai.api_type = self.api_type + openai.api_version = self.api_version + + def _encode( + self, documents: List[Union[str, Doc]], fit_model: bool + ) -> Generator[List[List[float]], None, None]: + for documents_batch in util.batch(documents, self.batch_size): + documents_batch = [doc.replace("\n", " ") for doc in documents_batch] + try: + if self.use_azure: + embeddings = [] + for azure_batch in util.batch(documents_batch, 16): + # azure only allows up to 16 documents per request + count = 0 + while True and count < 60: + try: + count += 1 + response = openai.Embedding.create( + input=azure_batch, engine=self.model_name + ) + break + except openai.error.RateLimitError as e: + if count >= 60: + raise e + if count == 1: + print( + "Rate limit exceeded. Waiting 10 seconds...", + flush=True, + ) + time.sleep(10.05) + else: + time.sleep(1) + embeddings += [entry["embedding"] for entry in response["data"]] + else: + response = openai.Embedding.create( + input=documents_batch, engine=self.model_name + ) + embeddings = [entry["embedding"] for entry in response["data"]] + yield embeddings + except openai_error.AuthenticationError: + raise Exception( + "OpenAI API key is invalid. Please provide a valid API key in the constructor of OpenAISentenceEmbedder." 
+ ) + + +class CohereSentenceEmbedder(SentenceEmbedder): + def __init__(self, cohere_api_key: str, batch_size: int = 128): + super().__init__(batch_size) + self.cohere_api_key = cohere_api_key + self.model = cohere.Client(self.cohere_api_key) + + def __getstate__(self): + state = self.__dict__.copy() + # Don't pickle 'model' + del state["model"] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + # Restore 'model' after unpickling + self.model = cohere.Client(self.cohere_api_key) + + def _encode( + self, documents: List[Union[str, Doc]], fit_model: bool + ) -> Generator[List[List[float]], None, None]: + for documents_batch in util.batch(documents, self.batch_size): + embeddings = self.model.embed(documents_batch).embeddings + yield embeddings diff --git a/src/embedders/classification/count_based.py b/src/embedders/classification/count_based.py new file mode 100644 index 0000000..c9326dc --- /dev/null +++ b/src/embedders/classification/count_based.py @@ -0,0 +1,63 @@ +from typing import List, Union, Generator +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +from embedders.classification import SentenceEmbedder +from embedders import util + + +class CountSentenceEmbedder(SentenceEmbedder): + def __init__(self, batch_size: int, min_df: float, **kwargs): + super().__init__(batch_size) + + def _encode( + self, documents: List[str], fit_model: bool + ) -> Generator[List[List[Union[float, int]]], None, None]: + if fit_model: + self.model.fit(documents) + + for documents_batch in util.batch(documents, self.batch_size): + documents_batch_embedded = [] + for doc in documents_batch: + documents_batch_embedded.append( + self.model.transform([doc]).toarray().tolist()[0] + ) + yield documents_batch_embedded + + +class BagOfCharsSentenceEmbedder(CountSentenceEmbedder): + """Embeds documents using plain Bag of Characters approach. 
+ + Args: + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. + """ + + def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + super().__init__(batch_size, min_df) + self.model = CountVectorizer(analyzer="char", min_df=min_df, **kwargs) + + +class BagOfWordsSentenceEmbedder(CountSentenceEmbedder): + """Embeds documents using plain Bag of Words approach. + + Args: + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. + """ + + def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + super().__init__(batch_size, min_df) + self.model = CountVectorizer(min_df=min_df, **kwargs) + + +class TfidfSentenceEmbedder(CountSentenceEmbedder): + """Embeds documents using Term Frequency - Inverse Document Frequency (TFIDF) approach. + + Args: + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + min_df (float, optional): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float in range of [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. Defaults to 0.1. 
+ """ + + def __init__(self, batch_size: int = 128, min_df: float = 0.1, **kwargs): + super().__init__(batch_size, min_df) + self.model = TfidfVectorizer(min_df=min_df, **kwargs) diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py new file mode 100644 index 0000000..d5d14a2 --- /dev/null +++ b/src/embedders/classification/reduce.py @@ -0,0 +1,52 @@ +from spacy.tokens.doc import Doc +from typing import Union, List, Generator +import numpy as np +from embedders import PCAReducer, util + + +class PCASentenceReducer(PCAReducer): + def _transform( + self, embeddings: List[List[Union[int, float]]] + ) -> List[List[Union[float, int]]]: + return self.reducer.transform(embeddings).tolist() + + def _reduce( + self, + documents: List[Union[str, Doc]], + fit_model: bool, + fit_after_n_batches: int, + ) -> Generator[List[List[Union[float, int]]], None, None]: + if fit_model: + embeddings_training = [] + num_batches = util.num_batches(documents, self.embedder.batch_size) + fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 + for batch_idx, batch in enumerate( + self.embedder.fit_transform(documents, as_generator=True) + ): + if batch_idx <= fit_after_n_batches: + embeddings_training.append(batch) + + if batch_idx == fit_after_n_batches: + embeddings_training_flattened = [] + for batch_training in embeddings_training: + embeddings_training_flattened.extend(batch_training) + embeddings_training_flattened = np.array( + embeddings_training_flattened + ) + if ( + embeddings_training_flattened.shape[1] + < self.reducer.n_components + and self.autocorrect_n_components + ): + self.reducer.n_components = embeddings_training_flattened.shape[ + 1 + ] + self.reducer.fit(embeddings_training_flattened) + + for batch_training in embeddings_training: + yield self._transform(batch_training) + if batch_idx > fit_after_n_batches: + yield self._transform(batch) + else: + embeddings = self.embedder.transform(documents) + yield 
self._transform(embeddings) diff --git a/src/embedders/enums.py b/src/embedders/enums.py new file mode 100644 index 0000000..92771b5 --- /dev/null +++ b/src/embedders/enums.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class WarningType(Enum): + DOCUMENT_IS_SPLITTED = "DOCUMENT_IS_SPLITTED" + TOKEN_MISMATCHING = "TOKEN_MISMATCHING" diff --git a/src/embedders/extraction/__init__.py b/src/embedders/extraction/__init__.py new file mode 100644 index 0000000..61052a3 --- /dev/null +++ b/src/embedders/extraction/__init__.py @@ -0,0 +1,23 @@ +import spacy +from embedders import Embedder +from spacy.tokens.doc import Doc +from typing import Union + + +class TokenEmbedder(Embedder): + def __init__( + self, language_code: str, precomputed_docs: bool = False, batch_size: int = 128 + ): + super().__init__() + self.preloaded = precomputed_docs + if precomputed_docs: + self.nlp = spacy.blank(language_code) + else: + self.nlp = spacy.load(language_code) + self.batch_size = batch_size + + def _get_tokenized_document(self, document: Union[str, Doc]): + if self.preloaded: + return document + else: + return self.nlp(document) diff --git a/src/embedders/extraction/contextual.py b/src/embedders/extraction/contextual.py new file mode 100644 index 0000000..5c8aca6 --- /dev/null +++ b/src/embedders/extraction/contextual.py @@ -0,0 +1,349 @@ +from typing import List, Tuple, Union, Iterator +import torch +import math +import numpy as np +import re +from transformers import AutoTokenizer, AutoModel +from collections import defaultdict +from embedders import util +from spacy.tokens.doc import Doc + + +from embedders.enums import WarningType +from embedders.extraction import TokenEmbedder + + +class TransformerTokenEmbedder(TokenEmbedder): + """Embeds documents using large, pre-trained transformers from https://huggingface.co + + Args: + config_string (str): Name of the model listed on https://huggingface.co/models + language_code (str): Name of the spaCy language model + precomputed_docs 
(bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + """ + + _NL_TOKEN = "[NL]" + + def __init__( + self, + config_string: str, + language_code: str, + precomputed_docs: bool = False, + batch_size: int = 128, + ): + super().__init__(language_code, precomputed_docs, batch_size) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + self.transformer_tokenizer = AutoTokenizer.from_pretrained(config_string) + self.transformer_tokenizer.add_special_tokens( + {"additional_special_tokens": [self._NL_TOKEN]} + ) + + self.model = AutoModel.from_pretrained( + config_string, output_hidden_states=True + ).to(self.device) + self.model.resize_token_embeddings(len(self.transformer_tokenizer)) + + def _encode( + self, documents: Union[List[str], List[Doc]], fit_model: bool + ) -> Iterator[List[List[List[float]]]]: + for batch_number, documents_batch in enumerate( + util.batch(documents, self.batch_size) + ): + """ + Computation of embeddings for each spacy token of each document. For the + embedding creation transformer models are used. Embeddings are calculated + for the transformer tokens of the document, then these token embeddings are + matched to the spacy tokens. + + Args: + documents: list of strings or spacy documents + fit_model: not used, required by base class + Return: + Token embeddings for each document + """ + + documents_batch_embedded = [] + for document_number, document in enumerate(documents_batch): + doc = self._get_tokenized_document(document) + + # no spacy token. + # set special token as text, so that an embedding + # is created that can be processed by the PCA. + if len(doc) == 0: + doc = self.nlp(self._NL_TOKEN) + + # spacy creates tokens which only contain whitespace characters. 
+                # the transformer tokenizer ignores these tokens.
+                # in order to avoid problems while matching the tokens the text is
+                # preprocessed.
+                text, prep_offsets = self._preprocess_doc_text(doc)
+
+                # transformer models have a maximum number of tokens which can be
+                # processed at the same time.
+                # in this case the text is split into multiple subparts
+                number_est_tokens = self._estimate_token_number(text)
+                idx_document = batch_number * self.batch_size + document_number
+                if self.model.config.max_position_embeddings < number_est_tokens:
+                    if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings:
+                        self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append(
+                            idx_document
+                        )
+                    else:
+                        self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [
+                            idx_document
+                        ]
+
+                    transformer_embs = []
+                    for doc_part, index_offset in self._split_document(
+                        text, number_est_tokens
+                    ):
+                        transformer_embs.extend(
+                            self._get_transformer_embeddings(
+                                doc_part, idx_document, index_offset
+                            )
+                        )
+                else:
+                    transformer_embs = self._get_transformer_embeddings(
+                        text, idx_document
+                    )
+
+                document_embedded = self._match_transformer_embeddings_to_spacy_tokens(
+                    transformer_embs, doc, prep_offsets
+                )
+
+                if len(document_embedded) != len(doc):
+                    idx_document = batch_number * self.batch_size + document_number
+                    if WarningType.TOKEN_MISMATCHING.value in self._warnings:
+                        self._warnings[WarningType.TOKEN_MISMATCHING.value].append(
+                            idx_document
+                        )
+                    else:
+                        self._warnings[WarningType.TOKEN_MISMATCHING.value] = [
+                            idx_document
+                        ]
+
+                documents_batch_embedded.append(document_embedded)
+            yield documents_batch_embedded
+
+    def _preprocess_doc_text(self, doc: Doc) -> Tuple[str, np.ndarray]:
+        """Replaces the text of tokens which only consist of whitespace characters
+        with the special token [NL] (new line).
+        The special token and the whitespace string can consist of a different number
+        of chars. To match the tokens later these differences are saved as offsets.
+
+        Args:
+            doc: spacy document
+        Returns:
+            Preprocessed text in which whitespace tokens are
+            replaced by a special token, an array containing the indices of replaced
+            strings and the resulting offset
+        """
+
+        prep_text = ""
+        idx_already_preprocessed = 0
+        # pairs of the line number of the preprocessed text and the offset relative to
+        # the original document, here, offset is the difference between the preprocessed
+        # and the original text.
+        prep_offsets = [(0, 0)]
+
+        for tkn in doc:
+            if not re.sub(r"[\s]+", "", tkn.text):
+                # indices of current token which will be replaced by the special token
+                idx_start, idx_end = tkn.idx, tkn.idx + len(tkn)
+                # append already processed text and the special token
+                prep_text += doc.text[idx_already_preprocessed:idx_start]
+                prep_text += self._NL_TOKEN
+
+                additional_offset = len(tkn) - len(self._NL_TOKEN)
+                prep_offsets.append(
+                    (
+                        len(prep_text),  # index to apply offset
+                        additional_offset,  # offset to be applied
+                    )
+                )
+                idx_already_preprocessed = idx_end
+
+        # add remaining text
+        prep_text += doc.text[idx_already_preprocessed:]
+
+        return prep_text, np.array(prep_offsets)
+
+    def _match_transformer_embeddings_to_spacy_tokens(
+        self,
+        transformer_embeddings: List[List[Tuple[int, int, List[List[float]]]]],
+        document_tokenized: Doc,
+        prep_offsets: np.ndarray = None,
+    ) -> List[List[float]]:
+        """
+        Transformer and spacy tokens differ. Usually the transformer tokenizer
+        splits the text into smaller subparts in comparison to the spacy tokenizer.
+        To create embeddings for the spacy tokens the transformer embeddings must be
+        matched. This is done by comparing the char spans of the tokens and matching the
+        tokens which overlap.
+ + Args: + transformer_embeddings: List of start and end indices for each transformer + token and the corresponding embedding + document_tokenized: spacy tokens + prep_offsets: Indices and offsets to match the preprocessed text to the + original document + Returns: + Embeddings for each spacy token in the tokenized document. + """ + + embeddings = defaultdict(list) + + for index_start, index_end, transformer_emb in transformer_embeddings: + + if prep_offsets is not None: + index_start = self._add_offset(index_start, prep_offsets) + index_end = self._add_offset(index_end, prep_offsets) + + span = document_tokenized.char_span( + index_start, index_end, alignment_mode="expand" + ) + if span is not None: + # if a transformer token include multiple spacy tokens, the spacy + # tokens get the same transformer embedding. + for token in span: + embeddings[token.i].extend(transformer_emb) + for key, values in embeddings.items(): + embeddings[key] = np.array(values).mean(0).tolist() + return list(embeddings.values()) + + def _add_offset(self, idx: int, offsets: np.ndarray) -> int: + """ + Adds offset to index according to the offsets array. + + Args: + idx: index to transform + offsets: indices and the corresponding offsets + Returns: + Index customized according to the offset + """ + return idx + np.sum(offsets[np.where(offsets[:, 0] <= idx)][:, 1]) + + def _get_transformer_embeddings( + self, + document: str, + idx_document: int, + idx_offset: int = 0, + ) -> List[List[Tuple[int, int, List[List[float]]]]]: + """ + Calculates embeddings for the given document using a transformer model. + First, the corresponding transformer tokens are computed. The next steps + computes the embeddings. With each embedding the indices of the according + chars are returned. idx_offset is used to return the correct indices if the + document has been split. 
+ + Args: + document: plain document text + idx_offset: offset if the document has been splitted + Returns: + Start and end index for each transformer token and the calculated + embedding + """ + encoded = self.transformer_tokenizer(document, return_tensors="pt").to( + self.device + ) + tokens = encoded.encodings[0] + + # fallback if the number of tokens is still too big + if len(tokens) > self.model.config.max_position_embeddings: + if WarningType.DOCUMENT_IS_SPLITTED.value in self._warnings: + self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value].append( + idx_document + ) + else: + self._warnings[WarningType.DOCUMENT_IS_SPLITTED.value] = [idx_document] + + token_embs = [] + for doc_part, additional_idx_offset in self._split_document( + document, len(tokens) + ): + token_embs.extend( + self._get_transformer_embeddings( + doc_part, idx_document, idx_offset + additional_idx_offset + ) + ) + return token_embs + + with torch.no_grad(): + output = self.model(**encoded) + + # Get all hidden states + states = output.hidden_states + # Stack and sum last four layers + layers = [-4, -3, -2, -1] + output = torch.stack([states[i] for i in layers]).sum(0).squeeze() + + token_embeddings = [] + # 1 and -1 are [CLS] tokens, and other tokens can be ##subwords + for word_idx in set(tokens.word_ids[1:-1]): + index_begin, index_end = tokens.word_to_chars(word_idx) + token_ids_word = np.where(np.array(encoded.word_ids()) == word_idx) + # Only select the tokens that constitute the requested word + word_tokens_output = output[token_ids_word] + token_embeddings.append( + [ + index_begin + idx_offset, + index_end + idx_offset, + word_tokens_output.tolist(), + ] + ) + return token_embeddings + + def _estimate_token_number(self, document: str) -> int: + """ + Estimates the number of tokens which are generated by the transformer model. + It is based on the rule of thumb that per token 3 subtokens are created by + the transformer tokenizer. 
Tokens are created by splitting at every + special and whitespace character. + Special Characters are handled seperately according to the assumption that each + special character is treated as a token by the transformer tokenizer. + + Args: + document: plain text document + Returns: + Estimation for the number of transformer tokens included in the document + """ + avg_subtokens_per_token = 3 + number_word_tokens = len(re.findall(r"\[NL\]|\w+", document)) + number_special_characters = len(re.sub(r"[\w\s]+", "", document)) + return avg_subtokens_per_token * number_word_tokens + number_special_characters + + def _split_document( + self, document: str, estimated_tokens: int + ) -> Iterator[Tuple[str, int]]: + """ + Splits the documens into subparts, according to the model's max length and the + number of estimated tokens. + + Args: + document: plain text document + estimated_tokens: estimation for the token number + Returns: + Yields subpart of the document, splitted depending on max model length and + estimated number of tokens + """ + # the regular expression matches the special token [NL], any word consiting of + # numbers and chars and single characters which are no whitespace or word + # character + token_spans = [ + token.span() for token in re.finditer(r"\[NL\]|\w+|[^\w\s]+?", document) + ] + split_into = ( + round(estimated_tokens / self.model.config.max_position_embeddings) + 1 + ) + len_part = math.ceil(len(token_spans) / split_into) + + prev_split_idx = 0 + for i in range(split_into): + current_split_idx = min( + len(document), + token_spans[min((i + 1) * len_part, len(token_spans) - 1)][1], + ) + yield document[prev_split_idx:current_split_idx], prev_split_idx + prev_split_idx = current_split_idx diff --git a/src/embedders/extraction/count_based.py b/src/embedders/extraction/count_based.py new file mode 100644 index 0000000..a6e243a --- /dev/null +++ b/src/embedders/extraction/count_based.py @@ -0,0 +1,47 @@ +from typing import List, Generator, Union +from 
sklearn.feature_extraction.text import CountVectorizer +from embedders import util +from spacy.tokens.doc import Doc + +from embedders.extraction import TokenEmbedder + + +class BagOfCharsTokenEmbedder(TokenEmbedder): + """Embeds documents using plain Bag of Characters approach. + + Args: + language_code (str): Name of the spaCy language model + precomputed_docs (bool, optional): If you have a large text corpus, it might make sense to precompute the data and input tokenized spaCy documents. Defaults to False. + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + """ + + def __init__( + self, + language_code: str, + precomputed_docs: bool = False, + batch_size: int = 128, + **kwargs + ): + super().__init__(language_code, precomputed_docs, batch_size) + self.model = CountVectorizer(analyzer="char", min_df=0.01, **kwargs) + + def _encode( + self, documents: List[Union[str, Doc]], fit_model: bool + ) -> Generator[List[List[List[int]]], None, None]: + if fit_model: + if self.preloaded: + self.model.fit([doc.text for doc in documents]) + else: + self.model.fit(documents) + + for documents_batch in util.batch(documents, self.batch_size): + documents_batch_embedded = [] + for doc in documents_batch: + documents_batch_embedded.append( + self.model.transform( + [tok.text for tok in self._get_tokenized_document(doc)] + ) + .toarray() + .tolist() + ) + yield documents_batch_embedded diff --git a/src/embedders/extraction/reduce.py b/src/embedders/extraction/reduce.py new file mode 100644 index 0000000..76df602 --- /dev/null +++ b/src/embedders/extraction/reduce.py @@ -0,0 +1,64 @@ +from typing import List, Generator, Union +import numpy as np +from embedders import PCAReducer, util + + +class PCATokenReducer(PCAReducer): + def __init__(self, embedder, **kwargs): + super().__init__(embedder=embedder, **kwargs) + self.nlp = embedder.nlp + + def _transform( + self, embeddings: List[List[List[Union[int, float]]]] + ) -> 
List[List[List[Union[float, int]]]]: + batch_concatenated = np.concatenate(embeddings) + start_idx = 0 + batch_unsqueezed = [] + for length in [len(embedding) for embedding in embeddings]: + end_idx = start_idx + length + batch_reduced = self.reducer.transform( + batch_concatenated[start_idx:end_idx] + ) + batch_unsqueezed.append(batch_reduced.tolist()) + start_idx = end_idx + return batch_unsqueezed + + def _reduce( + self, documents, fit_model, fit_after_n_batches + ) -> Generator[List[List[List[Union[float, int]]]], None, None]: + if fit_model: + embeddings_training = [] + num_batches = util.num_batches(documents, self.embedder.batch_size) + fit_after_n_batches = min(num_batches, fit_after_n_batches) - 1 + for batch_idx, batch in enumerate( + self.embedder.fit_transform(documents, as_generator=True) + ): + if batch_idx <= fit_after_n_batches: + embeddings_training.append(batch) + + if batch_idx == fit_after_n_batches: + embeddings_training_flattened = [] + for batch_training in embeddings_training: + embeddings_training_flattened.extend( + np.concatenate(batch_training).tolist() + ) + embeddings_training_flattened = np.array( + embeddings_training_flattened + ) + if ( + embeddings_training_flattened.shape[1] + < self.reducer.n_components + and self.autocorrect_n_components + ): + self.reducer.n_components = embeddings_training_flattened.shape[ + 1 + ] + self.reducer.fit(embeddings_training_flattened) + + for batch_training in embeddings_training: + yield self._transform(batch_training) + if batch_idx > fit_after_n_batches: + yield self._transform(batch) + else: + embeddings = self.embedder.transform(documents) + yield self._transform(embeddings) diff --git a/src/embedders/samples/__init__.py b/src/embedders/samples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/embedders/samples/clickbait.py b/src/embedders/samples/clickbait.py new file mode 100644 index 0000000..e1914b1 --- /dev/null +++ b/src/embedders/samples/clickbait.py @@ -0,0 
+1,106 @@ +DATA = [ + "UK guinea pig farm to close after owner's family grave robbed", + "18 Sweet Pumpkin Treats You Won't Believe Are Healthy", + 'A Guy Just Did The Most Epic "Cha Cha Slide" Dance Ever', + "Premium gas discounted for a few hours", + "Sanctions on US products introduced by Brazil", + "IPhone sales exceed BlackBerry", + "Administration Seeks to Regulate Derivatives", + "21 Life-Changing Products That Can Actually Make Your Skin Better", + "US raids Iran 'liaison office', Russia says it is unacceptable", + "US House of Representatives rejects bail out bill in vote", + "23 Ways To Give Your Heart To Your Valentine", + "Signs You Grew Up In Southern California", + "5 killed in return bus trip from marching band competition", + "Here's Definitive Proof That Leonardo DiCaprio Is Immortal", + "Signs of Possible Deal in Pakistan Turmoil", + "14 killed in Russian bus-truck collision", + "In Icy Kentucky, Thousands Are Still Without Power", + "Beans Memes Is The Only Twitter Account That Actually Matters", + "Couples Who Prove Opposites Attract", + "Ball State Upsets Tennessee in First Round", + "India and U.S.A. 
work toward nuclear fuel agreement", + "16 Tweets That Sum Up The Lengths You Would Go To Avoid Other People", + "Gas Is Up, but Drivers May Look the Other Way", + "Which Taylor Swift Track Should Be Your Personal Theme Song", + "Bruno Mars Might Headline Super Bowl 50", + "US clinic plans first face transplant", + "South Korea says North Korea will test more nuclear bombs", + "National Hockey League news: February 28, 2008", + "41 Victoria's Secret Models Show What They Look Like Without Makeup", + "North Korean military fires artillery on populated South Korean island", + "Pilot killed as Su-25 military jet explodes near Vladivostok", + "Conservatives Map Strategies on Court Fight", + "Aziz Ansari's Instagram Post About His Dad Will Make You Cry", + "Who Is Your Dad Actually", + "Climate Research That Might Not Help", + "11 Steamy Lyrics That Will Make See You Selena Gomez In A New Light", + "What Percent Vegan Are You", + "13 Misogynistic Phrases That Need To Die", + "491 Scoreless Minutes Come to an End", + "Oil spewing from crack in seafloor of Gulf of Mexico was fifty feet from Deepwater Horizon well", + "Saudis Delay Local Elections by 2 Years", + "7 Excellent Deals You Can Get This Weekend", + "U.N. Warns of Refugee Crisis in Gaza Strip", + "Executives from IT industry focus on 10-year anniversary of Microsoft Research Asia", + "Independent Member of Australian Parliament calls for better indigenous policy", + "Ayesha Curry Has Sparked A Debate About How Women Dress", + "FCC requires VoIP providers to have 911 service", + "17 Images That Will Only Make Sense To People Obsessed With High Heels", + "Australian rules football: 2010 Gippsland Football League round 1 - Wonthaggi v Leongatha", + "Two Killed in Violence on Gaza Border", + "Stimulus Tour Takes Obama to New Blue States", + "12 Bizarre Christmas Traditions From Around The World", + "9 Differences Between Hanging Out With Your New Friend Vs. 
Your Best Friend", + "Priest Reportedly Suspended For Riding A Hoverboard During Mass", + "15 Songs You Loved (But Forgot About) From 10 Years Ago", + "People Are Using The Hashtag #BurritoSelfie And It Is As Glorious As You'd Imagine", + "This Taco Recipe Will Sexually Awaken Your Taste Buds", + "16 Times Chris Martin Was Really Just An Excited Puppy", + "19 Gorgeous Finnish Baby Names That Will Make You Broody", + "Mugabe spokeperson tells critics to 'go hang'", + "Night Owls Become Early Risers", + "I Tested Pinterest Mug Recipes To See If They Actually Taste Like Food", + "Former U.S. President Clinton stumps for Obama, Franken in Minneapolis", + "Rangers Honor Andy Bathgate and Harry Howell", + "Which Lola From 'Kalyeserye' Are You Based On These Really Hard Questions", + "West African cholera claims more than 500 lives, more deaths feared", + "17 Of The Most Beautifully Illustrated Picture Books In 2015", + "Rwandan genocide investigations to be completed by end of July", + "Mandela discharged from hospital", + "15 Insanely Adorable Pins You Never Knew You Needed", + "Internet Companies and Ad Agencies Find Some Common Ground", + "NBA star Gilbert Arenas pleads guilty to gun possession, could face six months in prison", + "Clothing Makers Exceed Quarterly Expectations", + "22 Mesmerizing, Mundane Photos Of A Day In The Life Of Darth Vader", + "19 Texts That Are Way Too Real For Anyone Who's A Little Bit Greedy", + "What Percentage Do You Have Of Winning The Royal Rumble", + "Students Stand When Called Upon, and When Not", + "Justices Retain Oversight by U.S. 
on Voting", + "Canadian film academy explains lack of Genie nomination for Juno", + "National Academy of Sciences recommends manned Hubble repair", + "18 Things You Didn't Know About Cold Callers", + "When You Miss Your Friend", + "Robin Cook dead after collapsing", + "12 hurt in San Luis de La Balsa tourist bus accident", + "Wreckage of plane thought to be missing Air France flight found in Atlantic", + 'Which Pink Lady From "Grease" Should Be Your BFF', + "Attack on mosque kills 30 in Rawalpindi, Pakistan", + "Russian Uranium Sale to U.S. Is Planned", + "Billions Withdrawn Before Madoff Arrest", + "Vestas occupation continues; left-wing political parties voice support", + "Television appeal for 1984 murder in Bath, England", + "Myanmar Dissident Testifies at Trial", + "Independent presidential candidates debate this weekend", + "Blake Lively And Ryan Reynolds Continue To Be Actual Relationship Goals", + "F.B.I. Lab Houses Growing Database of DNA Profiles", + "18 Differences Between Snow Days In Canada And America", + "The 27 Most Annoying Things Every Bartender Has To Endure", + "U.S. 
Tells Chrysler to Prepare for Bankruptcy Filing", + "Thieves steal £40 million from London jeweller", + "Obama on Spot Over a Benefit to Gay Couples", +] + + +def get_sample_data(): + return DATA diff --git a/src/embedders/util.py b/src/embedders/util.py new file mode 100644 index 0000000..4b8c1b2 --- /dev/null +++ b/src/embedders/util.py @@ -0,0 +1,12 @@ +from typing import Any, Generator, List +import numpy as np + + +def batch(documents: List[Any], batch_size: int) -> Generator[List[Any], None, None]: + length = len(documents) + for idx in range(0, length, batch_size): + yield documents[idx : min(idx + batch_size, length)] + + +def num_batches(documents: List[Any], batch_size: int) -> int: + return int(np.ceil(len(documents) / batch_size)) diff --git a/src/util/__init__.py b/src/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/util/config_handler.py b/src/util/config_handler.py similarity index 97% rename from util/config_handler.py rename to src/util/config_handler.py index a7dfb04..f9d1a84 100644 --- a/util/config_handler.py +++ b/src/util/config_handler.py @@ -1,7 +1,7 @@ from typing import Dict, Any, Optional, Union import requests import time -from util import daemon +from src.util import daemon __config = None diff --git a/util/daemon.py b/src/util/daemon.py similarity index 100% rename from util/daemon.py rename to src/util/daemon.py diff --git a/util/decorator.py b/src/util/decorator.py similarity index 100% rename from util/decorator.py rename to src/util/decorator.py diff --git a/util/embedders.py b/src/util/embedders.py similarity index 88% rename from util/embedders.py rename to src/util/embedders.py index e5758ea..cd3d41a 100644 --- a/util/embedders.py +++ b/src/util/embedders.py @@ -1,19 +1,19 @@ from typing import Optional -from embedders.classification.count_based import ( +from src.embedders.classification.count_based import ( BagOfCharsSentenceEmbedder, BagOfWordsSentenceEmbedder, TfidfSentenceEmbedder, ) -from 
embedders.classification.contextual import ( +from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, CohereSentenceEmbedder, ) -from embedders.extraction.count_based import BagOfCharsTokenEmbedder -from embedders.extraction.contextual import TransformerTokenEmbedder -from embedders.classification.reduce import PCASentenceReducer -from embedders.extraction.reduce import PCATokenReducer -from embedders import Transformer +from src.embedders.extraction.count_based import BagOfCharsTokenEmbedder +from src.embedders.extraction.contextual import TransformerTokenEmbedder +from src.embedders.classification.reduce import PCASentenceReducer +from src.embedders.extraction.reduce import PCATokenReducer +from src.embedders import Transformer from submodules.model import enums from submodules.model.business_objects import record diff --git a/util/notification.py b/src/util/notification.py similarity index 96% rename from util/notification.py rename to src/util/notification.py index aa4ed71..2f500e3 100644 --- a/util/notification.py +++ b/src/util/notification.py @@ -1,7 +1,7 @@ import requests import os -from embedders.enums import WarningType +from src.embedders.enums import WarningType from submodules.model.business_objects import project diff --git a/util/request_util.py b/src/util/request_util.py similarity index 100% rename from util/request_util.py rename to src/util/request_util.py From 30549d912ae449119e40241f9b07f9d2dbd39ded Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 18 Jul 2025 12:12:34 +0000 Subject: [PATCH 7/9] ci: update requirements and Dockerfile --- Dockerfile | 2 +- dev.Dockerfile | 2 +- gpu.Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 045935e..c31e9fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:parent-image-updates-torch-cpu +FROM 
registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cpu WORKDIR /program diff --git a/dev.Dockerfile b/dev.Dockerfile index ee09b36..278a7c5 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:parent-image-updates-torch-cpu +FROM registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cpu WORKDIR /app diff --git a/gpu.Dockerfile b/gpu.Dockerfile index 89ff713..3247790 100644 --- a/gpu.Dockerfile +++ b/gpu.Dockerfile @@ -1,4 +1,4 @@ -FROM kernai/refinery-parent-images:parent-image-updates-torch-cuda +FROM registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cuda WORKDIR /program From 2da173902b0373fefb60b7c8721ec04b37498f99 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Fri, 18 Jul 2025 12:13:28 +0000 Subject: [PATCH 8/9] ci: update requirements.txt --- gpu-requirements.txt | 199 ++++++++++++++++++++++--------------------- requirements.txt | 46 ++-------- 2 files changed, 106 insertions(+), 139 deletions(-) diff --git a/gpu-requirements.txt b/gpu-requirements.txt index 0ca0cd0..b01f324 100644 --- a/gpu-requirements.txt +++ b/gpu-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=gpu-requirements.txt requirements/gpu-requirements.in # ---extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://download.pytorch.org/whl/cu113 aiohappyeyeballs==2.6.1 # via aiohttp @@ -14,30 +14,29 @@ aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # pydantic anyio==4.9.0 # via - # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpx + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # starlette argon2-cffi==25.1.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # minio argon2-cffi-bindings==21.2.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # argon2-cffi attrs==25.3.0 # via aiohttp blis==0.7.11 # via thinc boto3==1.39.6 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt botocore==1.39.6 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -47,28 +46,24 @@ catalogue==2.0.10 # thinc certifi==2025.7.14 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore - # httpx + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # minio # requests cffi==1.17.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # requests click==8.2.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # typer # uvicorn cloudpathlib==0.21.1 # via weasel -cohere==5.16.1 - # via embedders confection==0.1.5 # via # thinc @@ -78,15 +73,11 @@ cymem==2.0.11 # preshed # spacy # thinc -embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade - # via -r requirements/gpu-requirements.in fastapi==0.115.2 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt -fastavro==1.11.1 - # via cohere + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt filelock==3.18.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # torch # transformers @@ -96,52 +87,44 @@ frozenlist==1.7.0 # aiosignal fsspec==2025.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # torch greenlet==3.2.3 # via sqlalchemy h11==0.16.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # uvicorn hf-xet==1.1.5 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via cohere -httpx-sse==0.4.0 - # via cohere huggingface-hub==0.33.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # sentence-transformers # tokenizers # transformers idna==3.10 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # anyio - # httpx # requests # yarl jinja2==3.1.6 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # spacy # torch jmespath==1.0.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # boto3 # botocore joblib==1.5.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # scikit-learn # scikit-optimize langcodes==3.5.0 @@ -154,15 +137,15 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # jinja2 mdurl==0.1.2 # via markdown-it-py minio==7.2.15 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt mpmath==1.3.0 # via - # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # sympy multidict==6.6.3 # via @@ -175,13 +158,12 @@ murmurhash==1.0.13 # thinc networkx==3.5 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # torch numpy==1.23.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # blis - # embedders # pandas # scikit-learn # scikit-optimize @@ -190,20 +172,55 @@ numpy==1.23.4 # thinc # torchvision # transformers -openai==0.28.1 +nvidia-cublas-cu12==12.6.4.1 # via - # -r requirements/gpu-requirements.in - # embedders + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 + # via torch +nvidia-cuda-runtime-cu12==12.6.77 + # via torch +nvidia-cudnn-cu12==9.5.1.17 + # via torch +nvidia-cufft-cu12==11.3.0.4 + # via torch +nvidia-cufile-cu12==1.11.1.6 + # via torch +nvidia-curand-cu12==10.3.7.77 + # via torch +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 + # via torch +nvidia-nccl-cu12==2.26.2 + # via torch +nvidia-nvjitlink-cu12==12.6.85 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 + # via 
torch +openai==0.28.1 + # via -r requirements/gpu-requirements.in packaging==25.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # spacy # thinc # transformers # weasel pandas==1.5.1 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt pillow==11.3.0 # via # sentence-transformers @@ -217,23 +234,22 @@ propcache==0.3.2 # aiohttp # yarl psycopg2-binary==2.9.9 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt pyaml==25.7.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # scikit-optimize pycparser==2.22 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # cffi pycryptodome==3.23.0 # via - # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # minio pydantic==2.7.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # confection # fastapi # spacy @@ -241,34 +257,32 @@ pydantic==2.7.4 # weasel pydantic-core==2.18.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # pydantic pygments==2.19.2 # via rich python-dateutil==2.9.0.post0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # botocore # pandas pytz==2025.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # pandas pyyaml==6.0.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # pyaml # transformers regex==2024.11.6 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # transformers requests==2.32.4 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # openai # spacy @@ -278,52 +292,47 @@ rich==14.0.0 # via typer s3transfer==0.13.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # boto3 safetensors==0.5.3 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # transformers scikit-learn==1.5.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # scikit-optimize # sentence-transformers 
scikit-optimize==0.9.0 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt scipy==1.13.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # scikit-learn # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via - # -r requirements/gpu-requirements.in - # embedders + # via -r requirements/gpu-requirements.in shellingham==1.5.4 # via typer six==1.17.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # python-dateutil smart-open==7.3.0.post1 # via weasel sniffio==1.3.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # anyio spacy==3.7.5 - # via - # -r requirements/gpu-requirements.in - # embedders + # via -r requirements/gpu-requirements.in spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via -r 
/home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt srsly==2.5.1 # via # confection @@ -332,35 +341,32 @@ srsly==2.5.1 # weasel starlette==0.40.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # fastapi sympy==1.14.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # torch thinc==8.2.5 # via spacy threadpoolctl==3.6.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # scikit-learn tokenizers==0.21.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # transformers -torch==2.7.1+cpu +torch==2.7.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # sentence-transformers # torchvision 
-torchvision==0.22.1+cpu +torchvision==0.22.1 # via -r requirements/gpu-requirements.in tqdm==4.67.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # huggingface-hub # openai # sentence-transformers @@ -368,21 +374,19 @@ tqdm==4.67.1 # transformers transformers==4.53.2 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # sentence-transformers +triton==3.3.1 + # via torch typer==0.16.0 # via # spacy # weasel -types-requests==2.32.4.20250611 - # via cohere typing-extensions==4.14.1 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # aiosignal # anyio - # cohere # fastapi # huggingface-hub # minio @@ -393,13 +397,12 @@ typing-extensions==4.14.1 # typer urllib3==2.5.0 # via - # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt # botocore # minio # requests - # types-requests uvicorn==0.35.0 - # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt + # via 
-r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cuda-requirements.txt wasabi==1.1.3 # via # spacy diff --git a/requirements.txt b/requirements.txt index 826e6b5..f54effb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,6 @@ annotated-types==0.7.0 anyio==4.9.0 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpx # starlette argon2-cffi==25.1.0 # via @@ -48,8 +47,6 @@ catalogue==2.0.10 certifi==2025.7.14 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore - # httpx # minio # requests cffi==1.17.1 @@ -67,8 +64,6 @@ click==8.2.1 # uvicorn cloudpathlib==0.21.1 # via weasel -cohere==5.16.1 - # via embedders confection==0.1.5 # via # thinc @@ -78,12 +73,8 @@ cymem==2.0.11 # preshed # spacy # thinc -embedders @ git+https://github.com/code-kern-ai/embedders@python-upgrade - # via -r requirements/requirements.in fastapi==0.115.2 # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt -fastavro==1.11.1 - # via cohere filelock==3.18.0 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt @@ -104,18 +95,11 @@ greenlet==3.2.3 h11==0.16.0 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # httpcore # uvicorn hf-xet==1.1.5 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # huggingface-hub -httpcore==1.0.9 - # via httpx -httpx==0.28.1 - # via cohere -httpx-sse==0.4.0 - # via 
cohere huggingface-hub==0.33.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt @@ -126,7 +110,6 @@ idna==3.10 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio - # httpx # requests # yarl jinja2==3.1.6 @@ -181,7 +164,6 @@ numpy==1.23.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # blis - # embedders # pandas # scikit-learn # scikit-optimize @@ -191,9 +173,7 @@ numpy==1.23.4 # torchvision # transformers openai==0.28.1 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in packaging==25.0 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt @@ -233,7 +213,6 @@ pycryptodome==3.23.0 pydantic==2.7.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere # confection # fastapi # spacy @@ -242,7 +221,6 @@ pydantic==2.7.4 pydantic-core==2.18.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere # pydantic pygments==2.19.2 # via rich @@ -268,7 +246,6 @@ regex==2024.11.6 requests==2.32.4 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere # huggingface-hub # openai # spacy @@ -287,7 +264,6 @@ safetensors==0.5.3 scikit-learn==1.5.2 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # 
embedders # scikit-optimize # sentence-transformers scikit-optimize==0.9.0 @@ -299,9 +275,7 @@ scipy==1.13.1 # scikit-optimize # sentence-transformers sentence-transformers==5.0.0 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in shellingham==1.5.4 # via typer six==1.17.0 @@ -315,9 +289,7 @@ sniffio==1.3.1 # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # anyio spacy==3.7.5 - # via - # -r requirements/requirements.in - # embedders + # via -r requirements/requirements.in spacy-legacy==3.0.12 # via spacy spacy-loggers==1.0.5 @@ -347,20 +319,17 @@ threadpoolctl==3.6.0 tokenizers==0.21.2 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # cohere # transformers -torch==2.7.1 +torch==2.7.1+cpu # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders # sentence-transformers # torchvision -torchvision==0.22.1 +torchvision==0.22.1+cpu # via -r requirements/requirements.in tqdm==4.67.1 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders # huggingface-hub # openai # sentence-transformers @@ -369,20 +338,16 @@ tqdm==4.67.1 transformers==4.53.2 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt - # embedders # sentence-transformers typer==0.16.0 # via # spacy # weasel -types-requests==2.32.4.20250611 - # via cohere typing-extensions==4.14.1 # via # -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt # aiosignal # anyio 
- # cohere # fastapi # huggingface-hub # minio @@ -397,7 +362,6 @@ urllib3==2.5.0 # botocore # minio # requests - # types-requests uvicorn==0.35.0 # via -r /home/runner/work/refinery-submodule-parent-images/refinery-submodule-parent-images/refinery-embedder/requirements/torch-cpu-requirements.txt wasabi==1.1.3 From 2eaf04b281fdab69d939c15cf07dd50ffbff15f8 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Mon, 21 Jul 2025 10:21:06 +0000 Subject: [PATCH 9/9] ci(pi): kernai/refinery-parent-images:v1.23.0-torch-cpu --- Dockerfile | 2 +- dev.Dockerfile | 2 +- gpu.Dockerfile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c31e9fa..f0f24e0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cpu +FROM kernai/refinery-parent-images:v1.23.0-torch-cpu WORKDIR /program diff --git a/dev.Dockerfile b/dev.Dockerfile index 278a7c5..e3add5a 100644 --- a/dev.Dockerfile +++ b/dev.Dockerfile @@ -1,4 +1,4 @@ -FROM registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cpu +FROM kernai/refinery-parent-images:v1.23.0-torch-cpu WORKDIR /app diff --git a/gpu.Dockerfile b/gpu.Dockerfile index 3247790..2132274 100644 --- a/gpu.Dockerfile +++ b/gpu.Dockerfile @@ -1,4 +1,4 @@ -FROM registry.dev.kern.ai/code-kern-ai/refinery-parent-images:parent-image-updates-torch-cuda +FROM kernai/refinery-parent-images:v1.23.0-torch-cuda WORKDIR /program