feat: add containers with litserve servers default #36

Open · wants to merge 2 commits into main
@@ -0,0 +1,40 @@
#---
# name: transformers
# config: config.py
# group: llm
# depends: [pytorch, torchvision, huggingface_hub, rust]
# test: [test_version.py, huggingface-benchmark.py]
# docs: docs.md
# notes: for quantization support in Transformers, use the bitsandbytes, AutoGPTQ, or AutoAWQ containers.
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG TRANSFORMERS_PACKAGE=transformers \
    TRANSFORMERS_VERSION

# if you want optimum[exporters,onnxruntime] see the optimum package

RUN pip3 install --no-cache-dir --verbose accelerate && \
    pip3 install --no-cache-dir --verbose sentencepiece && \
    pip3 install --no-cache-dir --verbose optimum && \
    \
    # install from pypi, git, etc. (sometimes another version gets installed)
    pip3 uninstall -y transformers && \
    \
    echo "Installing transformers $TRANSFORMERS_VERSION (from $TRANSFORMERS_PACKAGE)" && \
    pip3 install --no-cache-dir --verbose ${TRANSFORMERS_PACKAGE} && \
    \
    # patch for "/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py", line 118
    # AttributeError: module 'torch.distributed' has no attribute 'is_initialized'
    PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \
    sed -i \
        -e 's|torch.distributed.is_initialized|torch.distributed.is_available|g' \
        ${PYTHON_ROOT}/transformers/modeling_utils.py

# add benchmark script
COPY huggingface-benchmark.py /usr/local/bin

# make sure it loads
RUN pip3 show transformers \
&& python3 -c 'import transformers; print(transformers.__version__)'
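Beyond the version print above, a quick functional check could be run inside the built container. This is only a sketch; the model name is an arbitrary example and is not something the container ships with.

# Hypothetical smoke test for the transformers container (not part of this PR).
# Runs a tiny text-classification pipeline to confirm the patched install works.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",  # example model only
)
print(classifier("The container build looks good."))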
@@ -0,0 +1,22 @@
#---
# name: vllm
# group: vlm
# config: config.py
# depends: [pytorch, torchvision, torchaudio, transformers, triton, xformers]
# requires: '>=34.1.0'
# test: test.py
# notes: https://github.com/vllm-project/vllm
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG VLLM_VERSION \
    XGRAMMAR_VERSION \
    FORCE_BUILD=off

RUN apt-get update -y && apt-get install -y libnuma-dev \
    libsndfile1 libsndfile1-dev libprotobuf-dev libsm6 libxext6 libgl1

COPY build.sh install.sh patches /tmp/vllm/

RUN /tmp/vllm/install.sh || /tmp/vllm/build.sh
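install.sh and build.sh are not shown in this diff. As a rough smoke test for the resulting image, something like the following could be used; the model id is an arbitrary small example and the calls are vLLM's standard offline-inference API.

# Hypothetical vLLM smoke test (not part of this PR).
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # small example model; swap in a real one
params = SamplingParams(temperature=0.8, max_tokens=32)
outputs = llm.generate(["Hello, my name is"], params)
for out in outputs:
    print(out.outputs[0].text)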
39 changes: 39 additions & 0 deletions solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/batch.py
@@ -0,0 +1,39 @@
import litserve as ls
from transformers import pipeline

class HuggingFaceLitAPI(ls.LitAPI):
    def setup(self, device):
        # Load the model and tokenizer from Hugging Face Hub
        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
        # You can replace the model name with any model from the Hugging Face Hub
        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        self.pipeline = pipeline("text-classification", model=model_name, device=device)

    def decode_request(self, request):
        # Extract text from the request
        # This assumes the request payload is of the form: {'text': 'Your input text here'}
        return request["text"]

    def batch(self, inputs):
        # Return the batched input list
        return inputs

    def predict(self, texts):
        # Use the loaded pipeline to perform inference
        return self.pipeline(texts)

    def unbatch(self, outputs):
        # Unbatch the model output
        return outputs

    def encode_response(self, output):
        # Format the output from the model to send as a response
        # This example sends back the label and score of the prediction
        return {"label": output["label"], "score": output["score"]}

if __name__ == "__main__":
    # Create an instance of your API
    api = HuggingFaceLitAPI()
    # Start the server, specifying the batching parameters and port
    server = ls.LitServer(api, accelerator="cuda", max_batch_size=16, workers_per_device=4, batch_timeout=0.01)
    server.run(port=8000)
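Since this server batches up to 16 requests within a 10 ms window, the batch path only kicks in when several requests arrive concurrently. A minimal sketch of such a client follows; the /predict route and {'text': ...} payload are taken from the server above, everything else is illustrative.

# Hypothetical concurrent client to exercise batching (not part of this PR).
import requests
from concurrent.futures import ThreadPoolExecutor

URL = "http://127.0.0.1:8000/predict"
texts = [f"Sample review number {i}: the service was great." for i in range(16)]

def classify(text):
    return requests.post(URL, json={"text": text}).json()

with ThreadPoolExecutor(max_workers=16) as pool:
    for result in pool.map(classify, texts):
        print(result)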
@@ -0,0 +1,15 @@
import requests

def test_server(text):
    # API endpoint URL
    url = "http://127.0.0.1:8000/predict"
    # Request payload
    payload = {"text": text}
    # POST request to the server
    response = requests.post(url, json=payload)
    # Print the response from the server
    print(response.json())

if __name__ == "__main__":
    sample_text = "I love machine learning. My experience with LitServe has been amazing!"
    test_server(sample_text)
@@ -0,0 +1,15 @@
accelerate
bitsandbytes
decord
litserve
openai
Pillow
qwen-vl-utils
streamlit
torch==2.4.0
torchvision==0.19.0
git+https://github.com/huggingface/transformers.git

# Optional dependency
# Uncomment the following line if you need flash-attn
# flash-attn==2.6.1
@@ -0,0 +1,31 @@
import litserve as ls
from transformers import pipeline

class HuggingFaceLitAPI(ls.LitAPI):
    def setup(self, device):
        # Load the model and tokenizer from Hugging Face Hub
        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
        # You can replace the model name with any model from the Hugging Face Hub
        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        # LitServe passes a device identifier such as "cuda:0" or "cpu", which the pipeline accepts directly
        self.pipeline = pipeline("text-classification", model=model_name, device=device)

    def decode_request(self, request):
        # Extract text from the request
        # This assumes the request payload is of the form: {'text': 'Your input text here'}
        return request["text"]

    def predict(self, text):
        # Use the loaded pipeline to perform inference
        return self.pipeline(text)

    def encode_response(self, output):
        # Format the output from the model to send as a response
        # This example sends back the label and score of the prediction
        return {"label": output[0]["label"], "score": output[0]["score"]}

if __name__ == "__main__":
    # Create an instance of your API
    api = HuggingFaceLitAPI()
    # Start the server, specifying the port
    server = ls.LitServer(api, accelerator="cuda")
    server.run(port=8000)
35 changes: 35 additions & 0 deletions solo_server/examples/merged/qwen2.5/server.py
@@ -0,0 +1,35 @@
from fastapi import HTTPException
from stock_researcher import research_financials, research_news, stock_researcher, load_model
import litserve as ls

model_id = "Qwen/Qwen2.5-7B-Instruct"

class StockAnalyst(ls.LitAPI):
    def setup(self, device):
        # Using a self-hosted open-source model with an OpenAI API compatible interface
        self.model = model_id

    def decode_request(self, request: dict):
        # Query containing the stock name to research
        return request["query"]

    def predict(self, query: str):
        try:
            # 1. Find financial info
            messages, financials = research_financials(query, self.model)
            # 2. Research news about the stocks
            tool_calls, tool_final_result = research_news(financials, query, self.model)
            # 3. Analyze the data
            yield from stock_researcher(tool_final_result, tool_calls, messages, self.model)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Stock analyst ran into an error: {e}")

    def encode_response(self, response):
        for chunk in response:
            yield chunk

if __name__ == "__main__":
    load_model(model_id)
    api = StockAnalyst()
    server = ls.LitServer(api, workers_per_device=8, accelerator="cpu", timeout=False, stream=True)
    server.run(port=8888)
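Because this server is started with stream=True on port 8888, a client has to read the response incrementally rather than calling response.json() once. A minimal sketch follows; the /predict route and the {'query': ...} payload are assumed from the server above.

# Hypothetical streaming client for the stock analyst server (not part of this PR).
import requests

resp = requests.post(
    "http://127.0.0.1:8888/predict",
    json={"query": "NVIDIA"},  # example stock query
    stream=True,
)
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    if chunk:
        print(chunk, end="", flush=True)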
23 changes: 23 additions & 0 deletions solo_server/examples/solo.yaml
@@ -0,0 +1,23 @@
domain: "education"
hardware:
  accelerator: "cpu"
  workers_per_device: 8
  timeout: false
  stream: true
default-llm: "gpt-4"
models:
  interest_tags:
    - fast
    - balanced
    - innovative
  prompt_seed: "Model prompt seed based on interests: fast, balanced, innovative"
docker:
  image: "your_docker_image"
  port: 5070
huggingface:
  cache_dir: "huggingface"
  token: "YOUR_HUGGINGFACE_API_TOKEN"
soloconfig:
  path: "soloconfig.py"
  modelpath: "models"
  storage_limit_gb: 20
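For reference, the hardware block mirrors the LitServer keyword arguments used in the Qwen2.5 server above. A small, purely illustrative loader that extracts them might look like the sketch below; the file path and the key-to-argument mapping are assumptions, not something this PR defines.

# Hypothetical loader mapping solo.yaml's hardware section onto LitServer
# keyword arguments (not part of this PR; keys assumed to mirror litserve's).
import yaml

with open("solo.yaml") as f:  # example path
    cfg = yaml.safe_load(f)

hw = cfg["hardware"]
server_kwargs = dict(
    accelerator=hw["accelerator"],
    workers_per_device=hw["workers_per_device"],
    timeout=hw["timeout"],
    stream=hw["stream"],
)
print(server_kwargs)          # e.g. pass these to ls.LitServer(api, **server_kwargs)
print(cfg["docker"]["port"])  # e.g. pass to server.run(port=...)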