feat: add containers with litserve servers default #36

Open · wants to merge 2 commits into main
@@ -0,0 +1,40 @@
#---
# name: transformers
# config: config.py
# group: llm
# depends: [pytorch, torchvision, huggingface_hub, rust]
# test: [test_version.py, huggingface-benchmark.py]
# docs: docs.md
# notes: for quantization support in Transformers, use the bitsandbytes, AutoGPTQ, or AutoAWQ containers.
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG TRANSFORMERS_PACKAGE=transformers \
    TRANSFORMERS_VERSION

# if you want optimum[exporters,onnxruntime] see the optimum package

RUN pip3 install --no-cache-dir --verbose accelerate && \
    pip3 install --no-cache-dir --verbose sentencepiece && \
    pip3 install --no-cache-dir --verbose optimum && \
    \
    # install from pypi, git, etc. (sometimes another version gets installed)
    pip3 uninstall -y transformers && \
    \
    echo "Installing transformers $TRANSFORMERS_VERSION (from $TRANSFORMERS_PACKAGE)" && \
    pip3 install --no-cache-dir --verbose ${TRANSFORMERS_PACKAGE} && \
    \
    # patch for "/usr/local/lib/python3.8/dist-packages/transformers/modeling_utils.py", line 118
    # AttributeError: module 'torch.distributed' has no attribute 'is_initialized'
    PYTHON_ROOT=`pip3 show transformers | grep Location: | cut -d' ' -f2` && \
    sed -i \
        -e 's|torch.distributed.is_initialized|torch.distributed.is_available|g' \
        ${PYTHON_ROOT}/transformers/modeling_utils.py

# add benchmark script
COPY huggingface-benchmark.py /usr/local/bin

# make sure it loads
RUN pip3 show transformers \
&& python3 -c 'import transformers; print(transformers.__version__)'
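Beyond the version print above, a quick functional check could be run inside the built container. This is only a sketch; the model name is an arbitrary example and is not something the container ships with.

# Hypothetical smoke test for the transformers container (not part of this PR).
# Runs a tiny text-classification pipeline to confirm the patched install works.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",  # example model only
)
print(classifier("The container build looks good."))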
@@ -0,0 +1,22 @@
#---
# name: vllm
# group: vlm
# config: config.py
# depends: [pytorch, torchvision, torchaudio, transformers, triton, xformers]
# requires: '>=34.1.0'
# test: test.py
# notes: https://github.com/vllm-project/vllm
#---
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ARG VLLM_VERSION \
    XGRAMMAR_VERSION \
    FORCE_BUILD=off

RUN apt-get update -y && apt-get install -y libnuma-dev \
    libsndfile1 libsndfile1-dev libprotobuf-dev libsm6 libxext6 libgl1

COPY build.sh install.sh patches /tmp/vllm/

RUN /tmp/vllm/install.sh || /tmp/vllm/build.sh
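install.sh and build.sh are not shown in this diff. As a rough smoke test for the resulting image, something like the following could be used; the model id is an arbitrary small example and the calls are vLLM's standard offline-inference API.

# Hypothetical vLLM smoke test (not part of this PR).
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # small example model; swap in a real one
params = SamplingParams(temperature=0.8, max_tokens=32)
outputs = llm.generate(["Hello, my name is"], params)
for out in outputs:
    print(out.outputs[0].text)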
39 changes: 39 additions & 0 deletions solo_server/examples/merged/deepseek-r1-distill-qwen-1.5b/batch.py
@@ -0,0 +1,39 @@
import litserve as ls
from transformers import pipeline

class HuggingFaceLitAPI(ls.LitAPI):
    def setup(self, device):
        # Load the model and tokenizer from Hugging Face Hub
        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
        # You can replace the model name with any model from the Hugging Face Hub
        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        self.pipeline = pipeline("text-classification", model=model_name, device=device)

    def decode_request(self, request):
        # Extract text from the request
        # This assumes the request payload is of the form: {'text': 'Your input text here'}
        return request["text"]

    def batch(self, inputs):
        # Return the batched input list
        return inputs

    def predict(self, texts):
        # Use the loaded pipeline to perform inference
        return self.pipeline(texts)

    def unbatch(self, outputs):
        # Unbatch the model output
        return outputs

    def encode_response(self, output):
        # Format the output from the model to send as a response
        # This example sends back the label and score of the prediction
        return {"label": output["label"], "score": output["score"]}

if __name__ == "__main__":
    # Create an instance of your API
    api = HuggingFaceLitAPI()
    # Start the server, specifying the batching parameters and port
    server = ls.LitServer(api, accelerator="cuda", max_batch_size=16, workers_per_device=4, batch_timeout=0.01)
    server.run(port=8000)
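Since this server batches up to 16 requests within a 10 ms window, the batch path only kicks in when several requests arrive concurrently. A minimal sketch of such a client follows; the /predict route and {'text': ...} payload are taken from the server above, everything else is illustrative.

# Hypothetical concurrent client to exercise batching (not part of this PR).
import requests
from concurrent.futures import ThreadPoolExecutor

URL = "http://127.0.0.1:8000/predict"
texts = [f"Sample review number {i}: the service was great." for i in range(16)]

def classify(text):
    return requests.post(URL, json={"text": text}).json()

with ThreadPoolExecutor(max_workers=16) as pool:
    for result in pool.map(classify, texts):
        print(result)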
@@ -0,0 +1,15 @@
import requests

def test_server(text):
    # API endpoint URL
    url = "http://127.0.0.1:8000/predict"
    # Request payload
    payload = {"text": text}
    # POST request to the server
    response = requests.post(url, json=payload)
    # Print the response from the server
    print(response.json())

if __name__ == "__main__":
    sample_text = "I love machine learning. My experience with LitServe has been amazing!"
    test_server(sample_text)
@@ -0,0 +1,15 @@
accelerate
bitsandbytes
decord
litserve
openai
Pillow
qwen-vl-utils
streamlit
torch==2.4.0
torchvision==0.19.0
git+https://github.com/huggingface/transformers.git

# Optional dependency
# Uncomment the following line if you need flash-attn
# flash-attn==2.6.1
@@ -0,0 +1,31 @@
import litserve as ls
from transformers import pipeline

class HuggingFaceLitAPI(ls.LitAPI):
    def setup(self, device):
        # Load the model and tokenizer from Hugging Face Hub
        # For example, using the `distilbert-base-uncased-finetuned-sst-2-english` model for sentiment analysis
        # You can replace the model name with any model from the Hugging Face Hub
        model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        # LitServe passes a device identifier such as "cuda:0" or "cpu", which the pipeline accepts directly
        self.pipeline = pipeline("text-classification", model=model_name, device=device)

    def decode_request(self, request):
        # Extract text from the request
        # This assumes the request payload is of the form: {'text': 'Your input text here'}
        return request["text"]

    def predict(self, text):
        # Use the loaded pipeline to perform inference
        return self.pipeline(text)

    def encode_response(self, output):
        # Format the output from the model to send as a response
        # This example sends back the label and score of the prediction
        return {"label": output[0]["label"], "score": output[0]["score"]}

if __name__ == "__main__":
    # Create an instance of your API
    api = HuggingFaceLitAPI()
    # Start the server, specifying the port
    server = ls.LitServer(api, accelerator="cuda")
    server.run(port=8000)
35 changes: 35 additions & 0 deletions solo_server/examples/merged/qwen2.5/server.py
@@ -0,0 +1,35 @@
from fastapi import HTTPException
from stock_researcher import research_financials, research_news, stock_researcher, load_model
import litserve as ls

model_id = "Qwen/Qwen2.5-7B-Instruct"

class StockAnalyst(ls.LitAPI):
    def setup(self, device):
        # Using a self-hosted open-source model with an OpenAI API compatible interface
        self.model = model_id

    def decode_request(self, request: dict):
        # Query containing the stock name to research
        return request["query"]

    def predict(self, query: str):
        try:
            # 1. Find financial info
            messages, financials = research_financials(query, self.model)
            # 2. Research news about the stocks
            tool_calls, tool_final_result = research_news(financials, query, self.model)
            # 3. Analyze the data
            yield from stock_researcher(tool_final_result, tool_calls, messages, self.model)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Stock analyst ran into an error: {e}")

    def encode_response(self, response):
        for chunk in response:
            yield chunk

if __name__ == "__main__":
    load_model(model_id)
    api = StockAnalyst()
    server = ls.LitServer(api, workers_per_device=8, accelerator="cpu", timeout=False, stream=True)
    server.run(port=8888)
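Because this server is started with stream=True on port 8888, a client has to read the response incrementally rather than calling response.json() once. A minimal sketch follows; the /predict route and the {'query': ...} payload are assumed from the server above.

# Hypothetical streaming client for the stock analyst server (not part of this PR).
import requests

resp = requests.post(
    "http://127.0.0.1:8888/predict",
    json={"query": "NVIDIA"},  # example stock query
    stream=True,
)
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    if chunk:
        print(chunk, end="", flush=True)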
23 changes: 23 additions & 0 deletions solo_server/examples/solo.yaml
@@ -0,0 +1,23 @@
domain: "education"
hardware:
  accelerator: "cpu"
  workers_per_device: 8
  timeout: false
  stream: true
default-llm: "gpt-4"
models:
  interest_tags:
    - fast
    - balanced
    - innovative
  prompt_seed: "Model prompt seed based on interests: fast, balanced, innovative"
docker:
  image: "your_docker_image"
  port: 5070
huggingface:
  cache_dir: "huggingface"
  token: "YOUR_HUGGINGFACE_API_TOKEN"
soloconfig:
  path: "soloconfig.py"
  modelpath: "models"
  storage_limit_gb: 20
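For reference, the hardware block mirrors the LitServer keyword arguments used in the Qwen2.5 server above. A small, purely illustrative loader that extracts them might look like the sketch below; the file path and the key-to-argument mapping are assumptions, not something this PR defines.

# Hypothetical loader mapping solo.yaml's hardware section onto LitServer
# keyword arguments (not part of this PR; keys assumed to mirror litserve's).
import yaml

with open("solo.yaml") as f:  # example path
    cfg = yaml.safe_load(f)

hw = cfg["hardware"]
server_kwargs = dict(
    accelerator=hw["accelerator"],
    workers_per_device=hw["workers_per_device"],
    timeout=hw["timeout"],
    stream=hw["stream"],
)
print(server_kwargs)          # e.g. pass these to ls.LitServer(api, **server_kwargs)
print(cfg["docker"]["port"])  # e.g. pass to server.run(port=...)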