diff --git a/dist/solo_server-0.2.6-py3-none-any.whl b/dist/solo_server-0.2.6-py3-none-any.whl
deleted file mode 100644
index aecf19e..0000000
Binary files a/dist/solo_server-0.2.6-py3-none-any.whl and /dev/null differ
diff --git a/dist/solo_server-0.2.6.tar.gz b/dist/solo_server-0.2.6.tar.gz
deleted file mode 100644
index 83955ad..0000000
Binary files a/dist/solo_server-0.2.6.tar.gz and /dev/null differ
diff --git a/solo_server/base.py b/solo_server/base.py
index 4af1217..18cae98 100644
--- a/solo_server/base.py
+++ b/solo_server/base.py
@@ -1,117 +1,195 @@
 import typer
-from subprocess import run, CalledProcessError
+from subprocess import run, CalledProcessError, DEVNULL
 import os
+import sys
+import time
+import requests
+import subprocess
 
 app = typer.Typer(help="šŸ› ļø Solo Server CLI for managing edge AI model inference using Docker-style commands.")
 
 def execute_command(command: list):
+    """Utility function to execute shell commands."""
     try:
         run(command, check=True)
     except CalledProcessError as e:
         typer.echo(f"āŒ Error: {e}")
         raise typer.Exit(code=1)
 
-# Recurring prompt to ask for the next command
-@app.command()
-def prompt():
-    """
-    šŸ”„ Recurring prompt for managing the Solo Server.
-    """
-    while True:
-        typer.echo("\nWhat would you like to do?")
-        typer.echo("1. šŸš€ Start the Solo Server")
-        typer.echo("2. ā¹ Stop the Solo Server")
-        typer.echo("3. šŸ“ˆ Check the Solo Server status")
-        typer.echo("4. šŸ–Œļø Generate a code base template")
-        typer.echo("5. āŒ Exit")
-        choice = typer.prompt("Enter the number of your choice")
-
-        if choice == "1":
-            tag = typer.prompt("Enter the tag name to start the server with")
-            start(tag)
-        elif choice == "2":
-            stop()
-        elif choice == "3":
-            status()
-        elif choice == "4":
-            tag = typer.prompt("Enter the tag name for the code base template")
-            gen(tag)
-        elif choice == "5":
-            typer.echo("āŒ Exiting the Solo Server CLI. Goodbye!")
-            break
-        else:
-            typer.echo("āš ļø Invalid choice. Please try again.")
+def check_docker_installation():
+    """Ensure Docker and Docker Compose are installed and the user has the necessary permissions."""
+    typer.echo("šŸ” Checking Docker and Docker Compose installation...")
 
-# Command to start the Solo Server, expects a tag name
+    # Check Docker
+    try:
+        run(["docker", "--version"], stdout=DEVNULL, stderr=DEVNULL, check=True)
+    except FileNotFoundError:
+        typer.echo("āŒ Docker is not installed. Installing Docker...")
+        execute_command([
+            "sh", "-c", "curl -fsSL https://get.docker.com | sh"
+        ])
+    except CalledProcessError:
+        typer.echo("āŒ Docker is installed but not accessible. Please ensure you have the correct permissions.")
+        typer.echo("šŸ”‘ Run the following to add your user to the Docker group:")
+        typer.echo("   sudo usermod -aG docker $USER && newgrp docker")
+        sys.exit(1)
+
+    # Check Docker Compose
+    try:
+        run(["docker-compose", "--version"], stdout=DEVNULL, stderr=DEVNULL, check=True)
+    except FileNotFoundError:
+        typer.echo("āŒ Docker Compose is not installed. Installing Docker Compose...")
+        execute_command([
+            "sh", "-c",
+            "curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose"
+        ])
+        execute_command(["chmod", "+x", "/usr/local/bin/docker-compose"])
+    except CalledProcessError:
+        typer.echo("āŒ Docker Compose is installed but not accessible.")
+        sys.exit(1)
+
+    typer.echo("āœ… Docker and Docker Compose are installed and accessible.")
 
 @app.command()
-def start(
-    tag: str,
-    model_url: str = typer.Option(
-        None,
-        "--model-url", "-u",
-        help="URL for the LLM model (only used with llm tag)"
-    ),
-    model_filename: str = typer.Option(
-        None,
-        "--model-filename", "-f",
-        help="Filename for the LLM model (only used with llm tag)"
-    )
-):
+def start(tag: str):
     """
     šŸš€ Start the Solo Server for model inference.
     """
+    check_docker_installation()
     typer.echo(f"šŸš€ Starting the Solo Server with tag: {tag}...")
-
-    if tag == "llm":
-        # Default values for llm tag
-        default_url = "https://huggingface.co/Mozilla/Llama-3.2-1B-Instruct-llamafile/resolve/main/Llama-3.2-1B-Instruct.Q6_K.llamafile"
-        default_filename = "Llama-3.2-1B-Instruct.Q6_K.llamafile"
-
-        # Use provided values or defaults
-        os.environ["MODEL_URL"] = model_url or default_url
-        os.environ["MODEL_FILENAME"] = model_filename or default_filename
-    elif (model_url or model_filename) and tag != "llm":
-        typer.echo("āš ļø Warning: model-url and model-filename are only used with the llm tag")
-
     python_file = f"templates/{tag}.py"
     os.environ["PYTHON_FILE"] = python_file
-
-    # Get the current file's directory and construct the full path
     current_dir = os.path.dirname(os.path.abspath(__file__))
     docker_compose_path = os.path.join(current_dir, "docker-compose.yml")
-    execute_command(["docker-compose", "-f", docker_compose_path, "up", "--build"])
+    execute_command(["docker-compose", "-f", docker_compose_path, "up", "-d"])
 
-# Command to stop the Solo Server
 @app.command()
 def stop():
     """
     ā¹ Stop the running Solo Server.
     """
+    check_docker_installation()
     typer.echo("ā¹ Stopping the Solo Server...")
     current_dir = os.path.dirname(os.path.abspath(__file__))
     docker_compose_path = os.path.join(current_dir, "docker-compose.yml")
     execute_command(["docker-compose", "-f", docker_compose_path, "down"])
 
-# Command to check the status of the Solo Server
 @app.command()
 def status():
     """
     šŸ“ˆ Check the status of the Solo Server.
     """
+    check_docker_installation()
    typer.echo("šŸ“ˆ Checking Solo Server status...")
     current_dir = os.path.dirname(os.path.abspath(__file__))
     docker_compose_path = os.path.join(current_dir, "docker-compose.yml")
     execute_command(["docker-compose", "-f", docker_compose_path, "ps"])
 
-# Command to generate a code base template related to the tag
 @app.command()
-def gen(tag: str):
+def benchmark(
+    model_url: str = typer.Option(..., help="URL of the model to benchmark"),
+    model_filename: str = typer.Option(..., help="Filename for the downloaded model"),
+    template: str = typer.Option("llm", help="Template to use for benchmarking")
+):
+    """
+    šŸŽļø Run a benchmark test on the Solo Server with TimescaleDB and Grafana integration.
+    """
+    check_docker_installation()
+
+    # First start the Solo Server with the specified template
+    typer.echo(f"šŸš€ Starting the Solo Server with template: {template}...")
+    python_file = f"templates/{template}.py"
+    os.environ["PYTHON_FILE"] = python_file
+    os.environ["MODEL_URL"] = model_url
+    os.environ["MODEL_FILENAME"] = model_filename
+
+    # Start the main server
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    docker_compose_path = os.path.join(current_dir, "docker-compose.yml")
+    execute_command(["docker-compose", "-f", docker_compose_path, "up", "-d"])
+
+    # Wait for container to be healthy
+    typer.echo("ā³ Waiting for LLM server to be ready...")
+    start_time = time.time()
+    timeout = 300  # 5 minutes timeout
+
+    while True:
+        if time.time() - start_time > timeout:
+            typer.echo("āŒ LLM server startup timed out")
+            execute_command(["docker-compose", "-f", docker_compose_path, "down"])
+            return
+
+        result = subprocess.run(
+            ["docker", "inspect", "--format", "{{.State.Health.Status}}", "solo-api"],
+            capture_output=True,
+            text=True
+        )
+        status = result.stdout.strip()
+
+        if status == "healthy":
+            typer.echo("āœ… LLM server is ready!")
+            break
+        elif status == "unhealthy":
+            # Print the container logs to help debug
+            typer.echo("Checking container logs:")
+            subprocess.run(["docker", "logs", "solo-api"])
+            typer.echo("āŒ LLM server failed to start")
+            execute_command(["docker-compose", "-f", docker_compose_path, "down"])
+            return
+
+        typer.echo("ā³ Waiting for LLM server to initialize... (Status: " + status + ")")
+        time.sleep(5)
+
+    # Now start the benchmark tools
+    typer.echo("šŸŽļø Starting benchmark tools...")
+    benchmark_compose_path = os.path.join(current_dir, "docker-compose-benchmark.yml")
+    execute_command(["docker-compose", "-f", benchmark_compose_path, "up", "-d", "timescale", "grafana", "locust"])
+
+    try:
+        # Wait for Grafana to be ready
+        typer.echo("ā³ Waiting for Grafana to be ready...")
+        time.sleep(10)
+
+        # Configure Grafana
+        typer.echo("šŸ”§ Configuring Grafana...")
+        grafana_setup_path = os.path.join(current_dir, "grafana_setup.sh")
+        os.chmod(grafana_setup_path, 0o755)
+        execute_command([grafana_setup_path])
+
+        typer.echo("āœ… Benchmark environment is ready!")
+        typer.echo("šŸ“Š Visit:")
+        typer.echo("   - Grafana: http://localhost:3000 (admin/admin)")
+        typer.echo("   - Locust: http://localhost:8089")
+
+        while True:
+            time.sleep(1)
+    except KeyboardInterrupt:
+        typer.echo("\nā¹ Stopping all services...")
+    finally:
+        # Stop both compose files
+        execute_command(["docker-compose", "-f", docker_compose_path, "down"])
+        execute_command(["docker-compose", "-f", benchmark_compose_path, "down"])
+
+@app.command()
+def gui():
     """
-    šŸ–Œļø Generate a code base template related to the tag.
+    šŸ–„ļø Launch the Streamlit GUI for Solo Server.
     """
-    typer.echo(f"šŸ–Œļø Generating code base template for tag: {tag}...")
-    # Add logic to generate a template based on the provided tag
+    typer.echo("šŸ–„ļø Launching Streamlit app...")
+
+    # Run Streamlit
+    streamlit_command = [
+        "streamlit",
+        "run",
+        "templates/streamlit_llm.py"
+    ]
+
+    try:
+        execute_command(streamlit_command)
+    except Exception as e:
+        typer.echo(f"āŒ Failed to launch Streamlit app: {e}")
+    else:
+        typer.echo("āœ… Streamlit app launched successfully.")
 
 if __name__ == "__main__":
     app()
diff --git a/solo_server/docker-compose-benchmark.yml b/solo_server/docker-compose-benchmark.yml
new file mode 100644
index 0000000..af12c24
--- /dev/null
+++ b/solo_server/docker-compose-benchmark.yml
@@ -0,0 +1,49 @@
+version: '3.7'
+
+services:
+  timescale:
+    image: timescale/timescaledb:latest-pg14
+    container_name: timescale_postgres
+    environment:
+      POSTGRES_PASSWORD: password
+      POSTGRES_DB: locust
+    ports:
+      - "5433:5432"
+    volumes:
+      - timescale_postgres_data:/var/lib/postgresql/data
+
+  grafana:
+    image: grafana/grafana:latest
+    container_name: timescale_grafana
+    ports:
+      - "3000:3000"
+    environment:
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+    depends_on:
+      - timescale
+    volumes:
+      - grafana_data:/var/lib/grafana
+
+  locust:
+    image: locustio/locust:latest
+    container_name: locust_benchmark
+    volumes:
+      - ./locustfile.py:/home/locust/locustfile.py
+    command: >
+      -f /home/locust/locustfile.py
+      --host http://host.docker.internal:8000
+      --users 10
+      --spawn-rate 2
+      --run-time 1m
+    ports:
+      - "8089:8089"
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+
+volumes:
+  timescale_postgres_data:
+  grafana_data:
+
+networks:
+  solo-network:
+    driver: bridge
diff --git a/solo_server/docker-compose.yml b/solo_server/docker-compose.yml
index 8cd3c3c..6c802a7 100644
--- a/solo_server/docker-compose.yml
+++ b/solo_server/docker-compose.yml
@@ -1,3 +1,5 @@
+version: '3.7'
+
 services:
   solo-api:
     build:
@@ -7,10 +9,18 @@ services:
     container_name: "solo-api"
     ports:
       - "8000:8000"
+      - "8080:8080"
     environment:
       - PYTHON_FILE=${PYTHON_FILE:-solo_server/templates/basic.py}
       - MODEL_URL=${MODEL_URL:-your_model_url_here}
       - MODEL_FILENAME=${MODEL_FILENAME:-your_model_filename_here}
+      - LITSERVE_TIMEOUT=120
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8080/completion", "-H", "Content-Type: application/json", "-d", '{"prompt":"test","n_predict":1}']
+      interval: 10s
+      timeout: 30s
+      retries: 10
+      start_period: 120s
 
 networks:
   solo-network:
diff --git a/solo_server/grafana_setup.sh b/solo_server/grafana_setup.sh
new file mode 100755
index 0000000..d425607
--- /dev/null
+++ b/solo_server/grafana_setup.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+GRAFANA_URL="http://localhost:3000"
+ADMIN_PASSWORD="admin"
+DATASOURCE_NAME="TimescaleDB"
+
+# Add a new TimescaleDB datasource
+curl -X POST -H "Content-Type: application/json" \
+  -u admin:$ADMIN_PASSWORD \
+  -d '{
+        "name": "'"$DATASOURCE_NAME"'",
+        "type": "postgres",
+        "url": "timescale:5432",
+        "access": "proxy",
+        "database": "locust",
+        "user": "postgres",
+        "password": "password",
+        "isDefault": true
+      }' \
+  $GRAFANA_URL/api/datasources
diff --git a/solo_server/locustfile.py b/solo_server/locustfile.py
new file mode 100644
index 0000000..9ae6f71
--- /dev/null
+++ b/solo_server/locustfile.py
@@ -0,0 +1,33 @@
+from locust import HttpUser, task, between
+import json
+
+class SoloServerUser(HttpUser):
+    wait_time = between(1, 2)
+
+    @task
+    def test_llm(self):
+        """Test LLM completions endpoint"""
+        headers = {
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "prompt": "What is AI?",
+            "n_predict": 128
+        }
+
+        with self.client.post(
+            "/predict",
+            json=payload,
+            headers=headers,
+            catch_response=True
+        ) as response:
+            try:
+                if response.status_code == 200:
+                    response.success()
+                else:
+                    response.failure(f"Failed with status code: {response.status_code}")
+            except json.JSONDecodeError:
+                response.failure("Response could not be decoded as JSON")
+            except Exception as e:
+                response.failure(f"Error: {str(e)}")
diff --git a/solo_server/requirements.txt b/solo_server/requirements.txt
index 4602da1..83d748c 100644
--- a/solo_server/requirements.txt
+++ b/solo_server/requirements.txt
@@ -9,4 +9,7 @@ Pillow
 diffusers
 accelerate
 huggingface_hub
-qai-hub-models[stable_diffusion_v2_1_quantized]
\ No newline at end of file
+qai-hub-models[stable_diffusion_v2_1_quantized]
+typer
+locust
+locust-plugins
\ No newline at end of file
diff --git a/solo_server/templates/llm.py b/solo_server/templates/llm.py
index 62bfd4a..a95df26 100644
--- a/solo_server/templates/llm.py
+++ b/solo_server/templates/llm.py
@@ -51,26 +51,74 @@ def setup(self, device):
         print("Llama model server started.")
 
     def decode_request(self, request):
-        return request["prompt"]
+        # Handle both POST /predict and direct completion requests
+        if isinstance(request, dict):
+            return request.get("prompt", request.get("input", ""))
+        return request
 
     def predict(self, prompt):
-        response = subprocess.run(["curl", "-X", "POST", "http://localhost:8080/completion",
-                                   "-H", "Content-Type: application/json",
-                                   "-d", f'{{"prompt": "{prompt}", "n_predict": 128}}'],
-                                  capture_output=True, text=True)
-        response_json = json.loads(response.stdout)
-        return response_json["content"]
+        try:
+            # Internal request to LLaMA server on 8080
+            response = subprocess.run(
+                ["curl", "-s", "http://localhost:8080/completion",
+                 "-H", "Content-Type: application/json",
+                 "-d", json.dumps({
+                     "prompt": prompt,
+                     "n_predict": 128
+                 })],
+                capture_output=True,
+                text=True,
+                timeout=30
+            )
+
+            if response.returncode != 0:
+                print(f"Error from LLM server: {response.stderr}")
+                return f"Error: {response.stderr}"
+
+            result = json.loads(response.stdout)
+            return result.get("content", "No content generated")
+
+        except Exception as e:
+            print(f"Error in predict: {e}")
+            return f"Error: {str(e)}"
 
     def encode_response(self, output):
-        # Clean up the output by removing system tokens, newlines, and redundant text
-        cleaned_output = output.replace("<|eot_id|>", "")  # Remove system token
-        cleaned_output = cleaned_output.replace("\n", " ")  # Replace newlines with spaces
-        cleaned_output = " ".join(cleaned_output.split())  # Remove extra spaces
-        return {"generated_text": cleaned_output}
+        if isinstance(output, str):
+            cleaned_output = output.replace("<|eot_id|>", "").replace("\n", " ").strip()
+            return {
+                "generated_text": cleaned_output,
+                "status": "success"
+            }
+        return {
+            "error": str(output),
+            "status": "error"
+        }
+
+    def health_check(self):
+        """Health check endpoint"""
+        try:
+            response = subprocess.run(
+                ["curl", "-s", "http://localhost:8080/completion",
+                 "-H", "Content-Type: application/json",
+                 "-d", '{"prompt": "test", "n_predict": 1}'],
+                capture_output=True,
+                timeout=5
+            )
+            return response.returncode == 0
+        except:
+            return False
 
 # STEP 2: START THE SERVER
 if __name__ == "__main__":
     api = LlamaLitAPI()
     server = ls.LitServer(api, accelerator="auto")
+
+    # Add health check endpoint
+    @server.app.get("/health")
+    async def health():
+        if api.health_check():
+            return {"status": "healthy"}
+        return {"status": "unhealthy"}
+
     server.run(port=8000, generate_client_file=False)
\ No newline at end of file
diff --git a/solo_server/templates/streamlit_llm.py b/solo_server/templates/streamlit_llm.py
new file mode 100644
index 0000000..6318669
--- /dev/null
+++ b/solo_server/templates/streamlit_llm.py
@@ -0,0 +1,111 @@
+import json
+
+import streamlit as st
+from openai import OpenAI
+from tools import available_tools, functions
+
+from utils import display_message
+
+# define model
+MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+SYSTEM_MESSAGE = {
+    "role": "system",
+    "content": "You are a helpful assistant with tool calling capabilities. When you receive a tool call response, use the output to format an answer to the original user question.",
+}
+
+client = OpenAI(
+    base_url="http://127.0.0.1:8001/v1",
+    api_key="lit",
+)
+
+st.title("Chat with an AI Assistant.")
+
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Add input field for system prompt
+st.sidebar.header("System Prompt")
+system_prompt = st.sidebar.text_area(
+    label="Modify the prompt here.", value=SYSTEM_MESSAGE["content"], height=200
+)
+SYSTEM_MESSAGE["content"] = system_prompt
+
+
+# Add checkboxes to the sidebar
+st.sidebar.header("Available Tools")
+selected_tools = [
+    tool["function"]["name"]
+    for tool in available_tools
+    if st.sidebar.checkbox(tool["function"]["name"], value=True)
+]
+
+# Filter available tools based on selected tools
+tools = [tool for tool in available_tools if tool["function"]["name"] in selected_tools]
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    display_message(message)
+
+# Accept user input
+if prompt := st.chat_input("Ask anything?"):
+    # Add user message to chat history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    # Display user message in chat message container
+    with st.chat_message("user"):
+        st.markdown(prompt)
+
+    # Display assistant response in chat message container
+    with st.chat_message("assistant"):
+        messages = [SYSTEM_MESSAGE, *st.session_state.messages]
+        if not tools:
+            stream = client.chat.completions.create(
+                model=MODEL,
+                messages=messages,
+                stream=True,
+            )
+            response = st.write_stream(stream)
+            st.session_state.messages.append({"role": "assistant", "content": response})
+        else:
+            spinner = st.spinner("Thinking...")
+            response = client.chat.completions.create(
+                model=MODEL,
+                messages=messages,
+                tools=available_tools,
+                tool_choice="auto",
+            )
+            response_message = response.choices[0].message
+            tool_calls = response_message.tool_calls
+            if tool_calls:
+                with st.status("Thinking...", expanded=True) as status:
+                    st.session_state.messages.append(response_message)
+                    for tool_call in tool_calls:
+                        function_name = tool_call.function.name
+                        tool = functions[function_name]
+                        args = json.loads(tool_call.function.arguments)
+                        st.write(f"Calling {function_name}... with args: {args}")
+                        tool_response = tool(**args)
+                        st.session_state.messages.append(
+                            {
+                                "tool_call_id": tool_call.id,
+                                "role": "ipython",
+                                "content": tool_response,
+                                "name": function_name,
+                            }
+                        )
+                    status.update(
+                        label=f"Running {function_name}... Done!",
+                        state="complete",
+                        expanded=False,
+                    )
+                stream = client.chat.completions.create(
+                    model=MODEL, messages=st.session_state.messages, stream=True
+                )
+                response = st.write_stream(stream)
+                st.session_state.messages.append(
+                    {"role": "assistant", "content": response}
+                )
+            else:
+                response = response.choices[0].message
+                st.write(response.content)
+                st.session_state.messages.append(response)
diff --git a/solo_server/templates/tools/__init__.py b/solo_server/templates/tools/__init__.py
new file mode 100644
index 0000000..3b4f9aa
--- /dev/null
+++ b/solo_server/templates/tools/__init__.py
@@ -0,0 +1,9 @@
+from .get_top_hf_papers import get_top_hf_papers, get_top_hf_papers_json
+
+available_tools = [
+    get_top_hf_papers_json,
+]
+
+functions = {
+    "get_top_hf_papers": get_top_hf_papers,
+}
diff --git a/solo_server/templates/tools/get_top_hf_papers.py b/solo_server/templates/tools/get_top_hf_papers.py
new file mode 100644
index 0000000..0b1361a
--- /dev/null
+++ b/solo_server/templates/tools/get_top_hf_papers.py
@@ -0,0 +1,105 @@
+import json
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_top_hf_papers(n: int):
+    """
+    Fetches the top N papers from the Hugging Face papers page based on the number of votes.
+    """
+    url = "https://huggingface.co/papers"
+    response = requests.get(url)
+    if response.status_code != 200:
+        raise Exception(f"Failed to retrieve papers: {response.status_code}")
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    papers = soup.find_all("article")
+
+    paper_info = []
+    for paper in papers:
+        title = paper.find("h3").text.strip() if paper.find("h3") else "No Title"
+        link = paper.find("a")["href"] if paper.find("a") else "#"
+        vote_info = paper.find(
+            "div", {"class": "flex flex-wrap items-center gap-2.5 pt-1"}
+        ).find("div", {"class": "leading-none"})
+        thumbnail = paper.find("img")["src"] if paper.find("img") else ""
+        author_list = paper.find(
+            "ul", {"class": "flex items-center flex-row-reverse text-sm"}
+        )
+
+        authors = []
+        if author_list:
+            for author in author_list.find_all("li"):
+                if author.has_attr("title"):
+                    authors.append(author["title"])
+
+        paper_info.append(
+            {
+                "title": title,
+                "link": link,
+                "votes": int(vote_info.text.strip())
+                if vote_info and vote_info.text.strip().isdigit()
+                else 0,
+                "thumbnail": thumbnail,
+                "authors": ", ".join(authors) if authors else "Unknown",
+            }
+        )
+
+    paper_info.sort(key=lambda x: x["votes"], reverse=True)
+    top_papers = paper_info[:n]
+
+    for i, paper in enumerate(top_papers):
+        paper_url = f"https://huggingface.co{paper['link']}"
+        paper_response = requests.get(paper_url)
+        if paper_response.status_code != 200:
+            print(
+                f"Failed to retrieve paper details for {paper['title']}: {paper_response.status_code}"
+            )
+            continue
+
+        paper_soup = BeautifulSoup(paper_response.text, "html.parser")
+        published_date_div = paper_soup.find(
+            "div",
+            {
+                "class": "mb-6 flex flex-wrap gap-2 text-sm text-gray-500 max-sm:flex-col sm:items-center sm:text-base md:mb-8"
+            },
+        ).find("div")
+        published_date_text = ""
+        if published_date_div:
+            published_date_text = published_date_div.text.split("Published on ")[
+                1
+            ].strip()
+
+        abstract_div = paper_soup.find("div", {"class": "pb-8 pr-4 md:pr-16"}).find("p")
+        abstract = (
+            abstract_div.text.strip() if abstract_div else "No abstract available"
+        )
+
+        top_papers[i]["published_date"] = published_date_text
+        top_papers[i]["abstract"] = abstract
+
+    return json.dumps(top_papers, indent=2)
+
+
+get_top_hf_papers_json = {
+    "type": "function",
+    "function": {
+        "name": "get_top_hf_papers",
+        "description": "Get the top N papers from the Hugging Face papers page based on the number of votes.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "n": {
+                    "type": "integer",
+                    "description": "Number of top papers to fetch.",
+                }
+            },
+            "required": ["n"],
+        },
+    },
+}
+
+if __name__ == "__main__":
+    top_papers = json.loads(get_top_hf_papers(5))
+    for paper in top_papers:
+        print(f"Title: {paper['title']}")
diff --git a/solo_server/templates/utils.py b/solo_server/templates/utils.py
new file mode 100644
index 0000000..3486923
--- /dev/null
+++ b/solo_server/templates/utils.py
@@ -0,0 +1,14 @@
+import streamlit as st
+
+
+def display_message(message):
+    if isinstance(message, dict):
+        role = message.get("role")
+        content = message.get("content")
+    else:
+        role = message.role
+        content = message.content
+
+    if role in ["system", "assistant", "user"] and content:
+        with st.chat_message(role):
+            st.markdown(content)