'''
- ##################### TinyLlama + FastAPI + Docker #########################################
- Author: Santiago Gonzalez Acevedo
- Twitter: @locoalien
- Python 3.11+
+ ##################### FastAPI + TinyLlama + Docker #########################################
+ Author: Adrián Baeza Prieto
+ Github: @adribaeza
+ Python 3.10+
'''
- #https://medium.com/@santiagosk80/tinyllama-fastapi-docker-microservicios-llm-ff99eb999f04
import logging
import os
import torch
from fastapi import FastAPI, HTTPException
from transformers import pipeline
- import docs  # Library with the API information shown in Swagger
- from starlette.middleware.cors import CORSMiddleware  # CORS-level security
+ import docs  # Import the API documentation module
+ from starlette.middleware.cors import CORSMiddleware  # Import the CORS middleware
import json

+ # Create the logger instance
logger = logging.getLogger(__name__)
- # Create a FastAPI instance
- app = FastAPI(title='LLM Chat Service', description=docs.desc, version=docs.version)
- # CORS configuration (in case you want to deploy)
- app.add_middleware(
+ # Create the FastAPI instance
+ api = FastAPI(title='LLM Chat Service with TinyLlama', description=docs.desc, version=docs.version)
+ # Configure CORS for the API
+ api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
)
logger.info('Adding v1 endpoints..')

- # Load the model and the tokenizer
+ # Load the text-generation pipeline with the TinyLlama model
pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")

- # We need a "/chat" endpoint that receives a text, runs it through the model, and returns the answer
- @app.post("/chat")
+ # Declare the endpoint for the chat service
+ @api.post("/chat")
async def chat(text: str):
    try:
-         # Configure the model's behaviour
-         messages = [
-             {
-                 "role": "system",
-                 "content": "Solo quiero la respuesta a la pregunta sin repetir la pregunta, por favor.",
-             },
-             {"role": "user", "content": f"{text}"},
-         ]
-         # Get the prompt for the model
-         prompt = pipe.tokenizer.apply_chat_template(
-             messages, tokenize=False, add_generation_prompt=True
-         )
-         # Model accuracy (generation) settings
-         outputs = pipe(
-             prompt,
-             max_new_tokens=256,
-             do_sample=True,
-             temperature=0.3,
-             top_k=50,
-             top_p=0.95,
-         )
-         # Model result
-         output = outputs[0]["generated_text"]
-         # Extract the answer part starting at "<|assistant|>"
-         assistant_response = output.split("<|assistant|>")[-1].strip()
-         json_results = json.dumps({"response": assistant_response}, ensure_ascii=False, indent=4).encode('utf8')
-         return json.loads(json_results)
+         # Define the messages to send to the model (the system prompt, in Spanish, asks for only the answer, without repeating the question)
+         messages = [
+             {
+                 "role": "system",
+                 "content": "Solo quiero la respuesta a la pregunta sin repetir la pregunta, por favor.",
+             },
+             {
+                 "role": "user",
+                 "content": f"{text}"
+             },
+         ]
+         # Build the prompt with the tokenizer's chat template
+         prompt = pipe.tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         # Generation settings
+         outputs = pipe(
+             prompt,
+             max_new_tokens=256,
+             do_sample=True,
+             temperature=0.3,
+             top_k=50,
+             top_p=0.95,
+         )
+         # Get the output from the model and keep only the text after the "<|assistant|>" marker
+         output = outputs[0]["generated_text"]
+         assistant_response = output.split("<|assistant|>")[-1].strip()
+         json_results = json.dumps({"response": assistant_response}, ensure_ascii=False, indent=4).encode('utf8')
+         return json.loads(json_results)
    except Exception as e:
        logger.error(f'Error: {e}')
        raise HTTPException(status_code=500, detail='Internal Server Error')

- # Run the server with uvicorn
- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app)
+ # Run the API with Uvicorn only if the script is executed directly in the local environment
+ # if __name__ == '__main__':
+ #     import uvicorn
+ #     uvicorn.run(api)
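
For reference, because `chat(text: str)` declares a plain `str` parameter, FastAPI exposes `text` as a query parameter on the POST request rather than as a JSON body. Below is a minimal client sketch, not part of this commit; the base URL/port and the `main` module name are assumptions (e.g. after running `uvicorn main:api --port 8000` with the file saved as main.py):

    # Hypothetical client call; assumes the service listens on http://localhost:8000
    import requests

    resp = requests.post(
        "http://localhost:8000/chat",
        params={"text": "What is the capital of France?"},  # `text` is a query parameter, not a JSON body
    )
    resp.raise_for_status()
    print(resp.json()["response"])  # the text the endpoint extracted after "<|assistant|>"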