Optimize performance and memory usage
Tu Nombre committed · Commit 2681e2a · 1 Parent(s): 8ced679

Files changed:
- Dockerfile (+27 -22)
- app.py (+30 -57)
Dockerfile
CHANGED
@@ -10,34 +10,39 @@ COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app

-[removed block not fully captured in the source: the old Modelfile "PARAMETER" settings and an "ollama" startup command]
+# Optimize the Modelfile for lower memory usage
+RUN printf "FROM llama3
+
+SYSTEM \"Asistente especializado en Desarrollo Humano basado en Papalia 12va Edición.\"
+
+PARAMETER temperature 0.7
+PARAMETER top_k 40
+PARAMETER top_p 0.7
+PARAMETER repeat_penalty 1.1
+PARAMETER num_ctx 1024
+PARAMETER num_thread 4
+PARAMETER num_gpu 0" > /app/Modelfile
+
+# Optimized startup script
+RUN printf '#!/bin/bash
+ulimit -v unlimited
+ollama serve --verbose &
+timeout=60
+until nc -z localhost 11434 || [ $timeout -le 0 ]; do
+  sleep 1
+  ((timeout--))
+done
+cd /app
+ollama create llama3.2:1b-papalia -f Modelfile
+exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1 --limit-concurrency 1' > /app/start.sh

 RUN chmod +x /app/start.sh

 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
+ENV MALLOC_ARENA_MAX=2

-HEALTHCHECK --interval=30s --timeout=10s --start-period=
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
   CMD curl -f http://localhost:7860/health || exit 1

 CMD ["/app/start.sh"]
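Note on the new HEALTHCHECK: it drives curl against the FastAPI /health route defined in app.py below. For illustration only (not part of the commit), a minimal Python equivalent of that probe, using httpx, which the app already depends on:

import sys
import httpx

def probe(url: str = "http://localhost:7860/health") -> int:
    # Mirror `curl -f`: treat connection errors and non-2xx statuses as failure.
    try:
        response = httpx.get(url, timeout=10.0)
        response.raise_for_status()
    except Exception as exc:
        print(f"unhealthy: {exc}", file=sys.stderr)
        return 1  # same exit code the HEALTHCHECK reports on failure
    print(response.json())
    return 0

if __name__ == "__main__":
    sys.exit(probe())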
app.py
CHANGED
@@ -1,5 +1,5 @@
 from fastapi import FastAPI, HTTPException, Request
-from fastapi.responses import HTMLResponse
+from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -10,12 +10,7 @@ from typing import Optional, Dict, Any
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-app = FastAPI(
-    title="Llama3.2:1b-Papalia Inference API",
-    description="API para interactuar con el modelo especializado en Desarrollo Humano",
-    version="1.0.0"
-)
-
+app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 templates = Jinja2Templates(directory="templates")

@@ -28,72 +23,50 @@ class QueryResponse(BaseModel):
     response: str
     model: str = "llama3.2:1b-papalia"

-…
-    if response.status_code != 200:
-        return {"status": "error", "message": "Ollama no responde"}
-
-    model_response = await client.post(
-        OLLAMA_API_URL,
-        json={
-            "model": "llama3.2:1b-papalia",
-            "prompt": "test",
-            "max_tokens": 1
-        },
-        timeout=5.0
-    )
-…
+async def generate_with_retries(client: httpx.AsyncClient, data: dict, max_retries: int = 3) -> dict:
+    for attempt in range(max_retries):
+        try:
+            response = await client.post(
+                "http://localhost:11434/api/generate",
+                json=data,
+                timeout=90.0
+            )
+            response.raise_for_status()
+            return response.json()
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise
+            await httpx.AsyncClient().aclose()
+            continue

 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
-    …
-    return templates.TemplateResponse("index.html", {"request": request, "title": "Papalia3 Inference", "status": status})
+    return templates.TemplateResponse("index.html", {"request": request})

 @app.post("/generate")
 async def generate_response(query: QueryRequest):
-    logger.info(f"Solicitud recibida: {query.prompt[:50]}...")
     try:
-        async with httpx.AsyncClient(timeout=
-            response = await client.post(
-                OLLAMA_API_URL,
-                json={
-                    "model": "llama3.2:1b-papalia",
-                    "prompt": query.prompt,
-                    "stream": False,
-                    "temperature": query.temperature,
-                    "max_tokens": query.max_tokens
-                }
-            )
-
-        if response.status_code != 200:
-            raise HTTPException(status_code=response.status_code, detail=f"Error del modelo: {response.text}")
-
-        result = response.json()
-        logger.info("Respuesta generada exitosamente")
-        return {"response": result.get("response", ""), "model": "llama3.2:1b-papalia"}
-
-    except httpx.TimeoutException:
-        logger.error("Timeout en solicitud a Ollama")
-        raise HTTPException(status_code=504, detail="Timeout en solicitud al modelo")
+        async with httpx.AsyncClient(timeout=90.0) as client:
+            result = await generate_with_retries(
+                client,
+                {
+                    "model": "llama3.2:1b-papalia",
+                    "prompt": query.prompt,
+                    "temperature": query.temperature,
+                    "max_tokens": query.max_tokens,
+                    "stream": False
+                }
+            )
+        return {"response": result.get("response", ""), "model": "llama3.2:1b-papalia"}
     except Exception as e:
         logger.error(f"Error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

 @app.get("/health")
 async def health_check():
-    …
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            await client.get("http://localhost:11434")
+            return {"status": "healthy"}
+    except Exception as e:
+        return {"status": "unhealthy", "error": str(e)}
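A side note on the new generate_with_retries helper: between failed attempts it awaits httpx.AsyncClient().aclose(), which closes a freshly created, unused client and so adds no real delay before the next attempt. A sketch of the same retry loop with a conventional exponential backoff instead (hypothetical variant, not what the commit ships):

import asyncio
import httpx

async def post_with_retries(url: str, payload: dict, max_retries: int = 3) -> dict:
    # Same shape as the committed helper, but sleep between attempts
    # instead of creating and closing an unused client.
    async with httpx.AsyncClient(timeout=90.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(url, json=payload)
                response.raise_for_status()
                return response.json()
            except Exception:
                if attempt == max_retries - 1:
                    raise
                await asyncio.sleep(2 ** attempt)  # 1s, then 2s before retrying
    raise RuntimeError("unreachable")  # the loop always returns or raises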
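For completeness, a minimal client for the two endpoints this commit touches. The request fields (prompt, temperature, max_tokens) are taken from the QueryRequest usage visible in the diff; the base URL assumes a local run on port 7860 as wired up in the Dockerfile, and the sample prompt is purely illustrative:

import httpx

BASE_URL = "http://localhost:7860"  # port from the uvicorn command in start.sh

def main() -> None:
    # Liveness first: /health reports whether Ollama is reachable.
    print(httpx.get(f"{BASE_URL}/health", timeout=5.0).json())

    # Then a generation request; generous timeout, since the model runs on CPU.
    response = httpx.post(
        f"{BASE_URL}/generate",
        json={"prompt": "¿Qué es el desarrollo humano?", "temperature": 0.7, "max_tokens": 256},
        timeout=120.0,
    )
    response.raise_for_status()
    print(response.json()["response"])

if __name__ == "__main__":
    main()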