Tu Nombre committed on
Commit 2681e2a
1 Parent(s): 8ced679

Optimize performance and memory usage

Files changed (2)
  1. Dockerfile +27 -22
  2. app.py +30 -57
Dockerfile CHANGED
@@ -10,34 +10,39 @@ COPY --chown=user ./requirements.txt requirements.txt
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
  COPY --chown=user . /app
  
- RUN printf "FROM llama3\n\
- SYSTEM \"Eres un asistente especializado en Desarrollo Humano, basado en la duodécima edición del libro de Papalia.\"\n\
- PARAMETER temperature 0.7\n\
- PARAMETER top_k 40\n\
- PARAMETER top_p 0.7\n\
- PARAMETER repeat_penalty 1.1\n\
- PARAMETER num_ctx 2048" > /app/Modelfile
- 
- RUN printf '#!/bin/bash\n\
- ulimit -v unlimited\n\
- ollama serve &\n\
- for i in {1..60}; do\n\
- if nc -z localhost 11434; then break; fi\n\
- sleep 1\n\
- done\n\
- cd /app\n\
- ollama create llama3.2:1b-papalia -f Modelfile\n\
- if ! ollama list | grep -q "llama3.2:1b-papalia"; then\n\
- exit 1\n\
- fi\n\
- exec uvicorn app:app --host 0.0.0.0 --port 7860 --timeout-keep-alive 120 --workers 1' > /app/start.sh
+ # Optimize the Modelfile for lower memory usage
+ RUN printf "FROM llama3\n\
+ \n\
+ SYSTEM \"Asistente especializado en Desarrollo Humano basado en Papalia 12va Edición.\"\n\
+ \n\
+ PARAMETER temperature 0.7\n\
+ PARAMETER top_k 40\n\
+ PARAMETER top_p 0.7\n\
+ PARAMETER repeat_penalty 1.1\n\
+ PARAMETER num_ctx 1024\n\
+ PARAMETER num_thread 4\n\
+ PARAMETER num_gpu 0" > /app/Modelfile
+ 
+ # Optimized startup script
+ RUN printf '#!/bin/bash\n\
+ ulimit -v unlimited\n\
+ ollama serve &\n\
+ timeout=60\n\
+ until nc -z localhost 11434 || [ $timeout -le 0 ]; do\n\
+ sleep 1\n\
+ ((timeout--))\n\
+ done\n\
+ cd /app\n\
+ ollama create llama3.2:1b-papalia -f Modelfile\n\
+ exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1 --limit-concurrency 1' > /app/start.sh
  
  RUN chmod +x /app/start.sh
  
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"
+ ENV MALLOC_ARENA_MAX=2
  
- HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
  CMD curl -f http://localhost:7860/health || exit 1
  
  CMD ["/app/start.sh"]
app.py CHANGED
@@ -1,5 +1,5 @@
  from fastapi import FastAPI, HTTPException, Request
- from fastapi.responses import HTMLResponse, JSONResponse
+ from fastapi.responses import HTMLResponse
  from fastapi.templating import Jinja2Templates
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel
@@ -10,12 +10,7 @@ from typing import Optional, Dict, Any
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
  
- app = FastAPI(
-     title="Llama3.2:1b-Papalia Inference API",
-     description="API para interactuar con el modelo especializado en Desarrollo Humano",
-     version="1.0.0"
- )
- 
+ app = FastAPI()
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
  templates = Jinja2Templates(directory="templates")
  
@@ -28,72 +23,50 @@ class QueryResponse(BaseModel):
      response: str
      model: str = "llama3.2:1b-papalia"
  
- OLLAMA_API_URL = "http://localhost:11434/api/generate"
- OLLAMA_BASE_URL = "http://localhost:11434"
- 
- async def check_ollama_status() -> Dict[str, Any]:
-     try:
-         async with httpx.AsyncClient(timeout=5.0) as client:
-             response = await client.get(OLLAMA_BASE_URL)
-             if response.status_code != 200:
-                 return {"status": "error", "message": "Ollama no responde"}
- 
-             model_response = await client.post(
-                 OLLAMA_API_URL,
-                 json={
-                     "model": "llama3.2:1b-papalia",
-                     "prompt": "test",
-                     "max_tokens": 1
-                 },
-                 timeout=5.0
+ async def generate_with_retries(client: httpx.AsyncClient, data: dict, max_retries: int = 3) -> dict:
+     for attempt in range(max_retries):
+         try:
+             response = await client.post(
+                 "http://localhost:11434/api/generate",
+                 json=data,
+                 timeout=90.0
              )
-             if model_response.status_code != 200:
-                 return {"status": "error", "message": "Modelo no disponible"}
-             return {"status": "ok", "message": "Servicio activo"}
-     except Exception as e:
-         return {"status": "error", "message": str(e)}
+             response.raise_for_status()
+             return response.json()
+         except Exception:
+             if attempt == max_retries - 1:
+                 raise
+             # Transient failure: retry on the next loop iteration
+             continue
  
  @app.get("/", response_class=HTMLResponse)
  async def read_root(request: Request):
-     status = await check_ollama_status()
-     return templates.TemplateResponse("index.html", {"request": request, "title": "Papalia3 Inference", "status": status})
+     return templates.TemplateResponse("index.html", {"request": request})
  
  @app.post("/generate")
  async def generate_response(query: QueryRequest):
-     logger.info(f"Solicitud recibida: {query.prompt[:50]}...")
      try:
-         async with httpx.AsyncClient(timeout=60.0) as client:
-             status = await check_ollama_status()
-             if status["status"] != "ok":
-                 raise HTTPException(status_code=503, detail=status["message"])
- 
-             response = await client.post(
-                 OLLAMA_API_URL,
-                 json={
+         async with httpx.AsyncClient(timeout=90.0) as client:
+             result = await generate_with_retries(
+                 client,
+                 {
                      "model": "llama3.2:1b-papalia",
                      "prompt": query.prompt,
-                     "stream": False,
                      "temperature": query.temperature,
-                     "max_tokens": query.max_tokens
-                 },
-                 timeout=60.0
+                     "max_tokens": query.max_tokens,
+                     "stream": False
+                 }
              )
- 
-             if response.status_code != 200:
-                 raise HTTPException(status_code=response.status_code, detail=f"Error del modelo: {response.text}")
- 
-             result = response.json()
-             logger.info("Respuesta generada exitosamente")
              return {"response": result.get("response", ""), "model": "llama3.2:1b-papalia"}
- 
-     except httpx.TimeoutException:
-         logger.error("Timeout en solicitud a Ollama")
-         raise HTTPException(status_code=504, detail="Timeout en solicitud al modelo")
      except Exception as e:
          logger.error(f"Error: {str(e)}")
          raise HTTPException(status_code=500, detail=str(e))
  
  @app.get("/health")
  async def health_check():
-     status = await check_ollama_status()
-     return {"status": "healthy", "message": status["message"]} if status["status"] == "ok" else {"status": "unhealthy", "error": status["message"]}
+     try:
+         async with httpx.AsyncClient(timeout=5.0) as client:
+             await client.get("http://localhost:11434")
+             return {"status": "healthy"}
+     except Exception as e:
+         return {"status": "unhealthy", "error": str(e)}