Optimize performance and memory usage
Tu Nombre committed · Commit 2681e2a · 1 Parent(s): 8ced679

Files changed:
- Dockerfile (+27 -22)
- app.py (+30 -57)
Dockerfile
CHANGED
@@ -10,34 +10,39 @@ COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY --chown=user . /app

-[removed block not fully captured in the source: the old Modelfile "PARAMETER" settings and an "ollama" startup command]
+# Optimize the Modelfile for lower memory usage
+RUN printf "FROM llama3
+
+SYSTEM \"Asistente especializado en Desarrollo Humano basado en Papalia 12va Edición.\"
+
+PARAMETER temperature 0.7
+PARAMETER top_k 40
+PARAMETER top_p 0.7
+PARAMETER repeat_penalty 1.1
+PARAMETER num_ctx 1024
+PARAMETER num_thread 4
+PARAMETER num_gpu 0" > /app/Modelfile
+
+# Optimized startup script
+RUN printf '#!/bin/bash
+ulimit -v unlimited
+ollama serve --verbose &
+timeout=60
+until nc -z localhost 11434 || [ $timeout -le 0 ]; do
+  sleep 1
+  ((timeout--))
+done
+cd /app
+ollama create llama3.2:1b-papalia -f Modelfile
+exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1 --limit-concurrency 1' > /app/start.sh

 RUN chmod +x /app/start.sh

 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
+ENV MALLOC_ARENA_MAX=2

-HEALTHCHECK --interval=30s --timeout=10s --start-period=
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
   CMD curl -f http://localhost:7860/health || exit 1

 CMD ["/app/start.sh"]
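Note on the new HEALTHCHECK: it drives curl against the FastAPI /health route defined in app.py below. For illustration only (not part of the commit), a minimal Python equivalent of that probe, using httpx, which the app already depends on:

import sys
import httpx

def probe(url: str = "http://localhost:7860/health") -> int:
    # Mirror `curl -f`: treat connection errors and non-2xx statuses as failure.
    try:
        response = httpx.get(url, timeout=10.0)
        response.raise_for_status()
    except Exception as exc:
        print(f"unhealthy: {exc}", file=sys.stderr)
        return 1  # same exit code the HEALTHCHECK reports on failure
    print(response.json())
    return 0

if __name__ == "__main__":
    sys.exit(probe())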
app.py
CHANGED
@@ -1,5 +1,5 @@
 from fastapi import FastAPI, HTTPException, Request
-from fastapi.responses import HTMLResponse
+from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
@@ -10,12 +10,7 @@ from typing import Optional, Dict, Any
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-app = FastAPI(
-    title="Llama3.2:1b-Papalia Inference API",
-    description="API para interactuar con el modelo especializado en Desarrollo Humano",
-    version="1.0.0"
-)
-
+app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
 templates = Jinja2Templates(directory="templates")

@@ -28,72 +23,50 @@ class QueryResponse(BaseModel):
     response: str
     model: str = "llama3.2:1b-papalia"

-…
-    if response.status_code != 200:
-        return {"status": "error", "message": "Ollama no responde"}
-
-    model_response = await client.post(
-        OLLAMA_API_URL,
-        json={
-            "model": "llama3.2:1b-papalia",
-            "prompt": "test",
-            "max_tokens": 1
-        },
-        timeout=5.0
-    )
-…
+async def generate_with_retries(client: httpx.AsyncClient, data: dict, max_retries: int = 3) -> dict:
+    for attempt in range(max_retries):
+        try:
+            response = await client.post(
+                "http://localhost:11434/api/generate",
+                json=data,
+                timeout=90.0
+            )
+            response.raise_for_status()
+            return response.json()
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise
+            await httpx.AsyncClient().aclose()
+            continue

 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
-    …
-    return templates.TemplateResponse("index.html", {"request": request, "title": "Papalia3 Inference", "status": status})
+    return templates.TemplateResponse("index.html", {"request": request})

 @app.post("/generate")
 async def generate_response(query: QueryRequest):
-    logger.info(f"Solicitud recibida: {query.prompt[:50]}...")
     try:
-        async with httpx.AsyncClient(timeout=
-            response = await client.post(
-                OLLAMA_API_URL,
-                json={
-                    "model": "llama3.2:1b-papalia",
-                    "prompt": query.prompt,
-                    "stream": False,
-                    "temperature": query.temperature,
-                    "max_tokens": query.max_tokens
-                }
-            )
-
-        if response.status_code != 200:
-            raise HTTPException(status_code=response.status_code, detail=f"Error del modelo: {response.text}")
-
-        result = response.json()
-        logger.info("Respuesta generada exitosamente")
-        return {"response": result.get("response", ""), "model": "llama3.2:1b-papalia"}
-
-    except httpx.TimeoutException:
-        logger.error("Timeout en solicitud a Ollama")
-        raise HTTPException(status_code=504, detail="Timeout en solicitud al modelo")
+        async with httpx.AsyncClient(timeout=90.0) as client:
+            result = await generate_with_retries(
+                client,
+                {
+                    "model": "llama3.2:1b-papalia",
+                    "prompt": query.prompt,
+                    "temperature": query.temperature,
+                    "max_tokens": query.max_tokens,
+                    "stream": False
+                }
+            )
+        return {"response": result.get("response", ""), "model": "llama3.2:1b-papalia"}
     except Exception as e:
         logger.error(f"Error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

 @app.get("/health")
 async def health_check():
-    …
+    try:
+        async with httpx.AsyncClient(timeout=5.0) as client:
+            await client.get("http://localhost:11434")
+            return {"status": "healthy"}
+    except Exception as e:
+        return {"status": "unhealthy", "error": str(e)}
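A side note on the new generate_with_retries helper: between failed attempts it awaits httpx.AsyncClient().aclose(), which closes a freshly created, unused client and so adds no real delay before the next attempt. A sketch of the same retry loop with a conventional exponential backoff instead (hypothetical variant, not what the commit ships):

import asyncio
import httpx

async def post_with_retries(url: str, payload: dict, max_retries: int = 3) -> dict:
    # Same shape as the committed helper, but sleep between attempts
    # instead of creating and closing an unused client.
    async with httpx.AsyncClient(timeout=90.0) as client:
        for attempt in range(max_retries):
            try:
                response = await client.post(url, json=payload)
                response.raise_for_status()
                return response.json()
            except Exception:
                if attempt == max_retries - 1:
                    raise
                await asyncio.sleep(2 ** attempt)  # 1s, then 2s before retrying
    raise RuntimeError("unreachable")  # the loop always returns or raises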
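For completeness, a minimal client for the two endpoints this commit touches. The request fields (prompt, temperature, max_tokens) are taken from the QueryRequest usage visible in the diff; the base URL assumes a local run on port 7860 as wired up in the Dockerfile, and the sample prompt is purely illustrative:

import httpx

BASE_URL = "http://localhost:7860"  # port from the uvicorn command in start.sh

def main() -> None:
    # Liveness first: /health reports whether Ollama is reachable.
    print(httpx.get(f"{BASE_URL}/health", timeout=5.0).json())

    # Then a generation request; generous timeout, since the model runs on CPU.
    response = httpx.post(
        f"{BASE_URL}/generate",
        json={"prompt": "¿Qué es el desarrollo humano?", "temperature": 0.7, "max_tokens": 256},
        timeout=120.0,
    )
    response.raise_for_status()
    print(response.json()["response"])

if __name__ == "__main__":
    main()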