Spaces:
Configuration error
ManojINaik
committed on
Upload 4 files
- Dockerfile +26 -0
- README.md +60 -12
- app.py +101 -0
- requirements.txt +9 -0
Dockerfile
ADDED
@@ -0,0 +1,26 @@
FROM python:3.9-slim

WORKDIR /code

# Install system dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create cache directory
RUN mkdir -p ./model_cache

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,12 +1,60 @@
# Fine-Tuned LLM API

This is a FastAPI-based API service for the fine-tuned model "ManojINaik/Strength_weakness". The model is loaded with 4-bit quantization for efficient text-generation inference.

## API Endpoints

### GET /
Health check endpoint that confirms the API is running.

### POST /generate/
Generates text from a prompt, with optional parameters.

#### Request Body
```json
{
  "prompt": "What are the strengths of Python?",
  "history": [],                                            // Optional: list of previous conversation messages
  "system_prompt": "You are a very powerful AI assistant.", // Optional
  "max_length": 200,                                        // Optional: maximum length of the generated text
  "temperature": 0.7                                        // Optional: controls randomness (0.0 to 1.0)
}
```

#### Response
```json
{
  "response": "Generated text response..."
}
```

## Model Details
- Base Model: ManojINaik/Strength_weakness
- Quantization: 4-bit quantization using bitsandbytes (see the configuration sketch below)
- Device: Automatically uses GPU if available, falls back to CPU
- Memory Efficient: Uses device mapping for optimal resource utilization
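For reference, this is how the 4-bit loading is configured in `app.py`, shown here as a standalone sketch; it assumes a CUDA-capable GPU is available for the float16 compute dtype:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# NF4 4-bit quantization with float16 compute, matching the API's startup code
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

tokenizer = AutoTokenizer.from_pretrained("ManojINaik/Strength_weakness")
model = AutoModelForCausalLM.from_pretrained(
    "ManojINaik/Strength_weakness",
    quantization_config=bnb_config,
    device_map="auto",       # spread layers across available devices
    trust_remote_code=True,
)
```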
## Technical Details
- Framework: FastAPI
- Python Version: 3.9+
- Key Dependencies:
  - transformers
  - torch
  - bitsandbytes
  - accelerate
  - peft

## Example Usage
```python
import requests

url = "https://your-space-name.hf.space/generate/"
payload = {
    "prompt": "What are the strengths of Python?",
    "temperature": 0.7,
    "max_length": 200
}

response = requests.post(url, json=payload)
print(response.json()["response"])
```
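The optional `history` and `system_prompt` fields are passed the same way. Below is a sketch of a follow-up request; the Space URL is still a placeholder, and the "Human:"/"Assistant:" prefixes on the history entries are just one way to format prior turns, mirroring how app.py assembles the prompt:

```python
import requests

url = "https://your-space-name.hf.space/generate/"

# Continue a short conversation by passing prior turns as plain strings
payload = {
    "prompt": "And what are its main weaknesses?",
    "system_prompt": "You are a very powerful AI assistant.",
    "history": [
        "Human: What are the strengths of Python?",
        "Assistant: Readability, a large ecosystem, and fast prototyping.",
    ],
    "max_length": 200,
    "temperature": 0.7,
}

response = requests.post(url, json=payload)
print(response.json()["response"])
```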
app.py
ADDED
@@ -0,0 +1,101 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
from typing import Optional, List

app = FastAPI(title="LLM API", description="API for interacting with LLaMA model")

# Model configuration
class ModelConfig:
    model_name = "ManojINaik/Strength_weakness"  # Your fine-tuned model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    max_length = 200
    temperature = 0.7

# Request/Response models
class GenerateRequest(BaseModel):
    prompt: str
    history: Optional[List[str]] = []
    system_prompt: Optional[str] = "You are a very powerful AI assistant."
    max_length: Optional[int] = 200
    temperature: Optional[float] = 0.7

class GenerateResponse(BaseModel):
    response: str

# Global variables for model and tokenizer
model = None
tokenizer = None
generator = None

@app.on_event("startup")
async def load_model():
    global model, tokenizer, generator
    try:
        print("Loading model and tokenizer...")

        # Configure 4-bit (NF4) quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False
        )

        tokenizer = AutoTokenizer.from_pretrained(ModelConfig.model_name)
        # LLaMA-style tokenizers often ship without a pad token; fall back to EOS
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            ModelConfig.model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )

        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map="auto"
        )
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise e

@app.post("/generate/", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    if generator is None:
        raise HTTPException(status_code=500, detail="Model not loaded")

    try:
        # Format the prompt with system prompt and chat history
        formatted_prompt = f"{request.system_prompt}\n\n"
        for msg in request.history:
            formatted_prompt += f"{msg}\n"
        formatted_prompt += f"Human: {request.prompt}\nAssistant:"

        # Generate response
        outputs = generator(
            formatted_prompt,
            max_length=request.max_length,
            temperature=request.temperature,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Extract the generated text
        generated_text = outputs[0]['generated_text']

        # Remove the prompt from the response
        response = generated_text.split("Assistant:")[-1].strip()

        return {"response": response}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating text: {str(e)}")

@app.get("/")
def root():
    return {"message": "LLM API is running. Use the /generate/ endpoint for text generation."}
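One possible way to exercise the API locally before pushing to the Space is FastAPI's TestClient. A minimal sketch is below; the filename test_api.py is hypothetical, TestClient additionally needs httpx installed, and entering the client context runs the startup event, which downloads and loads the full quantized model (slow, and it needs enough GPU/CPU memory):

```python
# test_api.py -- local smoke test for the Space's API (assumed filename)
from fastapi.testclient import TestClient

from app import app

# The "with" block triggers the startup event, so the model is loaded for real
with TestClient(app) as client:
    # Health check
    assert client.get("/").status_code == 200

    # One generation request against the loaded model
    r = client.post("/generate/", json={"prompt": "What are the strengths of Python?"})
    r.raise_for_status()
    print(r.json()["response"])
```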
requirements.txt
ADDED
@@ -0,0 +1,9 @@
fastapi==0.104.1
uvicorn==0.24.0
huggingface-hub==0.19.4
pydantic==2.5.2
transformers==4.35.2
torch==2.1.1
accelerate==0.24.1
bitsandbytes==0.41.1
peft==0.6.0