# Compose file format version declared by this stack.
# NOTE(review): recent Docker Compose releases treat `version` as obsolete and
# ignore it — confirm whether it can be dropped for the tooling in use.
version: '3'
# Two services on the shared `h2ogpt` network:
#   * h2ogpt - built from the local Dockerfile, runs generate.py against vllm.
#   * vllm   - OpenAI-compatible vLLM API server listening on port 5000.
services:
  h2ogpt:
    # Image is built from ./Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
    restart: always
    shm_size: '2gb'
    # Start only after the vllm healthcheck (defined below) reports healthy.
    depends_on:
      vllm:
        condition: service_healthy
    # Host port comes from H2OGPT_PORT; container port is fixed at 7860.
    ports:
      - '${H2OGPT_PORT}:7860'
    volumes:
      - cache:/workspace/.cache
      - save:/workspace/save
    networks:
      - h2ogpt
    # List-form command: each item is passed as one argument, so the double
    # quotes in --inference_server are part of the argument value itself.
    # NOTE(review): confirm generate.py tolerates the literal quotes.
    command:
      - /workspace/generate.py
      - --inference_server="vllm:vllm:5000"
      - --base_model=${H2OGPT_BASE_MODEL}
      - --langchain_mode=UserData
    # Reserve NVIDIA GPUs with device ids 2 and 3 for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['2', '3']
              capabilities: [gpu]

  vllm:
    image: vllm/vllm-openai:latest
    restart: always
    shm_size: '64gb'
    # Port 5000 is exposed to other containers only; it is not published
    # on the host.
    expose:
      - '5000'
    volumes:
      - cache:/workspace/.cache
    networks:
      - h2ogpt
    entrypoint: python3
    # Folded scalar: the lines below join into one space-separated string.
    # Extra server flags are injected through H2OGPT_VLLM_ARGS.
    command: >-
      -m vllm.entrypoints.openai.api_server
      --port=5000
      --host=0.0.0.0
      ${H2OGPT_VLLM_ARGS}
    environment:
      - NCCL_IGNORE_DISABLED_P2P=1
    # Healthy once GET /v1/models succeeds; checked every 30s with a 5s
    # timeout, up to 20 retries. Requires `curl` inside the image.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://0.0.0.0:5000/v1/models"]
      interval: 30s
      timeout: 5s
      retries: 20
    # Reserve NVIDIA GPUs with device ids 0 and 1 for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '1']
              capabilities: [gpu]
# Named volumes with default settings.
volumes:
  # Mounted at /workspace/.cache by both the h2ogpt and vllm services.
  cache:
  # Mounted at /workspace/save by the h2ogpt service.
  save:
# Network with default settings, joined by both the h2ogpt and vllm services.
networks:
  h2ogpt: