[tool.poetry]
name = "llm-engineering"
version = "0.1.0"
description = ""
authors = ["iusztinpaul"]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "~3.11"
pymongo = "^4.6.2"
click = "^8.0.1"
loguru = "^0.7.2"
rich = "^13.7.1"
numpy = "^1.26.4"
poethepoet = "0.29.0"
datasets = "^3.0.1"

# Digital data ETL
selenium = "^4.21.0"
webdriver-manager = "^4.0.1"
beautifulsoup4 = "^4.12.3"
html2text = "^2024.2.26"
jmespath = "^1.0.1"
chromedriver-autoinstaller = "^0.6.4"

# Feature engineering
qdrant-client = "^1.8.0"
langchain = "^0.3.9"
sentence-transformers = "^3.0.0"

# RAG
langchain-openai = "^0.2.11"
jinja2 = "^3.1.4"
tiktoken = "^0.7.0"
fake-useragent = "^1.5.1"
langchain-community = "^0.3.9"

# Inference
fastapi = ">=0.115.2,<1.0"
uvicorn = "^0.30.6"
opik = "^0.2.2"

langchain-core = "^0.3.21"
langchain-ollama = "^0.2.1"
gradio = "^5.8.0"
clearml = "^1.16.5"
python-dotenv = "^1.0.1"

[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
pre-commit = "^3.7.1"
pytest = "^8.2.2"

[tool.poetry.group.aws.dependencies]
sagemaker = ">=2.232.2"
s3fs = ">2022.3.0"
aws-profile-manager = "^0.7.3"
kubernetes = "^30.1.0"
sagemaker-huggingface-inference-toolkit = "^2.4.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# ----------------------------------
# --- Poe the Poet Configuration ---
# ----------------------------------

[tool.poe.tasks]
# Data pipelines
run-digital-data-etl-cs370 = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_cs370.yaml"
run-digital-data-etl = [
    "run-digital-data-etl-cs370",
]
run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering"
run-generate-instruct-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-instruct-datasets"
run-generate-preference-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-preference-datasets"
run-end-to-end-data-pipeline = "poetry run python -m tools.run --no-cache --run-end-to-end-data"

# Utility pipelines
run-export-artifact-to-json-pipeline = "poetry run python -m tools.run --no-cache --run-export-artifact-to-json"
run-export-data-warehouse-to-json = "poetry run python -m tools.data_warehouse --export-raw-data"
run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse --import-raw-data"

# Training pipelines
run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training"
run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation"

# Inference
call-rag-retrieval-module = "poetry run python -m tools.rag"
run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload"
call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' -H 'Content-Type: application/json' -d '{\"query\": \"My name is Paul Iusztin. Could you draft a LinkedIn post discussing RAG systems? I am particularly interested in how RAG works and how it is integrated with vector DBs and LLMs.\"}'"

# Infrastructure
## Local infrastructure
local-docker-infrastructure-up = "docker compose up -d"
local-docker-infrastructure-down = "docker compose stop"
local-zenml-server-down = "poetry run zenml down"
local-infrastructure-up = [
    "local-docker-infrastructure-up",
    "local-zenml-server-down",
    "local-zenml-server-up",
]
local-infrastructure-down = [
    "local-docker-infrastructure-down",
    "local-zenml-server-down",
]
set-local-stack = "poetry run zenml stack set default"
set-aws-stack = "poetry run zenml stack set aws-stack"
set-asynchronous-runs = "poetry run zenml orchestrator update aws-stack --synchronous=False"
zenml-server-disconnect = "poetry run zenml disconnect"

## Settings
export-settings-to-zenml = "poetry run python -m tools.run --export-settings"
delete-settings-zenml = "poetry run zenml secret delete settings"

## SageMaker
create-sagemaker-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_sagemaker_role"
create-sagemaker-execution-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_execution_role"
deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run"
test-sagemaker-endpoint = "poetry run python -m llm_engineering.model.inference.test"
delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint"

## Docker
build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ."
run-docker-end-to-end-data-pipeline = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe run-end-to-end-data-pipeline"
bash-docker-container = "docker run --rm -it --network host --env-file .env llmtwin bash"

# QA
lint-check = "poetry run ruff check ."
format-check = "poetry run ruff format --check ."
lint-check-docker = "sh -c 'docker run --rm -i hadolint/hadolint < Dockerfile'"
gitleaks-check = "docker run -v .:/src zricethezav/gitleaks:latest dir /src/llm_engineering"
lint-fix = "poetry run ruff check --fix ."
format-fix = "poetry run ruff format ."

[tool.poe.tasks.local-zenml-server-up]
control.expr = "sys.platform"

[[tool.poe.tasks.local-zenml-server-up.switch]]
case = "darwin"
env = { OBJC_DISABLE_INITIALIZE_FORK_SAFETY = "YES" }
cmd = "poetry run zenml up"

[[tool.poe.tasks.local-zenml-server-up.switch]]
cmd = "poetry run zenml up"

# Tests
[tool.poe.tasks.test]
cmd = "poetry run pytest tests/"
env = { ENV_FILE = ".env.testing" }
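# -----------------------------------
# --- Usage notes (sketch) ----------
# -----------------------------------
# The tasks above are invoked from the repository root, after `poetry install`,
# either as `poetry poe <task>` (assuming the poethepoet poetry plugin is set up)
# or as `poetry run poe <task>`. For example:
#
#   poetry poe local-infrastructure-up     # starts Docker services, then restarts the local ZenML server
#   poetry poe run-digital-data-etl        # sequence task: runs run-digital-data-etl-cs370
#   poetry poe run-end-to-end-data-pipeline
#   poetry poe test                        # runs pytest with ENV_FILE=.env.testing
#
# Array-valued tasks such as `run-digital-data-etl`, `local-infrastructure-up`,
# and `local-infrastructure-down` are Poe sequence tasks and execute their
# listed sub-tasks in order.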