SkazuHD committed
Commit d660b02
1 Parent(s): 12ac4f1

init space
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. Dockerfile +52 -0
  2. LICENSE +21 -0
  3. README.md +681 -8
  4. app/app.py +34 -0
  5. code_snippets/03_custom_odm_example.py +10 -0
  6. code_snippets/03_orm.py +37 -0
  7. code_snippets/08_instructor_embeddings.py +18 -0
  8. code_snippets/08_text_embeddings.py +28 -0
  9. code_snippets/08_text_image_embeddings.py +37 -0
  10. configs/digital_data_etl_cs370.yaml +14 -0
  11. configs/end_to_end_data.yaml +87 -0
  12. configs/evaluating.yaml +9 -0
  13. configs/export_artifact_to_json.yaml +13 -0
  14. configs/feature_engineering.yaml +10 -0
  15. configs/generate_instruct_datasets.yaml +13 -0
  16. configs/generate_preference_datasets.yaml +13 -0
  17. configs/training.yaml +14 -0
  18. data/artifacts/cleaned_documents.json +0 -0
  19. data/artifacts/instruct_datasets.json +0 -0
  20. data/artifacts/preference_datasets.json +0 -0
  21. data/artifacts/raw_documents.json +0 -0
  22. data/data_warehouse_raw_data/ArticleDocument.json +0 -0
  23. data/data_warehouse_raw_data/PostDocument.json +1 -0
  24. data/data_warehouse_raw_data/RepositoryDocument.json +1 -0
  25. data/data_warehouse_raw_data/UserDocument.json +1 -0
  26. docker-compose.yml +83 -0
  27. images/cover_plus.png +0 -0
  28. images/crazy_cat.jpg +0 -0
  29. llm_engineering/__init__.py +4 -0
  30. llm_engineering/application/__init__.py +3 -0
  31. llm_engineering/application/crawlers/__init__.py +7 -0
  32. llm_engineering/application/crawlers/__pycache__/__init__.cpython-311.pyc +0 -0
  33. llm_engineering/application/crawlers/__pycache__/base.cpython-311.pyc +0 -0
  34. llm_engineering/application/crawlers/__pycache__/custom_article.cpython-311.pyc +0 -0
  35. llm_engineering/application/crawlers/__pycache__/dispatcher.cpython-311.pyc +0 -0
  36. llm_engineering/application/crawlers/__pycache__/github.cpython-311.pyc +0 -0
  37. llm_engineering/application/crawlers/__pycache__/linkedin.cpython-311.pyc +0 -0
  38. llm_engineering/application/crawlers/__pycache__/medium.cpython-311.pyc +0 -0
  39. llm_engineering/application/crawlers/base.py +63 -0
  40. llm_engineering/application/crawlers/custom_article.py +54 -0
  41. llm_engineering/application/crawlers/dispatcher.py +39 -0
  42. llm_engineering/application/crawlers/github.py +158 -0
  43. llm_engineering/application/dataset/__init__.py +3 -0
  44. llm_engineering/application/dataset/__pycache__/__init__.cpython-311.pyc +0 -0
  45. llm_engineering/application/dataset/__pycache__/constants.cpython-311.pyc +0 -0
  46. llm_engineering/application/dataset/__pycache__/generation.cpython-311.pyc +0 -0
  47. llm_engineering/application/dataset/__pycache__/output_parsers.cpython-311.pyc +0 -0
  48. llm_engineering/application/dataset/__pycache__/utils.cpython-311.pyc +0 -0
  49. llm_engineering/application/dataset/constants.py +26 -0
  50. llm_engineering/application/dataset/generation.py +260 -0
Dockerfile ADDED
@@ -0,0 +1,52 @@
+ FROM python:3.11-slim-bullseye AS release
+ 
+ ENV WORKSPACE_ROOT=/llm_engineering/
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     POETRY_VERSION=1.8.3 \
+     DEBIAN_FRONTEND=noninteractive \
+     POETRY_NO_INTERACTION=1
+ 
+ RUN apt-get update -y && apt-get install -y --no-install-recommends \
+     wget \
+     curl \
+     gnupg \
+     build-essential \
+     gcc \
+     python3-dev \
+     libglib2.0-dev \
+     libnss3-dev \
+     && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | \
+     gpg --dearmor -o /usr/share/keyrings/google-linux-signing-key.gpg \
+     && echo "deb [signed-by=/usr/share/keyrings/google-linux-signing-key.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+     && apt-get update -y && apt-get install -y --no-install-recommends google-chrome-stable \
+     && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/*
+ 
+ RUN pip install --no-cache-dir "poetry==$POETRY_VERSION" && \
+     poetry config installer.max-workers 20 && \
+     poetry config virtualenvs.create false
+ 
+ WORKDIR $WORKSPACE_ROOT
+ 
+ COPY pyproject.toml poetry.lock $WORKSPACE_ROOT
+ RUN poetry install --no-root --no-interaction --no-cache --without dev && \
+     poetry self add 'poethepoet[poetry_plugin]' && \
+     rm -rf ~/.cache/pypoetry/*
+ 
+ RUN curl -fsSL https://ollama.com/install.sh | sh
+ RUN ollama --version
+ 
+ RUN bash -c "ollama serve & sleep 5 && ollama pull llama3.1"
+ 
+ # Ensure app.py is copied
+ 
+ EXPOSE 7860
+ 
+ COPY . $WORKSPACE_ROOT
+ 
+ RUN poetry install
+ 
+ #ENTRYPOINT ["bash", "-c", "pwd && ls && poetry run python3 ./app/app.py"]
+ CMD ["bash", "-c", "ollama serve & poetry run python3 ./app/app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+ 
+ Copyright (c) 2024 Packt
+ 
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ 
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,683 @@
  ---
- title: Docker Test
- emoji: 🐨
- colorFrom: gray
- colorTo: indigo
- sdk: docker
- pinned: false
- ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # CS370 Project
+ 
+ In this project, we build a Retrieval-Augmented Generation (RAG) system. RAG is a recent paradigm for large-scale language understanding tasks. It combines the strengths of retrieval-based and generation-based models, enabling the model to retrieve relevant information from a large corpus and generate a coherent, domain-specific response.
+ 
+ ## Team Members
+ 
+ Jonah-Alexander Loewnich
+ - Github:
+ - HuggingFace:
+ 
+ Thomas Gammer
+ - Github:
+ - HuggingFace:
+ 
+ ## Docker Containers
+ 
+ Here are the Docker containers up and running.
+ ![Docker Containers](./screenshots/container.png)
+ 
+ ## Crawled Resources
+ 
+ - https://github.com/ros-infrastructure/www.ros.org/
+ - https://github.com/ros-navigation/docs.nav2.org
+ - https://github.com/moveit/moveit2
+ - https://github.com/gazebosim/gz-sim
+ 
+ ## LLM + RAG Responses
+ 
+ Here is our model's response to the first question.
+ ![Question 1 Response](./screenshots/127.0.0.1_7860__1-1.png)
+ 
+ Here is our model's response to the second question.
+ ![Question 2 Response](./screenshots/127.0.0.1_7860__2-1.png)
+ 
  ---
 
+ <div align="center">
+ <h1>👷 LLM Engineer's Handbook</h1>
+ <p class="tagline">Official repository of the <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">LLM Engineer's Handbook</a> by <a href="https://github.com/iusztinpaul">Paul Iusztin</a> and <a href="https://github.com/mlabonne">Maxime Labonne</a></p>
+ </div>
+ </br>
+ 
+ <p align="center">
+ <a href="https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/">
+ <img src="images/cover_plus.png" alt="Book cover">
+ </a>
+ </p>
+ 
+ ## 🌟 Features
+ 
+ The goal of this book is to create your own end-to-end LLM-based system using best practices:
+ 
+ - 📝 Data collection & generation
+ - 🔄 LLM training pipeline
+ - 📊 Simple RAG system
+ - 🚀 Production-ready AWS deployment
+ - 🔍 Comprehensive monitoring
+ - 🧪 Testing and evaluation framework
+ 
+ You can download and use the final trained model on [Hugging Face](https://huggingface.co/mlabonne/TwinLlama-3.1-8B-DPO).
+ 
+ ## 🔗 Dependencies
+ 
+ ### Local dependencies
+ 
+ To install and run the project locally, you need the following dependencies.
+ 
+ | Tool | Version | Purpose | Installation Link |
+ |------|---------|---------|------------------|
+ | pyenv | ≥2.3.36 | Multiple Python versions (optional) | [Install Guide](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) |
+ | Python | 3.11 | Runtime environment | [Download](https://www.python.org/downloads/) |
+ | Poetry | ≥1.8.3 | Package management | [Install Guide](https://python-poetry.org/docs/#installation) |
+ | Docker | ≥27.1.1 | Containerization | [Install Guide](https://docs.docker.com/engine/install/) |
+ | AWS CLI | ≥2.15.42 | Cloud management | [Install Guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) |
+ | Git | ≥2.44.0 | Version control | [Download](https://git-scm.com/downloads) |
+ 
+ ### Cloud services
+ 
+ The code also uses and depends on the following cloud services. For now, you don't have to do anything. We will guide you in the installation and deployment sections on how to use them:
+ 
+ | Service | Purpose |
+ |---------|---------|
+ | [HuggingFace](https://huggingface.com/) | Model registry |
+ | [Comet ML](https://www.comet.com/site/) | Experiment tracker |
+ | [Opik](https://www.comet.com/site/products/opik/) | Prompt monitoring |
+ | [ZenML](https://www.zenml.io/) | Orchestrator and artifacts layer |
+ | [AWS](https://aws.amazon.com/) | Compute and storage |
+ | [MongoDB](https://www.mongodb.com/) | NoSQL database |
+ | [Qdrant](https://qdrant.tech/) | Vector database |
+ | [GitHub Actions](https://github.com/features/actions) | CI/CD pipeline |
+ 
+ In the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/), Chapter 2 will walk you through each tool. Chapters 10 and 11 provide step-by-step guides on how to set up everything you need.
+ 
+ ## 🗂️ Project Structure
+ 
+ Here is the directory overview:
+ 
+ ```bash
+ .
+ ├── code_snippets/       # Standalone example code
+ ├── configs/             # Pipeline configuration files
+ ├── llm_engineering/     # Core project package
+ │   ├── application/
+ │   ├── domain/
+ │   ├── infrastructure/
+ │   ├── model/
+ ├── pipelines/           # ML pipeline definitions
+ ├── steps/               # Pipeline components
+ ├── tests/               # Test examples
+ ├── tools/               # Utility scripts
+ │   ├── run.py
+ │   ├── ml_service.py
+ │   ├── rag.py
+ │   ├── data_warehouse.py
+ ```
+ 
+ `llm_engineering/` is the main Python package implementing LLM and RAG functionality. It follows Domain-Driven Design (DDD) principles:
+ 
+ - `domain/`: Core business entities and structures
+ - `application/`: Business logic, crawlers, and RAG implementation
+ - `model/`: LLM training and inference
+ - `infrastructure/`: External service integrations (AWS, Qdrant, MongoDB, FastAPI)
+ 
+ The code logic and imports flow as follows: `infrastructure` → `model` → `application` → `domain`
+ 
+ `pipelines/`: Contains the ZenML ML pipelines, which serve as the entry point for all the ML pipelines. Coordinates the data processing and model training stages of the ML lifecycle.
+ 
+ `steps/`: Contains individual ZenML steps, which are reusable components for building and customizing ZenML pipelines. Steps perform specific tasks (e.g., data loading, preprocessing) and can be combined within the ML pipelines.
+ 
+ `tests/`: Covers a few sample tests used as examples within the CI pipeline.
+ 
+ `tools/`: Utility scripts used to call the ZenML pipelines and inference code:
+ - `run.py`: Entry point script to run ZenML pipelines.
+ - `ml_service.py`: Starts the REST API inference server.
+ - `rag.py`: Demonstrates usage of the RAG retrieval module.
+ - `data_warehouse.py`: Used to export or import data from the MongoDB data warehouse through JSON files.
+ 
+ `configs/`: ZenML YAML configuration files to control the execution of pipelines and steps.
+ 
+ `code_snippets/`: Standalone code examples that can be run independently of the main project.
+ 
+ ## 💻 Installation
+ 
+ ### 1. Clone the Repository
+ 
+ Start by cloning the repository and navigating to the project directory:
+ 
+ ```bash
+ git clone https://github.com/PacktPublishing/LLM-Engineers-Handbook.git
+ cd LLM-Engineers-Handbook
+ ```
+ 
+ Next, prepare your Python environment and its dependencies.
+ 
+ ### 2. Set Up Python Environment
+ 
+ The project requires Python 3.11. You can either use your global Python installation or set up a project-specific version using pyenv.
+ 
+ #### Option A: Using Global Python (if version 3.11 is installed)
+ 
+ Verify your Python version:
+ 
+ ```bash
+ python --version # Should show Python 3.11.x
+ ```
+ 
+ #### Option B: Using pyenv (recommended)
+ 
+ 1. Verify pyenv installation:
+ 
+ ```bash
+ pyenv --version # Should show pyenv 2.3.36 or later
+ ```
+ 
+ 2. Install Python 3.11.8:
+ 
+ ```bash
+ pyenv install 3.11.8
+ ```
+ 
+ 3. Verify the installation:
+ 
+ ```bash
+ python --version # Should show Python 3.11.8
+ ```
+ 
+ 4. Confirm the Python version in the project directory:
+ 
+ ```bash
+ python --version
+ # Output: Python 3.11.8
+ ```
+ 
+ > [!NOTE]
+ > The project includes a `.python-version` file that automatically sets the correct Python version when you're in the project directory.
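If that file is ever missing (e.g., in a fresh fork), you can recreate it with pyenv's standard `local` command, which is what writes `.python-version`:

```bash
pyenv local 3.11.8   # writes .python-version in the current directory
cat .python-version  # -> 3.11.8
```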
+ 
+ ### 3. Install Dependencies
+ 
+ The project uses Poetry for dependency management.
+ 
+ 1. Verify Poetry installation:
+ 
+ ```bash
+ poetry --version # Should show Poetry version 1.8.3 or later
+ ```
+ 
+ 2. Set up the project environment and install dependencies:
+ 
+ ```bash
+ poetry env use 3.11
+ poetry install --without aws
+ poetry run pre-commit install
+ ```
+ 
+ This will:
+ 
+ - Configure Poetry to use Python 3.11
+ - Install project dependencies (excluding AWS-specific packages)
+ - Set up pre-commit hooks for code verification
+ 
+ ### 4. Activate the Environment
+ 
+ We run all the project scripts using [Poe the Poet](https://poethepoet.natn.io/index.html) as our task manager.
+ 
+ 1. Start a Poetry shell:
+ 
+ ```bash
+ poetry shell
+ ```
+ 
+ 2. Run project commands using Poe the Poet:
+ 
+ ```bash
+ poetry poe ...
+ ```
+ 
+ <details>
+ <summary>🔧 Troubleshooting Poe the Poet Installation</summary>
+ 
+ ### Alternative Command Execution
+ 
+ If you're experiencing issues with `poethepoet`, you can still run the project commands directly through Poetry. Here's how:
+ 
+ 1. Look up the command definition in `pyproject.toml`
+ 2. Use `poetry run` with the underlying command
+ 
+ #### Example:
+ Instead of:
+ ```bash
+ poetry poe local-infrastructure-up
+ ```
+ Use the direct command from pyproject.toml:
+ ```bash
+ poetry run <actual-command-from-pyproject-toml>
+ ```
+ Note: All project commands are defined in the [tool.poe.tasks] section of pyproject.toml
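For instance, a quick way to look up what a given task expands to (the task name is just an illustration):

```bash
# Show the task definition plus a few lines of context.
grep -n -A 3 "local-infrastructure-up" pyproject.toml
```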
+ </details>
+ 
+ Now, let's configure our local project with all the necessary credentials and tokens to run the code locally.
+ 
+ ### 5. Local Development Setup
+ 
+ After you have installed all the dependencies, you must create and fill a `.env` file with your credentials to appropriately interact with other services and run the project. Setting your sensitive credentials in a `.env` file is a good security practice, as this file won't be committed to GitHub or shared with anyone else.
+ 
+ 1. First, copy our example by running the following:
+ 
+ ```bash
+ cp .env.example .env # The file must be at your repository's root!
+ ```
+ 
+ 2. Now, let's understand how to fill in all the essential variables within the `.env` file to get you started. The following are the mandatory settings we must complete when working locally:
+ 
+ #### OpenAI
+ 
+ To authenticate to OpenAI's API, you must fill out the `OPENAI_API_KEY` env var with an authentication token.
+ 
+ ```env
+ OPENAI_API_KEY=your_api_key_here
+ ```
+ 
+ → Check out this [tutorial](https://platform.openai.com/docs/quickstart) to learn how to get one from OpenAI.
+ 
+ #### Hugging Face
+ 
+ To authenticate to Hugging Face, you must fill out the `HUGGINGFACE_ACCESS_TOKEN` env var with an authentication token.
+ 
+ ```env
+ HUGGINGFACE_ACCESS_TOKEN=your_token_here
+ ```
+ 
+ → Check out this [tutorial](https://huggingface.co/docs/hub/en/security-tokens) to learn how to get one from Hugging Face.
+ 
+ #### Comet ML & Opik
+ 
+ To authenticate to Comet ML (required only during training) and Opik, you must fill out the `COMET_API_KEY` env var with your authentication token.
+ 
+ ```env
+ COMET_API_KEY=your_api_key_here
+ ```
+ 
+ → Check out this [tutorial](https://www.comet.com/docs/v2/api-and-sdk/rest-api/overview/) to learn how to get the Comet ML variables from above. You can also access Opik's dashboard using 🔗[this link](https://www.comet.com/opik).
+ 
+ ### 6. Deployment Setup
+ 
+ When deploying the project to the cloud, we must set additional settings for Mongo, Qdrant, and AWS. If you are just working locally, the default values of these env vars will work out of the box. Detailed deployment instructions are available in Chapter 11 of the [LLM Engineer's Handbook](https://www.amazon.com/LLM-Engineers-Handbook-engineering-production/dp/1836200072/).
+ 
+ #### MongoDB
+ 
+ We must change the `DATABASE_HOST` env var to the URL pointing to your cloud MongoDB cluster.
+ 
+ ```env
+ DATABASE_HOST=your_mongodb_url
+ ```
+ 
+ → Check out this [tutorial](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) to learn how to create and host a MongoDB cluster for free.
+ 
+ #### Qdrant
+ 
+ Change `USE_QDRANT_CLOUD` to `true`, `QDRANT_CLOUD_URL` to the URL pointing to your cloud Qdrant cluster, and `QDRANT_APIKEY` to its API key.
+ 
+ ```env
+ USE_QDRANT_CLOUD=true
+ QDRANT_CLOUD_URL=your_qdrant_cloud_url
+ QDRANT_APIKEY=your_qdrant_api_key
+ ```
+ 
+ → Check out this [tutorial](https://qdrant.tech/documentation/cloud/create-cluster/) to learn how to create a Qdrant cluster for free.
+ 
+ #### AWS
+ 
+ For your AWS setup to work correctly, you need the AWS CLI installed on your local machine and properly configured with an admin user (or a user with enough permissions to create new SageMaker, ECR, and S3 resources; using an admin user will make everything more straightforward).
+ 
+ Chapter 2 provides step-by-step instructions on how to install the AWS CLI, create an admin user on AWS, and get an access key to set up the `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` environment variables. If you already have an AWS admin user in place, you have to configure the following env vars in your `.env` file:
+ 
+ ```bash
+ AWS_REGION=eu-central-1 # Change it with your AWS region.
+ AWS_ACCESS_KEY=your_aws_access_key
+ AWS_SECRET_KEY=your_aws_secret_key
+ ```
+ 
+ AWS credentials are typically stored in `~/.aws/credentials`. You can view this file directly using `cat` or similar commands:
+ 
+ ```bash
+ cat ~/.aws/credentials
+ ```
+ 
+ > [!IMPORTANT]
+ > Additional configuration options are available in [settings.py](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/settings.py). Any variable in the `Settings` class can be configured through the `.env` file.
+ 
+ ## 🏗️ Infrastructure
+ 
+ ### Local infrastructure (for testing and development)
+ 
+ When running the project locally, we host a MongoDB and a Qdrant database using Docker. Also, a testing ZenML server is made available through their Python package.
+ 
+ > [!WARNING]
+ > You need Docker installed (>= v27.1.1)
+ 
+ For ease of use, you can start the whole local development infrastructure with the following command:
+ ```bash
+ poetry poe local-infrastructure-up
+ ```
+ 
+ Also, you can stop the ZenML server and all the Docker containers using the following command:
+ ```bash
+ poetry poe local-infrastructure-down
+ ```
+ 
+ > [!WARNING]
+ > When running on macOS, before starting the server, export the following environment variable:
+ > `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`
+ > Otherwise, the connection between the local server and pipeline will break. 🔗 More details in [this issue](https://github.com/zenml-io/zenml/issues/2369).
+ > This is done by default when using Poe the Poet.
+ 
+ Start the real-time inference RESTful API:
+ ```bash
+ poetry poe run-inference-ml-service
+ ```
+ 
+ > [!IMPORTANT]
+ > The LLM microservice, called by the RESTful API, will work only after deploying the LLM to AWS SageMaker.
+ 
+ #### ZenML
+ 
+ Dashboard URL: `localhost:8237`
+ 
+ Default credentials:
+ - `username`: default
+ - `password`: (leave it empty)
+ 
+ → Find out more about using and setting up [ZenML](https://docs.zenml.io/).
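As a quick sanity check that the local ZenML server is reachable, you can query it from the ZenML CLI (a sketch, assuming the ZenML version pinned by the project exposes this command):

```bash
poetry run zenml status
```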
+ 
+ #### Qdrant
+ 
+ REST API URL: `localhost:6333`
+ 
+ Dashboard URL: `localhost:6333/dashboard`
+ 
+ → Find out more about using and setting up [Qdrant with Docker](https://qdrant.tech/documentation/quick-start/).
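A quick way to verify that the local Qdrant instance is up is to hit its REST API, e.g., the standard collections listing endpoint:

```bash
curl http://localhost:6333/collections
# Expected: a JSON body such as {"result":{"collections":[...]},"status":"ok",...}
```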
+ 
+ #### MongoDB
+ 
+ Database URI: `mongodb://llm_engineering:llm_engineering@127.0.0.1:27017`
+ 
+ Database name: `twin`
+ 
+ Default credentials:
+ - `username`: llm_engineering
+ - `password`: llm_engineering
+ 
+ → Find out more about using and setting up [MongoDB with Docker](https://www.mongodb.com/docs/manual/tutorial/install-mongodb-community-with-docker).
+ 
+ You can search your MongoDB collections using your **IDE's MongoDB plugin** (which you have to install separately), where you have to use the database URI to connect to the MongoDB database hosted within the Docker container: `mongodb://llm_engineering:llm_engineering@127.0.0.1:27017`
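If you prefer the command line over an IDE plugin, a minimal sketch using the `mongosh` CLI (installed separately) with the default credentials above:

```bash
mongosh "mongodb://llm_engineering:llm_engineering@127.0.0.1:27017"
# Then, inside the shell:
#   use twin
#   show collections
```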
+ 
+ > [!IMPORTANT]
+ > Everything related to training or running the LLMs (e.g., training, evaluation, inference) can only be run if you set up AWS SageMaker, as explained in the next section on cloud infrastructure.
+ 
+ ### Cloud infrastructure (for production)
+ 
+ Here we will quickly present how to deploy the project to AWS and other serverless services. We won't go into the details (as everything is presented in the book) but only point out the main steps you have to go through.
+ 
+ First, reinstall your Python dependencies with the AWS group:
+ ```bash
+ poetry install --with aws
+ ```
+ 
+ #### AWS SageMaker
+ 
+ > [!NOTE]
+ > Chapter 10 provides step-by-step instructions in the section "Implementing the LLM microservice using AWS SageMaker".
+ 
+ By this point, we expect you to have the AWS CLI installed, and both it and your project's env vars (within the `.env` file) properly configured with an AWS admin user.
+ 
+ To ensure best practices, we must create a new AWS user restricted to creating and deleting only resources related to AWS SageMaker. Create it by running:
+ ```bash
+ poetry poe create-sagemaker-role
+ ```
+ It will create a `sagemaker_user_credentials.json` file at the root of your repository with your new `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` values. **Before replacing your AWS credentials with the new ones, also run the following command, so the execution role is created using your admin credentials.**
+ 
+ To create the IAM execution role used by AWS SageMaker to access other AWS resources on our behalf, run the following:
+ ```bash
+ poetry poe create-sagemaker-execution-role
+ ```
+ It will create a `sagemaker_execution_role.json` file at the root of your repository with your new `AWS_ARN_ROLE` value. Add it to your `.env` file.
+ 
+ Once you've updated the `AWS_ACCESS_KEY`, `AWS_SECRET_KEY`, and `AWS_ARN_ROLE` values in your `.env` file, you can use AWS SageMaker. **Note that this step is crucial to complete the AWS setup.**
+ 
+ #### Training
+ 
+ We start the training pipeline through ZenML by running the following:
+ ```bash
+ poetry poe run-training-pipeline
+ ```
+ This will start the training code using the configs from `configs/training.yaml` directly in SageMaker. You can visualize the results in Comet ML's dashboard.
+ 
+ We start the evaluation pipeline through ZenML by running the following:
+ ```bash
+ poetry poe run-evaluation-pipeline
+ ```
+ This will start the evaluation code using the configs from `configs/evaluating.yaml` directly in SageMaker. You can visualize the results in `*-results` datasets saved to your Hugging Face profile.
+ 
+ #### Inference
+ 
+ To create an AWS SageMaker Inference Endpoint, run:
+ ```bash
+ poetry poe deploy-inference-endpoint
+ ```
+ To test it out, run:
+ ```bash
+ poetry poe test-sagemaker-endpoint
+ ```
+ To delete it, run:
+ ```bash
+ poetry poe delete-inference-endpoint
+ ```
+ 
+ #### AWS: ML pipelines, artifacts, and containers
+ 
+ The ML pipelines, artifacts, and containers are deployed to AWS by leveraging ZenML's deployment features. Thus, you must create an account with ZenML Cloud and follow their guide on deploying a ZenML stack to AWS. Otherwise, we provide step-by-step instructions in **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud**, on what you must do.
+ 
+ #### Qdrant & MongoDB
+ 
+ We leverage Qdrant's and MongoDB's serverless options when deploying the project. Thus, you can either follow [Qdrant's](https://qdrant.tech/documentation/cloud/create-cluster/) and [MongoDB's](https://www.mongodb.com/resources/products/fundamentals/mongodb-cluster-setup) tutorials on how to create a freemium cluster for each, or go through **Chapter 11**, section **Deploying the LLM Twin's pipelines to the cloud**, and follow our step-by-step instructions.
+ 
+ #### GitHub Actions
+ 
+ We use GitHub Actions to implement our CI/CD pipelines. To implement your own, you have to fork our repository and set the following env vars as Actions secrets in your forked repository:
+ - `AWS_ACCESS_KEY_ID`
+ - `AWS_SECRET_ACCESS_KEY`
+ - `AWS_ECR_NAME`
+ - `AWS_REGION`
+ 
+ Also, we provide instructions on how to set everything up in **Chapter 11**, section **Adding LLMOps to the LLM Twin**.
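If you use the GitHub CLI, a sketch of setting those secrets non-interactively in your fork (all values are placeholders):

```bash
gh secret set AWS_ACCESS_KEY_ID --body "your_access_key_id"
gh secret set AWS_SECRET_ACCESS_KEY --body "your_secret_access_key"
gh secret set AWS_ECR_NAME --body "your_ecr_repository_name"
gh secret set AWS_REGION --body "eu-central-1"
```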
495
+
496
+ #### Comet ML & Opik
497
+
498
+ You can visualize the results on their self-hosted dashboards if you create a Comet account and correctly set the `COMET_API_KEY` env var. As Opik is powered by Comet, you don't have to set up anything else along Comet:
499
+ - [Comet ML (for experiment tracking)](https://www.comet.com/)
500
+ - [Opik (for prompt monitoring)](https://www.comet.com/opik)
501
+
502
+ ## ⚡ Pipelines
503
+
504
+ All the ML pipelines will be orchestrated behind the scenes by [ZenML](https://www.zenml.io/). A few exceptions exist when running utility scrips, such as exporting or importing from the data warehouse.
505
+
506
+ The ZenML pipelines are the entry point for most processes throughout this project. They are under the `pipelines/` folder. Thus, when you want to understand or debug a workflow, starting with the ZenML pipeline is the best approach.
507
+
508
+ To see the pipelines running and their results:
509
+ - go to your ZenML dashboard
510
+ - go to the `Pipelines` section
511
+ - click on a specific pipeline (e.g., `feature_engineering`)
512
+ - click on a specific run (e.g., `feature_engineering_run_2024_06_20_18_40_24`)
513
+ - click on a specific step or artifact of the DAG to find more details about it
514
+
515
+ Now, let's explore all the pipelines you can run. From data collection to training, we will present them in their natural order to go through the LLM project end-to-end.
516
+
517
+ ### Data pipelines
518
+
519
+ Run the data collection ETL:
520
+ ```bash
521
+ poetry poe run-digital-data-etl
522
+ ```
523
+
524
+ > [!WARNING]
525
+ > You must have Chrome (or another Chromium-based browser) installed on your system for LinkedIn and Medium crawlers to work (which use Selenium under the hood). Based on your Chrome version, the Chromedriver will be automatically installed to enable Selenium support. Another option is to run everything using our Docker image if you don't want to install Chrome. For example, to run all the pipelines combined you can run `poetry poe run-docker-end-to-end-data-pipeline`. Note that the command can be tweaked to support any other pipeline.
526
+ >
527
+ > If, for any other reason, you don't have a Chromium-based browser installed and don't want to use Docker, you have two other options to bypass this Selenium issue:
528
+ > - Comment out all the code related to Selenium, Chrome and all the links that use Selenium to crawl them (e.g., Medium), such as the `chromedriver_autoinstaller.install()` command from [application.crawlers.base](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/llm_engineering/application/crawlers/base.py) and other static calls that check for Chrome drivers and Selenium.
529
+ > - Install Google Chrome using your CLI in environments such as GitHub Codespaces or other cloud VMs using the same command as in our [Docker file](https://github.com/PacktPublishing/LLM-Engineers-Handbook/blob/main/Dockerfile#L10).
530
+
531
+ To add additional links to collect from, go to `configs/digital_data_etl_[author_name].yaml` and add them to the `links` field. Also, you can create a completely new file and specify it at run time, like this: `python -m llm_engineering.interfaces.orchestrator.run --run-etl --etl-config-filename configs/digital_data_etl_[your_name].yaml`
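For instance, a minimal sketch of such a config (the author name and link are hypothetical; the structure mirrors the existing `configs/digital_data_etl_*.yaml` files):

```bash
cat > configs/digital_data_etl_jane_doe.yaml <<'EOF'
parameters:
  user_full_name: Jane Doe
  links:
    - https://medium.com/@jane_doe/an-example-article
EOF
```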
+ 
+ Run the feature engineering pipeline:
+ ```bash
+ poetry poe run-feature-engineering-pipeline
+ ```
+ 
+ Generate the instruct dataset:
+ ```bash
+ poetry poe run-generate-instruct-datasets-pipeline
+ ```
+ 
+ Generate the preference dataset:
+ ```bash
+ poetry poe run-generate-preference-datasets-pipeline
+ ```
+ 
+ Run all of the above compressed into a single pipeline:
+ ```bash
+ poetry poe run-end-to-end-data-pipeline
+ ```
+ 
+ ### Utility pipelines
+ 
+ Export the data from the data warehouse to JSON files:
+ ```bash
+ poetry poe run-export-data-warehouse-to-json
+ ```
+ 
+ Import data to the data warehouse from JSON files (by default, it imports the data from the `data/data_warehouse_raw_data` directory):
+ ```bash
+ poetry poe run-import-data-warehouse-from-json
+ ```
+ 
+ Export ZenML artifacts to JSON:
+ ```bash
+ poetry poe run-export-artifact-to-json-pipeline
+ ```
+ 
+ This will export the following ZenML artifacts to the `output` folder as JSON files (it will take their latest version):
+ - cleaned_documents.json
+ - instruct_datasets.json
+ - preference_datasets.json
+ - raw_documents.json
+ 
+ You can configure what artifacts to export by tweaking the `configs/export_artifact_to_json.yaml` configuration file.
+ 
+ ### Training pipelines
+ 
+ Run the training pipeline:
+ ```bash
+ poetry poe run-training-pipeline
+ ```
+ 
+ Run the evaluation pipeline:
+ ```bash
+ poetry poe run-evaluation-pipeline
+ ```
+ 
+ > [!WARNING]
+ > For this to work, make sure you properly configured AWS SageMaker as described in [Cloud infrastructure (for production)](#cloud-infrastructure-for-production).
+ 
+ ### Inference pipelines
+ 
+ Call the RAG retrieval module with a test query:
+ ```bash
+ poetry poe call-rag-retrieval-module
+ ```
+ 
+ Start the real-time inference RESTful API:
+ ```bash
+ poetry poe run-inference-ml-service
+ ```
+ 
+ Call the real-time inference RESTful API with a test query:
+ ```bash
+ poetry poe call-inference-ml-service
+ ```
+ 
+ Remember that you can monitor the prompt traces on [Opik](https://www.comet.com/opik).
+ 
+ > [!WARNING]
+ > For the inference service to work, you must have the LLM microservice deployed to AWS SageMaker, as explained in the setup cloud infrastructure section.
+ 
+ ### Linting & formatting (QA)
+ 
+ Check or fix your linting issues:
+ ```bash
+ poetry poe lint-check
+ poetry poe lint-fix
+ ```
+ 
+ Check or fix your formatting issues:
+ ```bash
+ poetry poe format-check
+ poetry poe format-fix
+ ```
+ 
+ Check the code for leaked credentials:
+ ```bash
+ poetry poe gitleaks-check
+ ```
+ 
+ ### Tests
+ 
+ Run all the tests using the following command:
+ ```bash
+ poetry poe test
+ ```
+ 
+ ## 🏃 Run project
+ 
+ Based on the setup and usage steps described above, assuming the local and cloud infrastructure works and the `.env` is filled as expected, follow the next steps to run the LLM system end-to-end:
+ 
+ ### Data
+ 
+ 1. Collect data: `poetry poe run-digital-data-etl`
+ 
+ 2. Compute features: `poetry poe run-feature-engineering-pipeline`
+ 
+ 3. Compute instruct dataset: `poetry poe run-generate-instruct-datasets-pipeline`
+ 
+ 4. Compute preference alignment dataset: `poetry poe run-generate-preference-datasets-pipeline`
+ 
+ ### Training
+ 
+ > [!IMPORTANT]
+ > From now on, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.
+ 
+ 5. SFT fine-tuning of Llama 3.1: `poetry poe run-training-pipeline`
+ 
+ 6. For DPO, go to `configs/training.yaml`, change `finetuning_type` to `dpo`, and run `poetry poe run-training-pipeline` again (see the sketch below)
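A one-liner sketch for that config change (GNU sed; on macOS, use `sed -i ''`):

```bash
sed -i 's/finetuning_type: sft/finetuning_type: dpo/' configs/training.yaml
```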
+ 
+ 7. Evaluate fine-tuned models: `poetry poe run-evaluation-pipeline`
+ 
+ ### Inference
+ 
+ > [!IMPORTANT]
+ > As above, for these steps to work, you need to properly set up AWS SageMaker, such as running `poetry install --with aws` and filling in the AWS-related environment variables and configs.
+ 
+ 8. Call only the RAG retrieval module: `poetry poe call-rag-retrieval-module`
+ 
+ 9. Deploy the LLM Twin microservice to SageMaker: `poetry poe deploy-inference-endpoint`
+ 
+ 10. Test the LLM Twin microservice: `poetry poe test-sagemaker-endpoint`
+ 
+ 11. Start the end-to-end RAG server: `poetry poe run-inference-ml-service`
+ 
+ 12. Test the RAG server: `poetry poe call-inference-ml-service`
+ 
+ ## 📄 License
+ 
+ This course is an open-source project released under the MIT license. Thus, as long as you distribute our LICENSE and acknowledge our work, you can safely clone or fork this project and use it as a source of inspiration for whatever you want (e.g., university projects, college degree projects, personal projects, etc.).
app/app.py ADDED
@@ -0,0 +1,34 @@
+ from llm_engineering.infrastructure.inference_pipeline_api import rag
+ import gradio as gr
+ from langchain.schema import AIMessage, HumanMessage
+ 
+ 
+ def predict(message, history):
+     # Convert the Gradio chat history into LangChain message objects.
+     history_langchain_format = []
+     for msg in history:
+         if msg["role"] == "user":
+             history_langchain_format.append(HumanMessage(content=msg["content"]))
+         elif msg["role"] == "assistant":
+             history_langchain_format.append(AIMessage(content=msg["content"]))
+ 
+     # Answer the new message with the RAG inference pipeline.
+     query = HumanMessage(content=message)
+     gpt_response = rag(query, history_langchain_format)
+ 
+     return gpt_response.content
+ 
+ 
+ predefined_questions = [
+     "Tell me how can I navigate to a specific pose - include replanning aspects in your answer.",
+     "Can you provide me with code for this task?",
+ ]
+ 
+ demo = gr.ChatInterface(
+     predict,
+     type="messages",
+     examples=predefined_questions,
+     description="Ask specific questions related to ROS2 navigation, motion planning, and simulation",
+     stop_btn=True,
+     head="RAG System for ROS2 Robotics",
+ )
+ 
+ demo.launch(server_name="0.0.0.0", server_port=7860)
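To try the UI locally, a minimal sketch, assuming the local infrastructure and the inference pipeline dependencies are in place:

```bash
poetry run python3 ./app/app.py
# Then open http://localhost:7860 in your browser.
```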
code_snippets/03_custom_odm_example.py ADDED
@@ -0,0 +1,10 @@
+ from llm_engineering.domain.documents import ArticleDocument, UserDocument
+ 
+ # Leverage the Poetry virtual environment to run the code (requires the local
+ # MongoDB instance to be up and populated):
+ # poetry run python code_snippets/03_custom_odm_example.py
+ 
+ if __name__ == "__main__":
+     user = UserDocument.get_or_create(first_name="Paul", last_name="Iusztin")
+     articles = ArticleDocument.bulk_find(author_id=str(user.id))
+ 
+     print(f"User ID: {user.id}")  # noqa
+     print(f"User name: {user.first_name} {user.last_name}")  # noqa
+     print(f"Number of articles: {len(articles)}")  # noqa
+     print("First article link:", articles[0].link)  # noqa
code_snippets/03_orm.py ADDED
@@ -0,0 +1,37 @@
+ from sqlalchemy import Column, Integer, String, create_engine
+ from sqlalchemy.orm import declarative_base, sessionmaker
+ 
+ # Create virtual environment, install dependencies and run the code:
+ # 1. Create: python3 -m venv orm_venv
+ # 2. Activate: source orm_venv/bin/activate
+ # 3. Install: pip install sqlalchemy==2.0.35
+ # 4. Run the code: python code_snippets/03_orm.py
+ 
+ if __name__ == "__main__":
+     Base = declarative_base()
+ 
+     # Define a class that maps to the users table.
+     class User(Base):
+         __tablename__ = "users"
+ 
+         id = Column(Integer, primary_key=True)
+         name = Column(String)
+ 
+     # Create an SQLite database in memory.
+     engine = create_engine("sqlite:///:memory:")
+     Base.metadata.create_all(engine)
+ 
+     # Create a session used to interact with the database.
+     Session = sessionmaker(bind=engine)
+     session = Session()
+ 
+     # Add a new user.
+     new_user = User(name="Alice")
+     session.add(new_user)
+     session.commit()
+ 
+     # Query the database.
+     user = session.query(User).first()
+     if user:
+         print(f"User ID: {user.id}")  # noqa
+         print(f"User name: {user.name}")  # noqa
code_snippets/08_instructor_embeddings.py ADDED
@@ -0,0 +1,18 @@
+ from InstructorEmbedding import INSTRUCTOR
+ 
+ # Create virtual environment, install dependencies and run the code:
+ # 1. Create: python3 -m venv instructor_venv
+ # 2. Activate: source instructor_venv/bin/activate
+ # 3. Install: pip install sentence-transformers==2.2.2 InstructorEmbedding==1.0.1
+ # 4. Run the code: python code_snippets/08_instructor_embeddings.py
+ 
+ if __name__ == "__main__":
+     model = INSTRUCTOR("hkunlp/instructor-base")
+ 
+     sentence = "RAG Fundamentals First"
+ 
+     instruction = "Represent the title of an article about AI:"
+ 
+     embeddings = model.encode([[instruction, sentence]])
+     print(embeddings.shape)  # noqa
+     # Output: (1, 768)
code_snippets/08_text_embeddings.py ADDED
@@ -0,0 +1,28 @@
+ from sentence_transformers import SentenceTransformer
+ 
+ # Leverage the Poetry virtual environment to run the code:
+ # poetry run python code_snippets/08_text_embeddings.py
+ 
+ if __name__ == "__main__":
+     # 1. Load a pretrained Sentence Transformer model.
+     model = SentenceTransformer("all-MiniLM-L6-v2")
+ 
+     # The sentences to encode.
+     sentences = ["The dog sits outside waiting for a treat.", "I am going swimming.", "The dog is swimming."]
+ 
+     # 2. Calculate embeddings.
+     embeddings = model.encode(sentences)
+     print(embeddings.shape)  # noqa
+     # Output: (3, 384)
+ 
+     # 3. Calculate the embedding similarities using cosine similarity.
+     similarities = model.similarity(embeddings, embeddings)
+     print(similarities)  # noqa
+     # Output:
+     # tensor([[ 1.0000, -0.0389, 0.2692],
+     #         [-0.0389, 1.0000, 0.3837],
+     #         [ 0.2692, 0.3837, 1.0000]])
+     #
+     # similarities[0, 0] = The similarity between the first sentence and itself.
+     # similarities[0, 1] = The similarity between the first and second sentence.
+     # similarities[2, 1] = The similarity between the third and second sentence.
code_snippets/08_text_image_embeddings.py ADDED
@@ -0,0 +1,37 @@
+ from io import BytesIO
+ 
+ import requests
+ from PIL import Image
+ from sentence_transformers import SentenceTransformer
+ 
+ # Leverage the Poetry virtual environment to run the code:
+ # poetry run python code_snippets/08_text_image_embeddings.py
+ 
+ if __name__ == "__main__":
+     # Load an image with a crazy cat.
+     response = requests.get(
+         "https://github.com/PacktPublishing/LLM-Engineering/blob/main/images/crazy_cat.jpg?raw=true"
+     )
+     image = Image.open(BytesIO(response.content))
+ 
+     # Load CLIP model.
+     model = SentenceTransformer("clip-ViT-B-32")
+ 
+     # Encode the loaded image.
+     img_emb = model.encode(image)
+ 
+     # Encode text descriptions.
+     text_emb = model.encode(
+         [
+             "A crazy cat smiling.",
+             "A white and brown cat with a yellow bandana.",
+             "A man eating in the garden.",
+         ]
+     )
+     print(text_emb.shape)  # noqa
+     # Output: (3, 512)
+ 
+     # Compute similarities.
+     similarity_scores = model.similarity(img_emb, text_emb)
+     print(similarity_scores)  # noqa
+     # Output: tensor([[0.3068, 0.3300, 0.1719]])
configs/digital_data_etl_cs370.yaml ADDED
@@ -0,0 +1,14 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   user_full_name: CS370 Project
+   links:
+     - https://github.com/ros-infrastructure/www.ros.org/
+     - https://github.com/ros-navigation/docs.nav2.org
+     - https://github.com/moveit/moveit2
+     - https://github.com/gazebosim/gz-sim
configs/end_to_end_data.yaml ADDED
@@ -0,0 +1,87 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   # Data ETL & Feature engineering pipelines parameters
+   author_links:
+     - user_full_name: Paul Iusztin # [First Name(s)] [Last Name]
+       links:
+         # Medium (only articles that are not under the paywall work)
+         - https://medium.com/decodingml/an-end-to-end-framework-for-production-ready-llm-systems-by-building-your-llm-twin-2cc6bb01141f
+         - https://medium.com/decodingml/a-real-time-retrieval-system-for-rag-on-social-media-data-9cc01d50a2a0
+         - https://medium.com/decodingml/sota-python-streaming-pipelines-for-fine-tuning-llms-and-rag-in-real-time-82eb07795b87
+         - https://medium.com/decodingml/the-4-advanced-rag-algorithms-you-must-know-to-implement-5d0c7f1199d2
+         - https://medium.com/decodingml/architect-scalable-and-cost-effective-llm-rag-inference-pipelines-73b94ef82a99
+         # Substack
+         - https://decodingml.substack.com/p/a-blueprint-for-designing-production?r=1ttoeh
+         - https://decodingml.substack.com/p/the-difference-between-development?r=1ttoeh
+         - https://decodingml.substack.com/p/architect-scalable-and-cost-effective?r=1ttoeh
+         - https://decodingml.substack.com/p/7-tips-to-reduce-your-vram-when-training?r=1ttoeh
+         - https://decodingml.substack.com/p/using-this-python-package-you-can?r=1ttoeh
+         - https://decodingml.substack.com/p/the-4-advanced-rag-algorithms-you?r=1ttoeh
+         - https://decodingml.substack.com/p/problems-deploying-your-ml-models?r=1ttoeh
+         - https://decodingml.substack.com/p/sota-python-streaming-pipelines-for?r=1ttoeh
+         - https://decodingml.substack.com/p/ready-for-production-ml-here-are?r=1ttoeh
+         - https://decodingml.substack.com/p/my-ml-monthly-learning-resource-recommendations?r=1ttoeh
+         - https://decodingml.substack.com/p/an-end-to-end-framework-for-production?r=1ttoeh
+         - https://decodingml.substack.com/p/upskill-your-llm-knowledge-base-with?r=1ttoeh
+         - https://decodingml.substack.com/p/want-to-learn-an-end-to-end-framework?r=1ttoeh
+         - https://decodingml.substack.com/p/my-favorite-way-to-implement-a-configuration?r=1ttoeh
+         - https://decodingml.substack.com/p/a-real-time-retrieval-system-for?r=1ttoeh
+         - https://decodingml.substack.com/p/4-key-decoding-strategies-for-llms?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-new-year-the-new-and-improved?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-8-types-of-mlops-tools-that-must?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-this-is-what-you-need-to-build?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-7-steps-on-how-to-fine-tune-an?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-how-do-you-generate-a-q-and-a?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-what-do-you-need-to-fine-tune?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-why-and-when-do-you-need-to-fine?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-how-to-implement-a-streaming?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-why-and-what-do-you-need-a-streaming?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-unwrapping-the-3-pipeline-design?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-how-to-design-an-llm-system-for?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-synced-vector-dbs-a-guide-to?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-what-is-the-difference-between?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-7-steps-to-build-a-production?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-chain-of-thought-reasoning-write?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-build-and-serve-a-production?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-4-key-ideas-you-must-know-to?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-how-to-add-real-time-monitoring?r=1ttoeh
+         - https://decodingml.substack.com/p/dml-top-6-ml-platform-features-you?r=1ttoeh
+     - user_full_name: Maxime Labonne # [First Name(s)] [Last Name]
+       links:
+         # Substack
+         - https://maximelabonne.substack.com/p/uncensor-any-llm-with-abliteration-d30148b7d43e
+         - https://maximelabonne.substack.com/p/create-mixtures-of-experts-with-mergekit-11b318c99562
+         - https://maximelabonne.substack.com/p/merge-large-language-models-with-mergekit-2118fb392b54
+         - https://maximelabonne.substack.com/p/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac
+         - https://maximelabonne.substack.com/p/exllamav2-the-fastest-library-to-run-llms-32aeda294d26
+         - https://maximelabonne.substack.com/p/quantize-llama-models-with-ggml-and-llama-cpp-3612dfbcc172
+         - https://maximelabonne.substack.com/p/a-beginners-guide-to-llm-fine-tuning-4bae7d4da672
+         - https://maximelabonne.substack.com/p/graph-convolutional-networks-introduction-to-gnns-24b3f60d6c95
+         - https://maximelabonne.substack.com/p/4-bit-quantization-with-gptq-36b0f4f02c34
+         - https://maximelabonne.substack.com/p/fine-tune-your-own-llama-2-model-in-a-colab-notebook-df9823a04a32
+         - https://maximelabonne.substack.com/p/introduction-to-weight-quantization-2494701b9c0c
+         - https://maximelabonne.substack.com/p/decoding-strategies-in-large-language-models-9733a8f70539
+         - https://maximelabonne.substack.com/p/the-art-of-spending-optimizing-your-marketing-budget-with-nonlinear-optimization-6c8a39afb3c2
+         - https://maximelabonne.substack.com/p/create-a-bot-to-find-diamonds-in-minecraft-d836606a993a
+         - https://maximelabonne.substack.com/p/constraint-programming-67ac16fa0c81
+         - https://maximelabonne.substack.com/p/how-to-design-the-most-powerful-graph-neural-network-3d18b07a6e66
+         - https://maximelabonne.substack.com/p/introduction-to-graphsage-in-python-a9e7f9ecf9d7
+         - https://maximelabonne.substack.com/p/graph-attention-networks-in-python-975736ac5c0c
+         - https://maximelabonne.substack.com/p/integer-programming-vs-linear-programming-in-python-f1be5bb4e60e
+         - https://maximelabonne.substack.com/p/introduction-to-linear-programming-in-python-9261e7eb44b
+         - https://maximelabonne.substack.com/p/what-is-a-tensor-in-deep-learning-6dedd95d6507
+         - https://maximelabonne.substack.com/p/efficiently-iterating-over-rows-in-a-pandas-dataframe-7dd5f9992c01
+         - https://maximelabonne.substack.com/p/q-learning-for-beginners-2837b777741
+         - https://maximelabonne.substack.com/p/how-to-start-machine-learning-for-developers-in-2022-390af12b193f
+   # Generate instruct dataset pipeline parameters
+   test_split_size: 0.1
+   push_to_huggingface: false
+   dataset_id: pauliusztin/llmtwin
+   mock: false
configs/evaluating.yaml ADDED
@@ -0,0 +1,9 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   is_dummy: true # Change this to 'false' to run the evaluation on the full dataset.
configs/export_artifact_to_json.yaml ADDED
@@ -0,0 +1,13 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   artifact_names:
+     - raw_documents
+     - cleaned_documents
+     - instruct_datasets
+     - preference_datasets
configs/feature_engineering.yaml ADDED
@@ -0,0 +1,10 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   author_full_names:
+     - CS370 Project
configs/generate_instruct_datasets.yaml ADDED
@@ -0,0 +1,13 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   test_split_size: 0.1
+   dataset_type: "instruction"
+   push_to_huggingface: true
+   dataset_id: pauliusztin/llmtwin
+   mock: false
configs/generate_preference_datasets.yaml ADDED
@@ -0,0 +1,13 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   test_split_size: 0.05
+   dataset_type: "preference"
+   push_to_huggingface: true
+   dataset_id: pauliusztin/llmtwin-dpo
+   mock: false
configs/training.yaml ADDED
@@ -0,0 +1,14 @@
+ settings:
+   docker:
+     parent_image: 992382797823.dkr.ecr.eu-central-1.amazonaws.com/zenml-rlwlcs:latest
+     skip_build: True
+   orchestrator.sagemaker:
+     synchronous: false
+ 
+ parameters:
+   finetuning_type: sft
+   num_train_epochs: 3
+   per_device_train_batch_size: 2
+   learning_rate: 3e-4
+   dataset_huggingface_workspace: mlabonne
+   is_dummy: true # Change this to 'false' to run the training with the full dataset and epochs.
data/artifacts/cleaned_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/instruct_datasets.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/preference_datasets.json ADDED
The diff for this file is too large to render. See raw diff
 
data/artifacts/raw_documents.json ADDED
The diff for this file is too large to render. See raw diff
 
data/data_warehouse_raw_data/ArticleDocument.json ADDED
The diff for this file is too large to render. See raw diff
 
data/data_warehouse_raw_data/PostDocument.json ADDED
@@ -0,0 +1 @@
+ []
data/data_warehouse_raw_data/RepositoryDocument.json ADDED
@@ -0,0 +1 @@
+ []
data/data_warehouse_raw_data/UserDocument.json ADDED
@@ -0,0 +1 @@
+ [{"first_name": "Maxime", "last_name": "Labonne", "_id": "eff74089-0271-4319-8543-745c087f4f61"}, {"first_name": "Paul", "last_name": "Iusztin", "_id": "b5fa1f08-75f0-402d-8e88-d1357e346d9e"}]
docker-compose.yml ADDED
@@ -0,0 +1,83 @@
+ version: "3.8"
+
+ services:
+   mongo:
+     image: mongo:latest
+     container_name: "llm_engineering_mongo"
+     logging:
+       options:
+         max-size: 1g
+     environment:
+       MONGO_INITDB_ROOT_USERNAME: "llm_engineering"
+       MONGO_INITDB_ROOT_PASSWORD: "llm_engineering"
+     ports:
+       - 27017:27017
+     volumes:
+       - mongo_data:/data/db
+     networks:
+       - local
+     restart: always
+
+   qdrant:
+     image: qdrant/qdrant:latest
+     container_name: "llm_engineering_qdrant"
+     ports:
+       - 6333:6333
+       - 6334:6334
+     expose:
+       - 6333
+       - 6334
+     volumes:
+       - qdrant_data:/qdrant/storage
+     networks:
+       - local
+     restart: always
+
+   app:
+     build: ./
+     container_name: "llm_engineering_app"
+     ports:
+       - 7860:7860
+     volumes:
+       - ./app:/app
+     environment:
+       PYTHONUNBUFFERED: "1"
+       NVIDIA_VISIBLE_DEVICES: "all"
+     networks:
+       - local
+     depends_on:
+       - mongo
+       - qdrant
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: all # `device_ids: ["all"]` is not valid Compose syntax; `count: all` reserves all GPUs.
+               capabilities: ["gpu"]
+
+   clearml:
+     image: allegroai/clearml:latest
+     container_name: "llm_engineering_clearml"
+     ports:
+       - 8080:8080
+     environment:
+       CLEARML_API_ACCESS_KEY: "your_access_key"
+       CLEARML_API_SECRET_KEY: "your_secret_key"
+       CLEARML_WEB_HOST: "http://localhost:8080"
+       CLEARML_API_HOST: "http://localhost:8080"
+       CLEARML_FILES_HOST: "http://localhost:8080"
+     volumes:
+       - clearml_data:/root/.clearml
+     networks:
+       - local
+     restart: always
+
+ volumes:
+   mongo_data:
+   qdrant_data:
+   clearml_data:
+
+ networks:
+   local:
+     driver: bridge
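After `docker compose up`, the stack can be smoke-tested from the host. A short Python check, assuming `pymongo` and `qdrant-client` are installed locally (credentials and ports come from the compose file above):

from pymongo import MongoClient
from qdrant_client import QdrantClient

# Credentials match MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD above.
mongo = MongoClient("mongodb://llm_engineering:llm_engineering@localhost:27017")
print(mongo.admin.command("ping"))  # {'ok': 1.0} when MongoDB is reachable

qdrant = QdrantClient(host="localhost", port=6333)
print(qdrant.get_collections())  # empty collection list on a fresh volume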
images/cover_plus.png ADDED
images/crazy_cat.jpg ADDED
llm_engineering/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from llm_engineering import application, domain, infrastructure
+ from llm_engineering.settings import settings
+
+ __all__ = ["settings", "application", "domain", "infrastructure"]
llm_engineering/application/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from . import utils
+
+ __all__ = ["utils"]
llm_engineering/application/crawlers/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .dispatcher import CrawlerDispatcher
+ from .github import GithubCrawler
+
+ __all__ = [
+     "CrawlerDispatcher",
+     "GithubCrawler",
+ ]
llm_engineering/application/crawlers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (356 Bytes)
 
llm_engineering/application/crawlers/__pycache__/base.cpython-311.pyc ADDED
Binary file (4.13 kB)
 
llm_engineering/application/crawlers/__pycache__/custom_article.cpython-311.pyc ADDED
Binary file (3 kB)
 
llm_engineering/application/crawlers/__pycache__/dispatcher.cpython-311.pyc ADDED
Binary file (2.61 kB)
 
llm_engineering/application/crawlers/__pycache__/github.cpython-311.pyc ADDED
Binary file (6.3 kB)
 
llm_engineering/application/crawlers/__pycache__/linkedin.cpython-311.pyc ADDED
Binary file (10.9 kB)
 
llm_engineering/application/crawlers/__pycache__/medium.cpython-311.pyc ADDED
Binary file (2.8 kB)
 
llm_engineering/application/crawlers/base.py ADDED
@@ -0,0 +1,63 @@
+ import time
+ from abc import ABC, abstractmethod
+ from tempfile import mkdtemp
+
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+
+ from llm_engineering.domain.documents import NoSQLBaseDocument
+
+ # Selenium 4.6+ ships with Selenium Manager, which resolves a matching
+ # chromedriver binary automatically, so there is no need to download the
+ # driver manually or add it to PATH.
+
+
+ class BaseCrawler(ABC):
+     model: type[NoSQLBaseDocument]
+
+     @abstractmethod
+     def extract(self, link: str, **kwargs) -> None: ...
+
+
+ class BaseSeleniumCrawler(BaseCrawler, ABC):
+     def __init__(self, scroll_limit: int = 5) -> None:
+         options = webdriver.ChromeOptions()
+
+         options.add_argument("--no-sandbox")
+         options.add_argument("--headless=new")
+         options.add_argument("--disable-dev-shm-usage")
+         options.add_argument("--log-level=3")
+         options.add_argument("--disable-popup-blocking")
+         options.add_argument("--disable-notifications")
+         options.add_argument("--disable-extensions")
+         options.add_argument("--disable-background-networking")
+         options.add_argument("--ignore-certificate-errors")
+         options.add_argument(f"--data-path={mkdtemp()}")
+         options.add_argument(f"--disk-cache-dir={mkdtemp()}")
+         options.add_argument("--remote-debugging-port=9226")
+
+         self.set_extra_driver_options(options)
+
+         self.scroll_limit = scroll_limit
+         self.driver = webdriver.Chrome(
+             options=options,
+         )
+
+     def set_extra_driver_options(self, options: Options) -> None:
+         pass
+
+     def login(self) -> None:
+         pass
+
+     def scroll_page(self) -> None:
+         """Scroll through the page up to the configured scroll limit."""
+         current_scroll = 0
+         last_height = self.driver.execute_script("return document.body.scrollHeight")
+         while True:
+             self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(5)
+             new_height = self.driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height or (self.scroll_limit and current_scroll >= self.scroll_limit):
+                 break
+             last_height = new_height
+             current_scroll += 1
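For reference, a concrete crawler only has to set `model` and implement `extract`; driver setup and scrolling come from the base classes above. A hypothetical sketch reusing the repo's `ArticleDocument` (the class name and URL handling are illustrative, not part of this commit):

# Hypothetical subclass; only .extract() is required by BaseCrawler.
from llm_engineering.domain.documents import ArticleDocument

class ExampleSeleniumCrawler(BaseSeleniumCrawler):
    model = ArticleDocument

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()  # load lazy content before reading the DOM
        body_text = self.driver.find_element("tag name", "body").text
        self.driver.quit()
        # Persisting would mirror CustomArticleCrawler: build a content dict,
        # then call self.model(...).save()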
llm_engineering/application/crawlers/custom_article.py ADDED
@@ -0,0 +1,54 @@
+ from urllib.parse import urlparse
+
+ from langchain_community.document_loaders import AsyncHtmlLoader
+ from langchain_community.document_transformers.html2text import Html2TextTransformer
+ from loguru import logger
+
+ from llm_engineering.domain.documents import ArticleDocument
+
+ from .base import BaseCrawler
+
+
+ class CustomArticleCrawler(BaseCrawler):
+     model = ArticleDocument
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def extract(self, link: str, **kwargs) -> None:
+         old_model = self.model.find(link=link)
+         if old_model is not None:
+             logger.info(f"Article already exists in the database: {link}")
+
+             return
+
+         logger.info(f"Starting to scrape article: {link}")
+
+         loader = AsyncHtmlLoader([link])
+         docs = loader.load()
+
+         html2text = Html2TextTransformer()
+         docs_transformed = html2text.transform_documents(docs)
+         doc_transformed = docs_transformed[0]
+
+         content = {
+             "Title": doc_transformed.metadata.get("title"),
+             "Subtitle": doc_transformed.metadata.get("description"),
+             "Content": doc_transformed.page_content,
+             "language": doc_transformed.metadata.get("language"),
+         }
+
+         parsed_url = urlparse(link)
+         platform = parsed_url.netloc
+
+         user = kwargs["user"]
+         instance = self.model(
+             content=content,
+             link=link,
+             platform=platform,
+             author_id=user.id,
+             author_full_name=user.full_name,
+         )
+         instance.save()
+
+         logger.info(f"Finished scraping custom article: {link}")
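A hedged usage sketch: `extract` expects a `user` keyword argument exposing `id` and `full_name` (normally a `UserDocument`); `SimpleNamespace` stands in for it here, and the article URL is illustrative.

from types import SimpleNamespace

# The id below comes from data/data_warehouse_raw_data/UserDocument.json.
user = SimpleNamespace(id="b5fa1f08-75f0-402d-8e88-d1357e346d9e", full_name="Paul Iusztin")
CustomArticleCrawler().extract("https://example.com/some-article", user=user)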
llm_engineering/application/crawlers/dispatcher.py ADDED
@@ -0,0 +1,39 @@
+ import re
+ from urllib.parse import urlparse
+
+ from loguru import logger
+
+ from .base import BaseCrawler
+ from .custom_article import CustomArticleCrawler
+ from .github import GithubCrawler
+
+
+ class CrawlerDispatcher:
+     def __init__(self) -> None:
+         self._crawlers = {}
+
+     @classmethod
+     def build(cls) -> "CrawlerDispatcher":
+         dispatcher = cls()
+
+         return dispatcher
+
+     def register_github(self) -> "CrawlerDispatcher":
+         self.register("https://github.com", GithubCrawler)
+
+         return self
+
+     def register(self, domain: str, crawler: type[BaseCrawler]) -> None:
+         parsed_domain = urlparse(domain)
+         domain = parsed_domain.netloc
+
+         self._crawlers[r"https://(www\.)?{}/*".format(re.escape(domain))] = crawler
+
+     def get_crawler(self, url: str) -> BaseCrawler:
+         for pattern, crawler in self._crawlers.items():
+             if re.match(pattern, url):
+                 return crawler()
+         else:
+             logger.warning(f"No crawler found for {url}. Defaulting to CustomArticleCrawler.")
+
+             return CustomArticleCrawler()
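Typical wiring, grounded in the methods above: build the dispatcher, register the GitHub domain, then resolve a crawler per URL; any unregistered domain falls back to `CustomArticleCrawler`. The URLs are illustrative.

dispatcher = CrawlerDispatcher.build().register_github()

crawler = dispatcher.get_crawler("https://github.com/example-org/example-repo")  # -> GithubCrawler
fallback = dispatcher.get_crawler("https://example.com/blog/post")  # -> CustomArticleCrawler (logs a warning)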
llm_engineering/application/crawlers/github.py ADDED
@@ -0,0 +1,158 @@
+ import os
+ import pathlib
+ import shutil
+ import subprocess
+ import tempfile
+
+ from loguru import logger
+
+ from llm_engineering.domain.documents import RepositoryDocument
+
+ from .base import BaseCrawler
+
+
+ class GithubCrawler(BaseCrawler):
+     model = RepositoryDocument
+
+     def __init__(
+         self,
+         include=(
+             ".txt",
+             ".md",
+             ".rst",
+             ".json",
+             ".yml",
+             ".yaml",
+             ".xml",
+             ".html",
+             ".csv",
+             ".py",
+             ".sh",
+             ".cfg",
+             ".conf",
+             ".js",
+             ".css",
+             ".scss",
+             ".cpp",
+             ".hpp",
+             ".h",
+             ".cc",
+             ".hh",
+             ".cmake",
+             ".bat",
+             ".rb",
+             ".bash",
+             ".qml",
+             ".proto",
+             ".properties",
+             ".template",
+             ".in",
+             ".inc",
+             ".pyi",
+             ".typed",
+         ),
+         ignore=(
+             ".git",
+             ".toml",
+             ".lock",
+             ".png",
+             ".gitignore",
+             ".ico",
+             ".jpg",
+             ".jpeg",
+             ".webp",
+             ".svg",
+             ".gif",
+             ".stl",
+             ".dae",
+             ".jar",
+             ".pdf",
+         ),
+     ) -> None:
+         super().__init__()
+         self._ignore = ignore
+         self._include = include
+
+     def extract(self, link: str, **kwargs) -> None:
+         old_model = self.model.find(link=link)
+         if old_model is not None:
+             logger.info(f"Repository already exists in the database: {link}")
+
+             return
+
+         logger.info(f"Starting to scrape GitHub repository: {link}")
+
+         repo_name = link.rstrip("/").split("/")[-1]
+
+         local_temp = tempfile.mkdtemp()
+         file_types = {}
+         try:
+             os.chdir(local_temp)
+             subprocess.run(["git", "clone", link], check=True)
+
+             repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118
+
+             tree = {}
+             current_size = 0
+             max_size = 16793598 - 100000  # MongoDB's 16 MB document limit, minus headroom
+
+             for root, _, files in os.walk(repo_path):
+                 dir = root.replace(repo_path, "").lstrip("/")
+                 if dir.startswith(tuple(self._ignore)):
+                     continue
+                 for file in files:
+                     if file.endswith(tuple(self._ignore)) or file.startswith("."):
+                         continue
+                     if not file.endswith(tuple(self._include)):
+                         continue
+                     file_path = os.path.join(dir, file)  # noqa: PTH118
+                     full_file_path = os.path.join(root, file)  # noqa: PTH118
+
+                     try:
+                         with open(full_file_path, "r", errors="ignore") as f:  # noqa: PTH123
+                             file_extension = pathlib.Path(full_file_path).suffix
+                             file_types[file_extension] = 1
+                             content = f.read().replace(" ", "")
+                             file_size = len(content.encode("utf-8"))
+
+                             # Check if adding this file exceeds the size limit
+                             if current_size + file_size > max_size:
+                                 # Save the current tree and clear it
+                                 self.save_tree(tree, repo_name, link)
+                                 tree.clear()
+                                 current_size = 0
+
+                             # Add file to tree
+                             tree[file_path] = content
+                             current_size += file_size
+
+                     except Exception as e:
+                         logger.error(f"Failed to process file {file_path}: {e}")
+
+             # Save any remaining files in the tree
+             if tree:
+                 self.save_tree(tree, repo_name, link)
+
+         except Exception as e:
+             logger.error(f"Error while processing repository: {e}")
+             raise
+         finally:
+             shutil.rmtree(local_temp, ignore_errors=True)
+
+         logger.info(f"Finished scraping GitHub repository: {link}")
+         logger.info(file_types)
+
+     def save_tree(self, tree, repo_name, link):
+         """Helper method to save the current tree."""
+         try:
+             instance = self.model(
+                 content=tree,
+                 name=repo_name,
+                 link=link,
+                 platform="github",
+                 author_id="46648381-8bf3-4877-b6b4-d48c9de9d870",
+                 author_full_name="CS370 Project",
+             )
+             instance.save()
+         except Exception as e:
+             logger.error(f"Failed to save tree: {e}")
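An illustrative invocation (the repository URL is hypothetical); note that `author_id` and `author_full_name` are hardcoded in `save_tree`, so `**kwargs` is unused here:

crawler = GithubCrawler()
crawler.extract(link="https://github.com/example-org/example-repo")  # clones, chunks under ~16 MB, saves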
llm_engineering/application/dataset/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from . import generation
+
+ __all__ = ["generation"]
llm_engineering/application/dataset/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (263 Bytes)
 
llm_engineering/application/dataset/__pycache__/constants.cpython-311.pyc ADDED
Binary file (1.71 kB)
 
llm_engineering/application/dataset/__pycache__/generation.cpython-311.pyc ADDED
Binary file (14.5 kB)
 
llm_engineering/application/dataset/__pycache__/output_parsers.cpython-311.pyc ADDED
Binary file (1.39 kB)
 
llm_engineering/application/dataset/__pycache__/utils.cpython-311.pyc ADDED
Binary file (6.82 kB)
 
llm_engineering/application/dataset/constants.py ADDED
@@ -0,0 +1,26 @@
+ from llm_engineering.domain.dataset import DatasetType
+
+ MOCKED_RESPONSE_INSTRUCT = """
+ [
+ {"instruction": "<mocked generated instruction> 1", "answer": "<mocked generated answer> 1"},
+ {"instruction": "<mocked generated instruction> 2", "answer": "<mocked generated answer> 2"},
+ {"instruction": "<mocked generated instruction> 3", "answer": "<mocked generated answer> 3"}
+ ]
+ """
+
+ # The padded "chosen" answers below are likely intentional: they are long enough
+ # to survive the short-answer filter applied in PreferenceDatasetGenerator,
+ # while the short third sample exercises that filter.
+ MOCKED_RESPONSE_PREFERENCE = """
+ [
+ {"instruction": "<mocked generated instruction> 1", "rejected": "<mocked generated answer> 1", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 1."},
+ {"instruction": "<mocked generated instruction> 2", "rejected": "<mocked generated answer> 2", "chosen": "Mocked extracted extracted extracted extracted extracted extracted extracted extracted extracted extracted answer 2."},
+ {"instruction": "<mocked generated instruction> 3", "rejected": "<mocked generated answer> 3", "chosen": "Mocked extracted answer 3"}
+ ]
+ """
+
+
+ def get_mocked_response(dataset_type: DatasetType) -> str:
+     if dataset_type == DatasetType.INSTRUCTION:
+         return MOCKED_RESPONSE_INSTRUCT
+     elif dataset_type == DatasetType.PREFERENCE:
+         return MOCKED_RESPONSE_PREFERENCE
+     else:
+         raise ValueError(f"Invalid dataset type: {dataset_type}")
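A quick sanity check that the mocked payloads parse as the JSON lists the generators expect:

import json

from llm_engineering.application.dataset.constants import get_mocked_response
from llm_engineering.domain.dataset import DatasetType

samples = json.loads(get_mocked_response(DatasetType.INSTRUCTION))
assert len(samples) == 3 and "instruction" in samples[0]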
llm_engineering/application/dataset/generation.py ADDED
@@ -0,0 +1,260 @@
+ from abc import ABC, abstractmethod
+
+ import tiktoken
+ from langchain_core.exceptions import OutputParserException
+ from langchain_core.language_models.fake import FakeListLLM
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+ from langchain_core.prompts import PromptTemplate
+ from langchain_ollama import ChatOllama
+ from loguru import logger
+
+ from llm_engineering import domain
+ from llm_engineering.application import utils
+ from llm_engineering.domain.cleaned_documents import CleanedDocument
+ from llm_engineering.domain.dataset import DatasetType, TrainTestSplit
+ from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt, Prompt
+ from llm_engineering.domain.types import DataCategory
+ from llm_engineering.settings import settings
+
+ from . import constants
+ from . import utils as generation_utils
+ from .output_parsers import ListPydanticOutputParser
+
+
+ class DatasetGenerator(ABC):
+     tokenizer = tiktoken.encoding_for_model(settings.OPENAI_MODEL_ID)
+     dataset_type: DatasetType | None = None
+
+     system_prompt_template = """You are a helpful assistant who generates {dataset_format} based on the given context. \
+ Provide your response in JSON format.
+ """
+     prompt_template_str: str | None = None
+
+     @classmethod
+     def get_system_prompt(cls) -> Prompt:
+         assert cls.dataset_type is not None, "Dataset type must be set before calling get_system_prompt()"
+
+         dataset_format = (
+             "instruction-answer pairs" if cls.dataset_type == DatasetType.INSTRUCTION else "instruction-answer triples"
+         )
+         input_variables = {
+             "dataset_format": dataset_format,
+         }
+         system_prompt = cls.system_prompt_template.format(**input_variables)
+
+         return Prompt(
+             template=cls.system_prompt_template,
+             input_variables=input_variables,
+             content=system_prompt,
+         )
+
+     @classmethod
+     def get_prompts(cls, documents: list[CleanedDocument]) -> dict[DataCategory, list[GenerateDatasetSamplesPrompt]]:
+         documents = generation_utils.extract_substrings(documents)
+
+         grouped_prompts = {}
+         grouped_cleaned_documents = CleanedDocument.group_by_category(documents)
+         for category, category_documents in grouped_cleaned_documents.items():
+             category_prompts = [cls.get_prompt(document) for document in category_documents]
+             grouped_prompts[category] = category_prompts
+
+         return grouped_prompts
+
+     @classmethod
+     def get_prompt(cls, document: CleanedDocument) -> GenerateDatasetSamplesPrompt:
+         assert cls.prompt_template_str is not None, "Prompt template must be set before calling get_prompt()"
+
+         data_category = document.get_category()
+
+         prompt_template = PromptTemplate.from_template(
+             template=cls.prompt_template_str,
+             template_format="jinja2",  # jinja2 variables use {{ ... }}; single braces in the templates stay literal
+         )
+         input_variables = {
+             "extract": document.content,
+         }
+         prompt = prompt_template.format(**input_variables)
+         prompt_tokens = cls.tokenizer.encode(prompt)
+         if len(prompt_tokens) > settings.OPENAI_MAX_TOKEN_WINDOW:
+             prompt_tokens = prompt_tokens[: settings.OPENAI_MAX_TOKEN_WINDOW]
+             prompt = cls.tokenizer.decode(prompt_tokens)
+
+         prompt = GenerateDatasetSamplesPrompt(
+             template=prompt_template.template,
+             input_variables=input_variables,
+             content=prompt,
+             num_tokens=len(prompt_tokens),
+             data_category=data_category,
+             document=document,
+         )
+
+         return prompt
+
+     @classmethod
+     def generate(
+         cls,
+         prompts: dict[DataCategory, list[GenerateDatasetSamplesPrompt]],
+         test_size: float = 0.2,
+         mock: bool = False,
+     ) -> TrainTestSplit:
+         assert cls.dataset_type is not None, "Dataset type must be set before calling generate()"
+
+         def _to_langchain(
+             prompt: GenerateDatasetSamplesPrompt,
+         ) -> list[BaseMessage]:
+             messages = [
+                 SystemMessage(content=cls.get_system_prompt().content),
+                 HumanMessage(content=prompt.content),
+             ]
+
+             return messages
+
+         if mock:
+             llm = FakeListLLM(responses=[constants.get_mocked_response(cls.dataset_type)])
+         else:
+             llm = ChatOllama(
+                 model=settings.LLAMA_MODEL_ID,
+                 num_predict=2000 if cls.dataset_type == DatasetType.PREFERENCE else 1200,  # ChatOllama caps output via num_predict; it has no max_tokens field
+                 temperature=0.7,
+             )
+         parser = ListPydanticOutputParser(pydantic_object=cls._get_dataset_sample_type())
+
+         chain = llm | parser
+
+         datasets = {}
+         for category, category_prompts in prompts.items():
+             langchain_category_prompts = [_to_langchain(prompt) for prompt in category_prompts]
+             batches = utils.misc.batch(langchain_category_prompts, size=24)
+
+             flattened_instruct_dataset_samples = []
+             for batch in batches:
+                 try:
+                     batched_dataset_samples = chain.batch(batch, stop=None)
+
+                     for instruct_dataset_sample_batch in batched_dataset_samples:
+                         flattened_instruct_dataset_samples.extend(instruct_dataset_sample_batch)
+                 except OutputParserException:
+                     logger.exception(f"Failed to parse the output JSON for a batch for category {category}")
+
+             dataset = domain.dataset.build_dataset(
+                 dataset_type=cls.dataset_type, category=category, samples=flattened_instruct_dataset_samples
+             )
+             datasets[category] = dataset
+             logger.info(f"Generated {len(dataset.samples)} samples for category '{category}'.")
+
+         processed_datasets = cls.post_process_datasets(datasets, test_size=test_size)
+
+         return processed_datasets
+
+     @classmethod
+     def _get_dataset_sample_type(
+         cls,
+     ) -> type[domain.dataset.InstructDatasetSample] | type[domain.dataset.PreferenceDatasetSample]:
+         return (
+             domain.dataset.InstructDatasetSample
+             if cls.dataset_type == DatasetType.INSTRUCTION
+             else domain.dataset.PreferenceDatasetSample
+         )
+
+     @classmethod
+     @abstractmethod
+     def post_process_datasets(
+         cls, datasets: dict[DataCategory, domain.dataset.InstructDataset], test_size: float
+     ) -> TrainTestSplit:
+         pass
+
+
+ class InstructionDatasetGenerator(DatasetGenerator):
+     dataset_type = DatasetType.INSTRUCTION
+
+     prompt_template_str = """Based on the following extract, generate five instruction-answer pairs. Each instruction \
+ must ask to write about a specific topic contained in the context. Each answer \
+ must provide a relevant paragraph based on the information found in the \
+ context. Only use concepts from the context to generate the instructions. \
+ Instructions must never explicitly mention a context, a system, a course, or an extract. \
+ Instructions must be self-contained and general. \
+ Answers must imitate the writing style of the context. \
+
+ Example instruction: Explain the concept of an LLM Twin. \
+ Example answer: An LLM Twin is essentially an AI character that mimics your writing style, personality, and voice. \
+ It's designed to write just like you by incorporating these elements into a language model. \
+ The idea is to create a digital replica of your writing habits using advanced AI techniques. \
+
+ Structure the answer in JSON format, ready to be loaded in Python by json.loads(), as a list of objects.
+ Do not add any extra characters and provide your response in JSON format with the following structure:
+ [
+ {"instruction": "...", "answer": "..."},
+ ...
+ ]
+
+ Extract:
+ {{ extract }}
+ """
+
+     @classmethod
+     def post_process_datasets(
+         cls, datasets: dict[DataCategory, domain.dataset.InstructDataset], test_size: float
+     ) -> TrainTestSplit:
+         train_test_split = generation_utils.create_instruct_train_test_split(
+             datasets, test_size=test_size, random_state=42
+         )
+
+         return train_test_split
+
+
+ class PreferenceDatasetGenerator(DatasetGenerator):
+     dataset_type = DatasetType.PREFERENCE
+
+     prompt_template_str = """Based on the following extract, generate five instruction-answer triples. Each triple should consist of:
+ 1. An instruction asking about a specific topic in the context.
+ 2. A generated answer that attempts to answer the instruction based on the context, named as 'rejected'.
+ 3. An extracted answer that is a relevant excerpt directly from the given context, named as 'chosen'.
+
+ Instructions must be self-contained and general, without explicitly mentioning a context, system, course, or extract.
+
+ Important:
+ - Ensure that the extracted answer, the chosen one, is a verbatim copy from the context, including all punctuation and apostrophes.
+ - Do not add any ellipsis (...) or [...] to indicate skipped text in the extracted answer.
+ - If the relevant text is not continuous, use two separate sentences from the context instead of skipping text.
+
+ Structure the answer in JSON format, ready to be loaded in Python by json.loads(), as a list of objects.
+ Do not add any extra characters and provide your response in JSON format with the following structure:
+ [
+ {
+ "instruction": "...",
+ "rejected": "...",
+ "chosen": "..."
+ },
+ ...
+ ]
+
+ Extract:
+ {{ extract }}
+ """
+
+     @classmethod
+     def post_process_datasets(
+         cls, datasets: dict[DataCategory, domain.dataset.PreferenceDataset], test_size: float
+     ) -> TrainTestSplit:
+         datasets = generation_utils.filter_short_answers(datasets)
+         datasets = generation_utils.filter_answer_format(datasets)
+
+         remaining_samples = sum([dataset.num_samples for dataset in datasets.values()])
+         logger.info(
+             f"Filtered out short answers and answers with incorrect format. Remaining samples: {remaining_samples}"
+         )
+
+         train_test_split = generation_utils.create_preference_train_test_split(
+             datasets, test_size=test_size, random_state=42
+         )
+
+         return train_test_split
+
+
+ def get_dataset_generator(dataset_type: DatasetType) -> type[DatasetGenerator]:
+     if dataset_type == DatasetType.INSTRUCTION:
+         return InstructionDatasetGenerator
+     elif dataset_type == DatasetType.PREFERENCE:
+         return PreferenceDatasetGenerator
+     else:
+         raise ValueError(f"Invalid dataset type: {dataset_type}")
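An end-to-end sketch using the mock path (no Ollama server required, since `FakeListLLM` returns the canned JSON from `constants.py`); `cleaned_documents` is illustrative and would be a `list[CleanedDocument]` loaded from the warehouse:

generator = get_dataset_generator(DatasetType.INSTRUCTION)
prompts = generator.get_prompts(cleaned_documents)
train_test_split = generator.generate(prompts, test_size=0.1, mock=True)  # mirrors configs/generate_instruct_datasets.yaml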