{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#! pip install openai" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "import os" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparación para Fine-Tuning" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "load_dotenv()\n", "API_KEY = os.getenv('OPENAI_KEY')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "client = OpenAI()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Separamos en Training y Validation cada file" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Función para separar la data\n", "def dividir_training_validation(ruta_archivo, proporcion_training=0.8):\n", " # Leer todas las líneas del archivo\n", " with open(ruta_archivo, 'r', encoding='utf-8') as file:\n", " lineas = file.readlines()\n", "\n", " # Calcular el punto de corte para el conjunto de entrenamiento\n", " corte = int(len(lineas) * proporcion_training)\n", "\n", " # Dividir las líneas en conjuntos de entrenamiento y validación\n", " lineas_training = lineas[:corte]\n", " lineas_validation = lineas[corte:]\n", "\n", " # Crear archivos para training y validation\n", " ruta_archivo_base = ruta_archivo.replace('.jsonl', '')\n", " archivo_training = f'{ruta_archivo_base}_train.jsonl'\n", " archivo_validation = f'{ruta_archivo_base}_val.jsonl'\n", "\n", " # Escribir el conjunto de entrenamiento\n", " with open(archivo_training, 'w', encoding='utf-8') as file:\n", " file.writelines(lineas_training)\n", "\n", " # Escribir el conjunto de validación\n", " with open(archivo_validation, 'w', encoding='utf-8') as file:\n", " file.writelines(lineas_validation)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Llamadas a la función para crear la separación\n", "dividir_training_validation('Training_Data/Training_Prompts.jsonl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Subimos files de entrenamiento y validación" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# Para Training\n", "upload_train_response = client.files.create(\n", " file=open(\"Training_Data/Training_Prompts_train.jsonl\", \"rb\"),\n", " purpose=\"fine-tune\"\n", ")\n", "\n", "# Para Validation\n", "upload_val_response = client.files.create(\n", " file=open(\"Training_Data/Training_Prompts_val.jsonl\", \"rb\"),\n", " purpose=\"fine-tune\"\n", ")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training file id:\tfile-eQOE8MxF51oFiGSVT48x0vLw\n", "Validation file id:\tfile-3LSttIrULCZUz5a4pXc3Fsk4\n" ] } ], "source": [ "train_file_id = upload_train_response.id\n", "val_file_id = upload_val_response.id\n", "\n", "print(f'Training file id:\\t{train_file_id}')\n", "print(f'Validation file id:\\t{val_file_id}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Trabajo de fine-tuning" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "fine_tune_response = client.fine_tuning.jobs.create(\n", " training_file=\"file-eQOE8MxF51oFiGSVT48x0vLw\", \n", " validation_file=\"file-3LSttIrULCZUz5a4pXc3Fsk4\",\n", " model=\"gpt-3.5-turbo-1106\", \n", " suffix=\"CARSE\",\n", " hyperparameters={\n", " \"n_epochs\":5\n", " }\n", ")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fine-tune id:\tftjob-oKdrQdLt4j9ijBvKuQWXl9C9\n" ] } ], "source": [ "fine_tune_id = fine_tune_response.id\n", "\n", "print(f'Fine-tune id:\\t{fine_tune_id}')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "FineTuningJob(id='ftjob-oKdrQdLt4j9ijBvKuQWXl9C9', created_at=1702185828, error=None, fine_tuned_model='ft:gpt-3.5-turbo-1106:personal:carse:8U71tg31', finished_at=1702187553, hyperparameters=Hyperparameters(n_epochs=5, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-IXFDgE8ZZcQzb9yKJmEuFxvC', result_files=['file-nTA0MI5GRiQbBnqsUHYJZ4Wx'], status='succeeded', trained_tokens=88345, training_file='file-eQOE8MxF51oFiGSVT48x0vLw', validation_file='file-3LSttIrULCZUz5a4pXc3Fsk4')" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Retrieve the state of a fine-tune\n", "client.fine_tuning.jobs.retrieve(fine_tune_id)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-vscAV4VplFjxd6FkJyTCQWsi', created_at=1702187558, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-DsuUGU4euSPufWPSsHtLQWMw', created_at=1702187554, level='info', message='New fine-tuned model created: ft:gpt-3.5-turbo-1106:personal:carse:8U71tg31', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-mBs22BBjFNsEir1FYqJb1Dpv', created_at=1702187530, level='info', message='Step 881/890: training loss=0.65, validation loss=2.07', object='fine_tuning.job.event', data={'step': 881, 'train_loss': 0.6533868908882141, 'valid_loss': 2.073900442857009, 'train_mean_token_accuracy': 0.8181818127632141, 'valid_mean_token_accuracy': 0.3076923076923077}, type='metrics'), FineTuningJobEvent(id='ftevent-tjVSeWvV47Xf2Cchy0AfbLxN', created_at=1702187510, level='info', message='Step 871/890: training loss=0.90, validation loss=2.88', object='fine_tuning.job.event', data={'step': 871, 'train_loss': 0.8985379338264465, 'valid_loss': 2.877812249319894, 'train_mean_token_accuracy': 0.7647058963775635, 'valid_mean_token_accuracy': 0.35714285714285715}, type='metrics'), FineTuningJobEvent(id='ftevent-0utFYTRmbfWD2tbVHlRwMMhg', created_at=1702187492, level='info', message='Step 861/890: training loss=0.50, validation loss=2.29', object='fine_tuning.job.event', data={'step': 861, 'train_loss': 0.4959395229816437, 'valid_loss': 2.2947926256391735, 'train_mean_token_accuracy': 0.8666666746139526, 'valid_mean_token_accuracy': 0.4722222222222222}, type='metrics'), FineTuningJobEvent(id='ftevent-6i7eASySmSHVHcNshaGriugB', created_at=1702187472, level='info', message='Step 851/890: training loss=0.76, validation loss=2.42', object='fine_tuning.job.event', data={'step': 851, 'train_loss': 0.7591314315795898, 'valid_loss': 2.420450496673584, 'train_mean_token_accuracy': 0.7916666865348816, 'valid_mean_token_accuracy': 0.25}, type='metrics'), FineTuningJobEvent(id='ftevent-6s0QNIYmZFVVa5NGW2OT2gAU', created_at=1702187452, level='info', message='Step 841/890: training loss=0.13, validation loss=2.62', object='fine_tuning.job.event', data={'step': 841, 'train_loss': 0.12887191772460938, 'valid_loss': 2.6182823181152344, 'train_mean_token_accuracy': 1.0, 'valid_mean_token_accuracy': 0.4}, type='metrics'), FineTuningJobEvent(id='ftevent-6CPON2gWiOFDq7LB8zMTg89u', created_at=1702187434, level='info', message='Step 831/890: training loss=0.34, validation loss=3.97', object='fine_tuning.job.event', data={'step': 831, 'train_loss': 0.3421609103679657, 'valid_loss': 3.9680542176769626, 'train_mean_token_accuracy': 1.0, 'valid_mean_token_accuracy': 0.25806451612903225}, type='metrics'), FineTuningJobEvent(id='ftevent-4v205e49u9XilcAJ0kmJ8aRr', created_at=1702187414, level='info', message='Step 821/890: training loss=1.61, validation loss=2.29', object='fine_tuning.job.event', data={'step': 821, 'train_loss': 1.6149991750717163, 'valid_loss': 2.293296359834217, 'train_mean_token_accuracy': 0.5714285969734192, 'valid_mean_token_accuracy': 0.42857142857142855}, type='metrics'), FineTuningJobEvent(id='ftevent-nUWUTTgLG3uUyCSngG708Sw4', created_at=1702187396, level='info', message='Step 811/890: training loss=1.79, validation loss=2.40', object='fine_tuning.job.event', data={'step': 811, 'train_loss': 1.7867711782455444, 'valid_loss': 2.400285849700103, 'train_mean_token_accuracy': 0.529411792755127, 'valid_mean_token_accuracy': 0.35135135135135137}, type='metrics')], object='list', has_more=True)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# List up to 10 events from a fine-tuning job\n", "client.fine_tuning.jobs.list_events(fine_tuning_job_id=fine_tune_id, limit=10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 2 }