{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "95135ba6ca104151abab245f938b46a1": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7c0719ab78e3479393e2e160f7bd7a4c", "IPY_MODEL_1395931983524aa58dfb7a603c952748", "IPY_MODEL_e05f535e386e482da0450c2ca0594c42" ], "layout": "IPY_MODEL_b5318f98281a43d388ff7868a2e69cbd" } }, "7c0719ab78e3479393e2e160f7bd7a4c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_00d64df484074630a2be9bedbcaec9ab", "placeholder": "", "style": "IPY_MODEL_8be96f5efde442bbb1e44e8658d2fa6f", "value": "Loading checkpoint shards: 100%" } }, "1395931983524aa58dfb7a603c952748": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_68c6d56e98734bc7a49859ec57f462ad", "max": 2, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e5b15807c72d4e69be4051c90d9649d4", "value": 2 } }, "e05f535e386e482da0450c2ca0594c42": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f04d5e153ee94795bd6a7848be30789a", "placeholder": "", "style": "IPY_MODEL_f4d5244ed5634e99b4e1719d77bd1676", "value": " 2/2 [00:27<00:00, 11.71s/it]" } }, "b5318f98281a43d388ff7868a2e69cbd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "00d64df484074630a2be9bedbcaec9ab": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8be96f5efde442bbb1e44e8658d2fa6f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "68c6d56e98734bc7a49859ec57f462ad": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e5b15807c72d4e69be4051c90d9649d4": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "f04d5e153ee94795bd6a7848be30789a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f4d5244ed5634e99b4e1719d77bd1676": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "id": "ZZzhbHyTbFed" }, "outputs": [], "source": [ "!pip install -q transformers sentence_transformers faiss-cpu torch PyPDF2 nltk" ] }, { "cell_type": "code", "source": [ "!pip install -U langchain-community\n", "from langchain.vectorstores import Qdrant\n", "from langchain.embeddings import HuggingFaceBgeEmbeddings\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.document_loaders import PyPDFDirectoryLoader,TextLoader" ], "metadata": { "id": "jlkvzNn_bzX1", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3b899e09-e0af-4d7c-f122-9edda8c44b52" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: langchain-community in /usr/local/lib/python3.10/dist-packages (0.3.11)\n", "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (6.0.2)\n", "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.0.36)\n", "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (3.11.9)\n", "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.6.7)\n", "Requirement already satisfied: httpx-sse<0.5.0,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.4.0)\n", "Requirement already satisfied: langchain<0.4.0,>=0.3.11 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.11)\n", "Requirement already satisfied: langchain-core<0.4.0,>=0.3.24 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.3.24)\n", "Requirement already satisfied: langsmith<0.3,>=0.1.125 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (0.1.147)\n", "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (1.26.4)\n", "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.6.1)\n", "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (2.32.3)\n", "Requirement already satisfied: tenacity!=8.4.0,<10,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain-community) (9.0.0)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.4.4)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.1)\n", "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (4.0.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (0.2.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.18.3)\n", "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.23.1)\n", "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n", "Requirement already satisfied: langchain-text-splitters<0.4.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.11->langchain-community) (0.3.2)\n", "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /usr/local/lib/python3.10/dist-packages (from langchain<0.4.0,>=0.3.11->langchain-community) (2.10.3)\n", "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (1.33)\n", "Requirement already satisfied: packaging<25,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (24.2)\n", "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.4.0,>=0.3.24->langchain-community) (4.12.2)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (0.28.0)\n", "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (3.10.12)\n", "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.3,>=0.1.125->langchain-community) (1.0.0)\n", "Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.0.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain-community) (2024.8.30)\n", "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.1.1)\n", "Requirement already satisfied: anyio in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (3.7.1)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.0.7)\n", "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (0.14.0)\n", "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.24->langchain-community) (3.0.0)\n", "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.11->langchain-community) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic<3.0.0,>=2.7.4->langchain<0.4.0,>=0.3.11->langchain-community) (2.27.1)\n", "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.0.0)\n", "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.3.1)\n", "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio->httpx<1,>=0.23.0->langsmith<0.3,>=0.1.125->langchain-community) (1.2.2)\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "from transformers import AutoTokenizer, AutoModelForCausalLM\n", "from sentence_transformers import SentenceTransformer\n", "!pip install faiss-cpu\n", "!pip install sentence-transformers\n", "import faiss\n", "import numpy as np\n", "import pandas as pd\n", "!\n", "import PyPDF2\n", "import os\n", "import nltk\n", "# nltk.download('punkt')\n", "nltk.download('punkt_tab')\n", "from nltk.tokenize import sent_tokenize\n", "from google.colab import userdata" ], "metadata": { "id": "PPBaElOGb0um", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "712aa030-a0eb-450f-fb31-dd7e0151b297" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.10/dist-packages (1.9.0.post1)\n", "Requirement already satisfied: numpy<3.0,>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from faiss-cpu) (1.26.4)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from faiss-cpu) (24.2)\n", "Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (3.2.1)\n", "Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.46.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (4.66.6)\n", "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (2.5.1+cu121)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.5.2)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (1.13.1)\n", "Requirement already satisfied: huggingface-hub>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (0.26.3)\n", "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from sentence-transformers) (11.0.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.16.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2024.10.0)\n", "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (24.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.2)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (3.1.4)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers) (1.3.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2024.9.11)\n", "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.20.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.4.5)\n", "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers) (3.5.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers) (3.0.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2024.8.30)\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n" ] } ] }, { "cell_type": "code", "source": [ "HUGGING_FACE_ACCESS_TOKEN = userdata.get('HF_TOKEN_Z')\n", "\n", "model_name = 'google/gemma-2-2b-it'\n", "\n", "model = AutoModelForCausalLM.from_pretrained(\n", " model_name,\n", " torch_dtype=torch.float16,\n", " token=HUGGING_FACE_ACCESS_TOKEN\n", " ).to('cuda')\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGING_FACE_ACCESS_TOKEN)" ], "metadata": { "id": "j_41WiGgb37x", "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "95135ba6ca104151abab245f938b46a1", "7c0719ab78e3479393e2e160f7bd7a4c", "1395931983524aa58dfb7a603c952748", "e05f535e386e482da0450c2ca0594c42", "b5318f98281a43d388ff7868a2e69cbd", "00d64df484074630a2be9bedbcaec9ab", "8be96f5efde442bbb1e44e8658d2fa6f", "68c6d56e98734bc7a49859ec57f462ad", "e5b15807c72d4e69be4051c90d9649d4", "f04d5e153ee94795bd6a7848be30789a", "f4d5244ed5634e99b4e1719d77bd1676" ] }, "outputId": "ab4bff38-76d8-4f60-bb91-36e628fec941" }, "execution_count": 5, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "95135ba6ca104151abab245f938b46a1" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "def extract_text_from_pdf(pdf_path):\n", " try:\n", " with open(pdf_path, 'rb') as file:\n", " reader = PyPDF2.PdfReader(file)\n", " text = \"\".join([page.extract_text() for page in reader.pages])\n", " return text\n", " except Exception as e:\n", " print(f\"Error reading {pdf_path}: {e}\")\n", " return \"\"\n", "\n", "def split_text_into_chunks(text, max_chunk_size=1000):\n", " sentences = sent_tokenize(text)\n", " chunks = []\n", " current_chunk = \"\"\n", "\n", " for sentence in sentences:\n", " if len(current_chunk) + len(sentence) <= max_chunk_size:\n", " current_chunk += sentence + \" \"\n", " else:\n", " chunks.append(current_chunk.strip())\n", " current_chunk = sentence + \" \"\n", "\n", " if current_chunk:\n", " chunks.append(current_chunk.strip())\n", "\n", " return chunks" ], "metadata": { "id": "Hg_hYwQ6b5xU" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "id": "f8iaap2ib7Vl", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "58a4a282-6796-4097-dee9-d0f3b74f3394" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "code", "source": [ "# check list pdfs and replace with yourpath\n", "\n", "os.chdir('/content/drive/MyDrive/Data')\n", "!ls" ], "metadata": { "id": "Ry1jQWXCb82A", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "44771696-6c6d-42d1-b8e9-b6dd0f3a877a" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "data_cleaned_aisc.pdf\n" ] } ] }, { "cell_type": "code", "source": [ "encoder = SentenceTransformer('all-MiniLM-L6-v2')\n", "\n", "# Process PDF files\n", "pdf_directory = \"/content/drive/MyDrive/Data\"\n", "df_documents = pd.DataFrame(columns=['path', 'text_chunks', 'embeddings'])\n", "\n", "for filename in os.listdir(pdf_directory):\n", " if filename.endswith(\".pdf\"):\n", " print(filename)\n", " pdf_path = os.path.join(pdf_directory, filename)\n", " text = extract_text_from_pdf(pdf_path)\n", " chunks = split_text_into_chunks(text)\n", " document_embeddings = encoder.encode(chunks)\n", " new_row = pd.DataFrame({'path': [pdf_path], 'text_chunks': [chunks], 'embeddings': [document_embeddings]})\n", " df_documents = pd.concat([df_documents, new_row], ignore_index=True)\n", "\n", "df_documents" ], "metadata": { "id": "JUqcsy6sb-Y0", "colab": { "base_uri": "https://localhost:8080/", "height": 106 }, "outputId": "9c235f52-88a1-4ad8-c07e-94227745cad3" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "data_cleaned_aisc.pdf\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " path \\\n", "0 /content/drive/MyDrive/Data/data_cleaned_aisc.pdf \n", "\n", " text_chunks \\\n", "0 [Keo - Pad tản nhiệt là gì? Keo - Pad tản nhiệ... \n", "\n", " embeddings \n", "0 [[0.0036073995, -0.059478104, 0.06371901, -0.0... " ], "text/html": [ "\n", "
\n", " | path | \n", "text_chunks | \n", "embeddings | \n", "
---|---|---|---|
0 | \n", "/content/drive/MyDrive/Data/data_cleaned_aisc.pdf | \n", "[Keo - Pad tản nhiệt là gì? Keo - Pad tản nhiệ... | \n", "[[0.0036073995, -0.059478104, 0.06371901, -0.0... | \n", "