File size: 2,751 Bytes
74b1bac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

import os
from langchain_qdrant import QdrantVectorStore
import pickle
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()


from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from typing import List, Dict
from langchain.chains import RetrievalQA
import os

llm = ChatGroq(model_name="llama3-70b-8192", temperature=0.1,api_key= os.getenv('llm_api_1'))


def load_and_chunk_data(data_path):
    docs = []
    # Load all .txt files from the specified folder and its subfolders
    for root, _, files in os.walk(data_path):
        for filename in files:
            if filename.endswith('.txt'):
                file_path = os.path.join(root, filename)
                loader = TextLoader(file_path, encoding='utf-8')
                docs.extend(loader.load())

    headers_to_split_on = [
        ("#", "Header_1"),
        ("##", "Header_2"),
        ("###", "Header_3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on, strip_headers=False
    )

    chunk_size = 512
    chunk_overlap = 0

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_docs = []

    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        chunked_docs.extend(text_splitter.split_documents(md_header_splits))

    return chunked_docs



data_path = '/home/azureuser/data/gioithieuhocvien'
chunked_data = load_and_chunk_data(data_path)



# Save the documents list with pickle
import pickle

with open('gioithieuhocvien_filter.pkl', 'wb') as f:
    pickle.dump(chunked_data, f)

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_KEY')
HF_EMBEDDING =  OpenAIEmbeddings(model='text-embedding-3-small')


url="http://localhost:6333"
qdrant = QdrantVectorStore.from_documents(
    chunked_data,
    HF_EMBEDDING,
    url=url,
    collection_name="gioithieuhocvien_filter",
)