File size: 555 Bytes
3ab64ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import glob
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from langchain_community.document_loaders import PyMuPDFLoader
# Relative path of the directory that holds the input data files.
# NOTE(review): nothing in the visible code reads this constant -- process_pdf
# spells out its "./data/..." paths literally -- confirm whether it is used
# elsewhere before relying on or removing it.
path_to_data = "./data/"


def process_pdf():
    """Load the known PDF reports and return them keyed by their label.

    Each configured PDF is read with ``PyMuPDFLoader(path).load()``. Loading
    is best-effort: if a file cannot be loaded, the error is printed and that
    label is simply absent from the result, so the remaining files are still
    processed.

    Returns:
        dict: Maps each label (``'ABC'``, ``'XYZ'``) to whatever
        ``PyMuPDFLoader.load()`` returned for its file (per the LangChain
        docs, a list of ``Document`` objects). Labels whose file failed to
        load are omitted; the dict is empty when nothing could be loaded.
    """
    files = {
        'ABC': './data/MWTS2021.pdf',
        'XYZ': './data/Consolidated2021.pdf',
    }
    docs = {}
    for label, pdf_path in files.items():
        try:
            docs[label] = PyMuPDFLoader(pdf_path).load()
        except Exception as e:
            # Deliberately broad: one unreadable PDF must not abort the rest.
            print("Exception: ", e)
    # Previously the function fell off the end here and returned None,
    # discarding everything that had just been loaded.
    return docs