"""Download a transactions CSV and receipt files from Google Drive, then
hand the download folder to ``pdfs_folder_to_images``."""

import io
import os

import gdown
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

from pdfparser_hq import pdfs_folder_to_images


def list_files_in_folder(service, folder_id):
    """Return every PDF/PNG file entry directly inside a Drive folder.

    Args:
        service: Drive v3 service object (as returned by ``build``).
        folder_id: ID of the Drive folder to list.

    Returns:
        A list of dicts, each carrying the ``id`` and ``name`` of one file.
    """
    query = (
        f"'{folder_id}' in parents and "
        "(mimeType='application/pdf' or mimeType='image/png')"
    )
    items = []
    page_token = None
    # A single response holds at most ``pageSize`` entries; keep following
    # ``nextPageToken`` so large folders are not silently truncated.
    while True:
        results = service.files().list(
            q=query,
            pageSize=1000,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return items


def download_file(service, file_id, file_name, save_path):
    """Download one Drive file into ``save_path`` under the name ``file_name``.

    Progress is printed after every chunk.

    Args:
        service: Drive v3 service object.
        file_id: ID of the file to download.
        file_name: Name of the local file to create inside ``save_path``.
        save_path: Existing local directory that receives the file.
    """
    request = service.files().get_media(fileId=file_id)
    target = os.path.join(save_path, file_name)
    # The context manager closes the handle even if a chunk request raises.
    with io.FileIO(target, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {file_name} {int(status.progress() * 100)}%.")


def download_files_from_folder(service, folder_id, save_path):
    """Download every PDF/PNG file of a Drive folder into ``save_path``.

    Args:
        service: Drive v3 service object.
        folder_id: ID of the Drive folder to download from.
        save_path: Existing local directory that receives the files.
    """
    for entry in list_files_in_folder(service, folder_id):
        download_file(service, entry['id'], entry['name'], save_path)


def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
    """Fetch the CSV and the receipt files, then run the PDF-to-image step.

    Args:
        data_path: Local directory where all data is saved; created if it
            does not exist.
        source_folder_with_reciepts: ID of the Drive folder holding the
            receipts (PDF/PNG files).
        link_to_csv: Drive file ID of the CSV; it is inserted into a
            ``https://drive.google.com/uc?id=...`` download URL.
    """
    os.makedirs(data_path, exist_ok=True)

    full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
    print(full_link_to_csv)
    transaction_csv_path = f'{data_path}/downloaded_file.csv'
    gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)

    # NOTE(review): this requests full Drive access although the code here
    # only lists and downloads; a read-only scope would presumably be
    # enough -- confirm before narrowing.
    SCOPES = ['https://www.googleapis.com/auth/drive']
    # Relative path: resolved against the current working directory.
    SERVICE_ACCOUNT_FILE = 'secret_google_service_account.json'

    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    service = build('drive', 'v3', credentials=credentials)

    download_files_from_folder(service, source_folder_with_reciepts, data_path)
    # Presumably converts the downloaded PDFs in ``data_path`` to images;
    # the helper lives in ``pdfparser_hq`` and is not visible here.
    pdfs_folder_to_images(data_path)