accounting-micro-automation / preprocessing.py
Adr740's picture
Update preprocessing.py
dd42809 verified
# Everything below is plain Python script code.
import os
from pdfparser_hq import pdfs_folder_to_images
from google.oauth2 import service_account
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
import gdown
def list_files_in_folder(service, folder_id):
    """Return every PDF and PNG file that sits directly inside a Drive folder.

    The listing is requested page by page: each response may carry a
    ``nextPageToken``, which is sent back as ``pageToken`` until the API
    stops returning one. Without this loop only the first page would be
    returned.

    Args:
        service: Drive v3 client exposing ``files().list(...).execute()``.
        folder_id: ID of the Drive folder whose children are listed.

    Returns:
        list: The ``files`` entries of all pages, in the order received
        (only the ``id`` and ``name`` fields are requested). Empty when
        nothing matches.
    """
    query = (
        f"'{folder_id}' in parents "
        "and (mimeType='application/pdf' or mimeType='image/png')"
    )
    items = []
    page_token = None  # None on the first request means "start from page one"
    while True:
        response = service.files().list(
            q=query,
            pageSize=1000,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        items.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return items
def download_file(service, file_id, file_name, save_path):
    """Download one Drive file into ``save_path``, printing progress per chunk.

    Args:
        service: Drive v3 client exposing ``files().get_media(...)``.
        file_id: ID of the Drive file to fetch.
        file_name: Name given to the local copy inside ``save_path``.
        save_path: Directory the file is written into (not created here).
    """
    request = service.files().get_media(fileId=file_id)
    target_path = os.path.join(save_path, file_name)
    # The context manager closes the handle even if a chunk request raises,
    # so the descriptor is not leaked and written data is released to disk.
    with io.FileIO(target_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {file_name} {int(status.progress() * 100)}%.")
def download_files_from_folder(service, folder_id, save_path):
    """Download every file listed for a Drive folder into ``save_path``.

    Args:
        service: Drive v3 client passed through to the listing and download helpers.
        folder_id: ID of the Drive folder to mirror.
        save_path: Directory that receives the downloaded files.
    """
    for entry in list_files_in_folder(service, folder_id):
        # Each listing entry carries the Drive id and the display name.
        download_file(service, entry['id'], entry['name'], save_path)
def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv):
    """Download the CSV and the receipt files, then run the PDF-to-image step.

    Steps, in order:
      1. Create ``data_path`` if it does not exist.
      2. Download the CSV with ``gdown`` to ``<data_path>/downloaded_file.csv``.
      3. Authenticate with the service-account key file and download the
         PDF/PNG files of the receipts folder into ``data_path``.
      4. Pass ``data_path`` to ``pdfs_folder_to_images``.

    Args:
        data_path: Directory where all downloaded data is saved.
        source_folder_with_reciepts: ID of the Drive folder holding the receipts.
        link_to_csv: Google Drive file ID of the CSV; it is inserted into a
            ``https://drive.google.com/uc?id=...`` download URL.
    """
    os.makedirs(data_path, exist_ok=True)

    full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
    print(full_link_to_csv)
    # Built with os.path.join (as download_file does) rather than a
    # hard-coded '/' so the path is correct on every platform.
    transaction_csv_path = os.path.join(data_path, 'downloaded_file.csv')
    gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)

    # NOTE(review): the helpers in this file only list and download files, so
    # a read-only Drive scope would presumably suffice — confirm before
    # narrowing.
    scopes = ['https://www.googleapis.com/auth/drive']
    # Key file is resolved relative to the current working directory.
    service_account_file = 'secret_google_service_account.json'
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=scopes)
    service = build('drive', 'v3', credentials=credentials)

    download_files_from_folder(service, source_folder_with_reciepts, data_path)
    pdfs_folder_to_images(data_path)