File size: 2,220 Bytes
411ca77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# all in python script obv

import os
from pdfparser_hq import pdfs_folder_to_images

from google.oauth2 import service_account
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
import gdown

def list_files_in_folder(service, folder_id):
    """Return every PDF and PNG file that sits directly inside a Drive folder.

    Args:
        service: Drive API client exposing ``files().list(...).execute()``
            (elsewhere in this file it is created with
            ``build('drive', 'v3', ...)``).
        folder_id: ID of the Drive folder whose children are listed.

    Returns:
        list[dict]: One ``{'id': ..., 'name': ...}`` dict per matching file,
        collected across all result pages. Empty if nothing matches.
    """
    query = (
        f"'{folder_id}' in parents "
        "and (mimeType='application/pdf' or mimeType='image/png')"
    )
    items = []
    page_token = None
    while True:
        # A single response holds at most ``pageSize`` entries; keep
        # following ``nextPageToken`` so larger folders are not truncated.
        results = service.files().list(
            q=query,
            pageSize=1000,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        items.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return items

def download_file(service, file_id, file_name, save_path):
    """Download one Drive file into ``save_path`` and print progress.

    Args:
        service: Drive API client exposing ``files().get_media(...)``.
        file_id: Drive ID of the file to fetch.
        file_name: Name used for the local file inside ``save_path`` and in
            the progress messages.
        save_path: Existing directory the file is written to.

    Returns:
        None. The content is written to ``os.path.join(save_path, file_name)``.
    """
    request = service.files().get_media(fileId=file_id)
    target_path = os.path.join(save_path, file_name)
    # Context manager guarantees the handle is closed (and data flushed) even
    # if a chunk download raises; the original left it open.
    with io.FileIO(target_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {file_name} {int(status.progress() * 100)}%.")


def download_files_from_folder(service, folder_id, save_path):
    """Download each file listed for a Drive folder into ``save_path``.

    Args:
        service: Drive API client passed through to the listing and
            download helpers.
        folder_id: ID of the Drive folder to pull files from.
        save_path: Local directory handed to ``download_file`` for each file.
    """
    for entry in list_files_in_folder(service, folder_id):
        file_id, file_name = entry['id'], entry['name']
        download_file(service, file_id, file_name, save_path)
        
        
        
def run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv,
                      service_account_file='secret_google_service_account.json'):
    """Fetch the CSV and the receipt files from Google Drive, then convert PDFs.

    Steps, in order: create ``data_path``; download the CSV with ``gdown`` to
    ``<data_path>/downloaded_file.csv``; download the files found in the
    receipts folder into ``data_path`` via the Drive API; call
    ``pdfs_folder_to_images(data_path)``.

    Args:
        data_path: Local directory where all downloaded data is saved
            (created if missing).
        source_folder_with_reciepts: Drive folder ID where the receipts are
            stored.
        link_to_csv: Drive file ID of the CSV; it is inserted into a
            ``https://drive.google.com/uc?id=...`` URL.
        service_account_file: Path to the service-account JSON key used to
            authenticate against the Drive API. Defaults to the file name the
            function previously hard-coded.
    """
    os.makedirs(data_path, exist_ok=True)

    full_link_to_csv = f'https://drive.google.com/uc?id={link_to_csv}'
    print(full_link_to_csv)
    transaction_csv_path = os.path.join(data_path, 'downloaded_file.csv')
    gdown.download(full_link_to_csv, transaction_csv_path, quiet=False)

    # NOTE(review): only list/download calls are made with this client, so a
    # read-only Drive scope would probably suffice — confirm before narrowing.
    scopes = ['https://www.googleapis.com/auth/drive']
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=scopes)
    service = build('drive', 'v3', credentials=credentials)

    download_files_from_folder(service, source_folder_with_reciepts, data_path)

    # Runs last so that every downloaded file is already in ``data_path``.
    pdfs_folder_to_images(data_path)