Adr740 committed on
Commit
5acc5a8
·
verified ·
1 Parent(s): 9b91941

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +41 -0
  2. main.py +41 -0
  3. reconciliate_and_upload.py +140 -0
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from config import json_url_id
2
+ from main import run_main
3
+ import gdown
4
+ import gradio as gr
5
+
6
+
7
+
8
# Fetch the Google service-account credentials from Drive at start-up.
# `json_url_id` is provided by the local `config` module.
output = 'secret_google_service_account.json'
download_url = f'https://drive.google.com/uc?id={json_url_id}'
gdown.download(download_url, output, quiet=False)


# Single-page UI: a column of text inputs plus a submit button, and a
# Markdown area that displays whatever `run_main` returns.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened diff - confirm against the deployed layout.
with gr.Blocks(title="Accountant automation", theme='nota-ai/theme') as demo:

    with gr.Row():
        with gr.Column(scale=6):
            with gr.Row():
                with gr.Column(scale=3):
                    source_folder_with_reciepts = gr.Textbox(
                        placeholder="Dossier contenant les factures",
                        lines=1,
                    )
                    link_to_csv = gr.Textbox(
                        placeholder="Lien vers le relevé de compte en csv",
                        lines=1,
                    )
                    folder_to_save_processed_reciepts = gr.Textbox(
                        placeholder="Dossier où sauvegarder les factures",
                        lines=1,
                    )
                    folder_to_save_reconciled_data = gr.Textbox(
                        placeholder="Dossier où sauvegarder le tableau final",
                        lines=1,
                    )
                    name_output_file = gr.Textbox(
                        placeholder="Nom du fichier de tableau final",
                        lines=1,
                    )
                    # NOTE(review): the placeholder says "Company Name" but the
                    # value is forwarded to run_main as `transaction_csv_path`.
                    transaction_csv_path = gr.Textbox(
                        placeholder="Company Name",
                        lines=1,
                    )
                    chat_submit_button = gr.Button(value="Submit ▶")

        with gr.Column(scale=6):
            chat_output = gr.Markdown("Appuyez sur valider pour lance le processing")

    # Callback executed when the submit button is clicked.
    fn_chat = run_main

    # Textbox values are passed positionally, in this order, to `run_main`.
    pipeline_inputs = [
        source_folder_with_reciepts,
        link_to_csv,
        folder_to_save_processed_reciepts,
        folder_to_save_reconciled_data,
        name_output_file,
        transaction_csv_path,
    ]
    chat_submit_button.click(fn=fn_chat, inputs=pipeline_inputs, outputs=[chat_output])

demo.launch(max_threads=40)
main.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #USER INPUT
2
+ from preprocessing import run_preprocessing
3
+ from ai_transcriber import transcribe_all
4
+ from reconciliate_and_upload import reconciliate_and_upload
5
+ import os
6
_DEFAULT_OUTPUT_NAME = "[AI generated] June 2024.xlsx"
_DEFAULT_TRANSACTION_CSV = 'downloaded_file.csv'


def _drive_id_from_link(link):
    """Return the Google Drive id embedded in a share link (or the bare id)."""
    link = (link or "").strip()
    path, _, query = link.partition("?")
    # Links of the form ".../open?id=<ID>" or ".../uc?id=<ID>" carry the id
    # in the query string rather than in the path.
    for pair in query.split("&"):
        key, _, value = pair.partition("=")
        if key == "id" and value:
            return value
    segments = [part for part in path.split("/") if part]
    # File links end with an action segment after the id
    # (".../d/<ID>/view", ".../d/<ID>/edit"); skip it.
    if len(segments) > 1 and segments[-1] in ("view", "edit", "preview"):
        segments.pop()
    return segments[-1] if segments else ""


def run_main(
    source_folder_with_reciepts,
    link_to_csv,
    folder_to_save_processed_reciepts,
    folder_to_save_reconciled_data,
    name_output_file=_DEFAULT_OUTPUT_NAME,
    transaction_csv_path=_DEFAULT_TRANSACTION_CSV,
    data_path="trial2",
):
    """Run the whole pipeline: pre-processing, transcription, reconciliation.

    Args:
        source_folder_with_reciepts: Drive link (or id) of the folder holding
            the receipts.
        link_to_csv: Drive link (or id) of the bank statement CSV.
        folder_to_save_processed_reciepts: Drive link (or id) of the folder
            receiving the processed receipts.
        folder_to_save_reconciled_data: Drive link (or id) of the folder
            receiving the final spreadsheet.
        name_output_file: Name of the final spreadsheet; ".xlsx" is appended
            when missing. A blank value falls back to the default.
        transaction_csv_path: File name of the statement CSV inside
            ``data_path``. A blank value falls back to the default.
        data_path: Local working directory shared by the pipeline steps.

    Returns:
        A Markdown link to the uploaded spreadsheet.
    """
    # NOTE(review): this installs poppler-utils on every call; consider moving
    # it to the image/start-up step.
    os.system("apt update; yes | apt-get install poppler-utils; yes | ls")

    # A UI textbox left empty submits "" rather than omitting the argument, so
    # the signature defaults would never apply without this fallback.
    name_output_file = (name_output_file or "").strip() or _DEFAULT_OUTPUT_NAME
    transaction_csv_path = (transaction_csv_path or "").strip() or _DEFAULT_TRANSACTION_CSV

    source_folder_with_reciepts = _drive_id_from_link(source_folder_with_reciepts)
    folder_to_save_processed_reciepts = _drive_id_from_link(folder_to_save_processed_reciepts)
    folder_to_save_reconciled_data = _drive_id_from_link(folder_to_save_reconciled_data)
    link_to_csv = _drive_id_from_link(link_to_csv)
    print("Extracted link csv id: ", link_to_csv)

    if not name_output_file.endswith(".xlsx"):
        name_output_file += ".xlsx"

    run_preprocessing(data_path, source_folder_with_reciepts, link_to_csv)
    print("Done pre-processing!")
    transcribe_all(data_path)
    print("Done transcription!")
    id_output = reconciliate_and_upload(
        data_path,
        name_of_csv=transaction_csv_path,
        folder_to_save_processed_reciepts=folder_to_save_processed_reciepts,
        folder_to_save_reconciled_data=folder_to_save_reconciled_data,
        name_of_output=name_output_file,
    )

    # reconciliate_and_upload returns the raw upload result, which the
    # uploader's callers index with ["id"]; accept both that and a plain id.
    try:
        file_id = id_output["id"]
    except (TypeError, KeyError, IndexError):
        file_id = id_output

    url_output_file = "https://drive.google.com/file/d/" + str(file_id)
    display = f"[Voir tableau final]({url_output_file})"
    print("Done all!")
    return display
reconciliate_and_upload.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from uploader import save_logs
2
+ import os
3
+ import pandas as pd
4
+ from rapidfuzz import process, fuzz
5
+ from random import randint
6
+ from time import sleep
7
+ #other imports (google API)
8
def fuzzy_match(row, choices, scorer, cutoff):
    """Return the choice that best matches the row's operation label.

    The label is read from the row's "Libellé d'opération" entry and compared
    against ``choices`` with ``scorer``; candidates scoring below ``cutoff``
    are ignored. When nothing qualifies, "missing receipt" is returned.
    """
    label = row["Libellé d'opération"]
    best = process.extractOne(label, choices, scorer=scorer, score_cutoff=cutoff)
    return best[0] if best else "missing receipt"
13
def _list_png_files(root_dir):
    """Return (and print) the path of every ``.png`` file under *root_dir*."""
    found = []
    for root, _dirs, files in os.walk(root_dir):
        for file_name in files:
            if file_name.endswith('.png'):
                full_path = os.path.join(root, file_name)
                print(full_path)
                found.append(full_path)
    return found


def _upload_with_retry(local_path, remote_name, folder_id):
    """Upload through ``save_logs``; on failure wait 30-40 s and retry once."""
    try:
        return save_logs(local_path, remote_name, folder_id)
    except Exception as error:
        print("sleeping a bit innit")
        print(f"upload of {local_path} failed ({error!r}); retrying once")
        sleep(randint(30, 40))
        return save_logs(local_path, remote_name, folder_id)


def _parse_amounts(column):
    """Convert a column of free-form amounts to floats (NaN if unparseable)."""
    # Normalise BEFORE extracting the number: otherwise "1 234,56" is cut at
    # the first space/comma and parsed as 1.
    text = (
        column.astype(str)
        .str.replace(r"\s+", "", regex=True)
        .str.replace(",", ".", regex=False)
        .str.replace("N/A", "-1", regex=False)
    )
    # The optional sign applies to both integer and decimal forms, so the
    # "-1" placeholder used for missing amounts stays negative.
    return text.str.extract(r"([-+]?(?:\d*\.\d+|\d+))", expand=False).astype(float)


def _as_hyperlink(url):
    """Wrap *url* in a spreadsheet HYPERLINK formula; empty string for NaN."""
    return f'=HYPERLINK("{url}", "Voir Facture")' if pd.notna(url) else ''


def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts="transcript_raw.txt",
                            name_of_output="[AI Generated] Output.xlsx"):
    """Match receipt transcripts to bank transactions and upload the result.

    Steps:
      1. Read the raw transcripts from ``data_path/name_of_raw_transcripts``.
      2. Upload each receipt image (``P<n>.png``) with ``save_logs`` to
         ``folder_to_save_processed_reciepts``.
      3. Left-join the bank statement ``data_path/name_of_csv`` with the
         receipts on ``Débit == amount``.
      4. Write an Excel workbook with the reconciled rows and the receipts
         that matched no transaction, then upload it to
         ``folder_to_save_reconciled_data``.

    Returns:
        Whatever ``save_logs`` returns for the uploaded workbook.
    """
    # NOTE(review): eval() executes whatever the transcript file contains;
    # if the transcripts are plain literals, ast.literal_eval / json.loads
    # would be a safer replacement.
    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = eval(file.read())

    image_paths = _list_png_files(data_path)

    records = []
    for i, transcript in enumerate(transcripts):
        # NOTE(review): same eval() concern as above.
        content = eval(transcript["content"].replace('null', '-1'))
        # NOTE(review): transcript i is paired with the i-th PNG found by
        # os.walk - presumably both follow the same order; verify.
        image_path = image_paths[i]
        uploaded = _upload_with_retry(
            image_path, f"P{i+1}.png", folder_to_save_processed_reciepts
        )
        print("uploaded image!")
        url = "https://drive.google.com/file/d/" + uploaded["id"]
        try:
            # The URL is stored with the record so that a skipped transcript
            # cannot shift the links of the following receipts.
            records.append({
                "path": image_path,
                "name_of_supplier": content["name_of_supplier"],
                "amount": content["amount"],
                "currency": content["currency"],
                "date": content["date"],
                "url": url,
            })
        except (KeyError, TypeError) as error:
            print(f"skipping transcript {i + 1}: {error!r}")

    # Explicit columns keep the frame usable when there are no records.
    df_app = pd.DataFrame(
        records,
        columns=["path", "name_of_supplier", "amount", "currency", "date", "url"],
    )
    df_app["amount"] = _parse_amounts(df_app["amount"])
    df_app["date"] = pd.to_datetime(df_app["date"], format="%d/%m/%Y", errors="coerce")
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    for column in ("Débit", "Crédit"):
        df_opp_app[column] = _parse_amounts(df_opp_app[column])
    df_opp_app["Date"] = pd.to_datetime(df_opp_app["Date"], format="%d/%m/%Y", errors="coerce")

    # pandas matches NaN keys with each other, so receipts without a usable
    # amount are kept out of the join; they end up in the residual sheet.
    receipts_with_amount = df_app.dropna(subset=["amount"])
    merged_df_app = pd.merge(
        df_opp_app,
        receipts_with_amount,
        left_on="Débit",
        right_on="amount",
        how="left",
        suffixes=("_ops", "_df"),
    )
    merged_df_app = merged_df_app.drop_duplicates(
        subset=["Date", "Valeur", "Libellé d'opération", "Débit"]
    )

    # Residuals are the receipts attached to no transaction. They are
    # identified by image path (unique per receipt) rather than supplier name,
    # so two receipts from the same supplier are tracked independently.
    matched_paths = merged_df_app["path"].dropna()
    df_residuals_app = df_app[~df_app["path"].isin(matched_paths)].copy()
    df_residuals_app = df_residuals_app.drop(columns=["path"])

    # NOTE(review): a fuzzy-matched supplier column used to be computed here
    # and dropped before export; the exported columns are unchanged. Call
    # fuzzy_match again if the supplier name should appear in the sheet.
    merged_df_app = merged_df_app.drop(
        columns=["name_of_supplier", "currency", "date", "path"]
    )
    merged_df_app["url"] = merged_df_app["url"].apply(_as_hyperlink)
    df_residuals_app["url"] = df_residuals_app["url"].apply(_as_hyperlink)

    # NOTE(review): the second sheet name is 36 characters long, above Excel's
    # 31-character limit; some writers/readers reject it - confirm.
    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)

    return id_output