# Scraped from a Hugging Face Space (build status at capture time: Runtime error).
# Original file size: 6,224 bytes; commits 5acc5a8 / 75d91e5.
import ast
import os
from random import randint
from time import sleep

import pandas as pd
from rapidfuzz import process, fuzz

from uploader import save_logs
# other imports (google API)
def fuzzy_match(row, choices, scorer, cutoff):
    """Fuzzy-match a bank-statement row's label against known supplier names.

    Parameters:
        row: a DataFrame row carrying a "Libellé d'opération" column.
        choices: list of candidate supplier names (upper-cased by the caller).
        scorer: rapidfuzz scorer, e.g. ``fuzz.WRatio``.
        cutoff: minimum score (0-100) below which no match is reported.

    Returns:
        The best-matching supplier name, or ``"missing receipt"`` when no
        candidate reaches the cutoff.
    """
    label = row["Libellé d'opération"]
    # CSV rows can carry NaN (a float) in the label column; rapidfuzz raises
    # on non-string queries, so treat those as "no match".
    if not isinstance(label, str):
        return "missing receipt"
    match = process.extractOne(label, choices, scorer=scorer, score_cutoff=cutoff)
    if match:
        return match[0]
    return "missing receipt"
def _clean_amount(series, float_regex):
    """Extract the first numeric token from a string series and coerce to float.

    Normalises French formatting (comma decimal separator, embedded spaces)
    and maps "N/A" to -1 so the column can be cast to float.
    """
    return (
        series.astype(str)
        .str.extract(f'({float_regex})', expand=False)
        .str.replace(" ", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.replace("N/A", "-1", regex=False)
        .astype(float)
    )


def reconciliate_and_upload(data_path,
                            name_of_csv,
                            folder_to_save_processed_reciepts,
                            folder_to_save_reconciled_data,
                            name_of_raw_transcripts = "transcript_raw.txt",
                            name_of_output = "[AI Generated] Output.xlsx" ):
    """Reconcile OCR'd receipt transcripts against a bank CSV and upload results.

    Reads receipt transcripts and .png receipt images from ``data_path``,
    uploads each image via ``save_logs``, fuzzy-matches suppliers against the
    bank statement ``name_of_csv`` (skipping its 3 header rows), writes a
    two-sheet Excel workbook (reconciled rows + unmatched residuals with
    Drive hyperlinks), uploads it, and returns the uploaded file's id.
    """
    float_regex = r'[-+]?\d*\.\d+|\d+'

    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        # literal_eval instead of eval: transcripts are data, not trusted code.
        transcripts = ast.literal_eval(file.read())

    # Collect every receipt image (.png) under data_path, in walk order.
    imgs = []
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            if fname.endswith('.png'):
                full_path = os.path.join(root, fname)
                print(full_path)
                imgs.append({"path": full_path})

    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        # Transcript content is a dict literal using JSON-style 'null'.
        content = ast.literal_eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Drive uploads can be rate-limited; back off once and retry.
            # (Removed leftover breakpoint() — it hangs non-interactive runs.)
            print("sleeping a bit innit")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        objects.append(obk)
        print("uploaded image!")
        # .get() defaults keep exactly one entry per uploaded image, so the
        # url column assigned below stays aligned with df_app's rows.
        list_transcripts_evaled.append({
            "path": imgs[i]["path"],
            "name_of_supplier": content.get("name_of_supplier", "N/A"),
            "amount": content.get("amount", "N/A"),
            "currency": content.get("currency", "N/A"),
            "date": content.get("date", "N/A")})

    urls = ["https://drive.google.com/file/d/" + ob["id"] for ob in objects]

    df_app = pd.DataFrame(list_transcripts_evaled)
    df_app['amount'] = _clean_amount(df_app['amount'], float_regex)
    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    print(data_path)
    print("NAME OF CSV: ", name_of_csv)
    # skiprows=3: the bank export carries 3 metadata lines before the header.
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    df_opp_app['Débit'] = _clean_amount(df_opp_app['Débit'], float_regex)
    df_opp_app['Crédit'] = _clean_amount(df_opp_app['Crédit'], float_regex)
    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    # Merge bank lines with receipts on the debited amount. (The original
    # code performed an earlier merge whose result was immediately
    # overwritten here — that dead code has been removed.)
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount',
                             how='left', suffixes=('_ops', '_df'))

    # Fuzzy-match each bank line's label against upper-cased supplier names.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(
        lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(
        subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Receipts that never matched any bank line. .copy() so the inplace
    # drop below acts on a real frame, not a view of df_app.
    df_residuals_app = df_app[
        ~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])].copy()

    merged_df_app['name_of_supplier'] = merged_df_app['fuzzy_matched_supplier']
    # NOTE(review): name_of_supplier is dropped right after being overwritten,
    # so the fuzzy result does not appear in the output — preserved as-is,
    # but confirm this is intentional.
    merged_df_app.drop(columns=["name_of_supplier", "currency", "date", "path",
                                "fuzzy_matched_supplier"], inplace=True)
    df_residuals_app.drop(columns=["path"], inplace=True)

    # Excel HYPERLINK formulas so reviewers can open each receipt from Drive.
    def _hyperlink(x):
        return f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else ''

    merged_df_app['url'] = merged_df_app['url'].apply(_hyperlink)
    df_residuals_app['url'] = df_residuals_app['url'].apply(_hyperlink)

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)
    return id_output