File size: 6,224 Bytes
5acc5a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75d91e5
 
5acc5a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from uploader import save_logs
import os 
import pandas as pd
from rapidfuzz import process, fuzz
from random import randint
from time import sleep
#other imports (google API)
def fuzzy_match(row, choices, scorer, cutoff):
    """Return the best fuzzy match for a bank-statement row's operation label.

    Compares the row's "Libellé d'opération" text against *choices* with
    rapidfuzz's ``process.extractOne`` using *scorer*; when no candidate
    reaches *cutoff*, returns the sentinel string "missing receipt".
    """
    label = row['Libellé d\'opération']
    best = process.extractOne(label, choices, scorer=scorer, score_cutoff=cutoff)
    return best[0] if best else "missing receipt"
def _to_float_series(series):
    """Coerce a free-form text column of monetary amounts to floats.

    Extracts the first numeric token (signed decimal or integer), strips
    thousand-separator spaces, converts decimal commas to dots, maps the
    literal "N/A" to -1, then casts to float.
    """
    float_regex = r'[-+]?\d*\.\d+|\d+'
    extracted = series.astype(str).str.extract(f'({float_regex})', expand=False)
    return (
        extracted
        .str.replace(" ", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.replace("N/A", "-1", regex=False)
        .astype(float)
    )


def reconciliate_and_upload(data_path,
    name_of_csv,
    folder_to_save_processed_reciepts,
    folder_to_save_reconciled_data,
    name_of_raw_transcripts = "transcript_raw.txt",
    name_of_output = "[AI Generated] Output.xlsx" ):
    """Reconcile OCR'd receipt transcripts against a bank CSV and upload the result.

    Reads the raw transcripts file and every ``.png`` receipt under
    *data_path*, uploads each receipt via ``save_logs`` (retrying once after a
    random back-off on failure), matches receipt amounts against the bank
    statement's ``Débit`` column, writes a two-sheet Excel workbook
    (*name_of_output*), uploads it, and returns the uploaded file's id.

    Parameters:
        data_path: directory holding the transcripts file, receipt PNGs and the bank CSV.
        name_of_csv: bank-statement CSV file name (first 3 rows are a banner and skipped).
        folder_to_save_processed_reciepts: Drive folder id/name for receipt uploads.
        folder_to_save_reconciled_data: Drive folder id/name for the final workbook.
        name_of_raw_transcripts: transcripts file name inside *data_path*.
        name_of_output: Excel file name written locally and uploaded.

    Returns:
        The id returned by ``save_logs`` for the uploaded workbook.
    """
    # SECURITY: eval() executes arbitrary code from the transcripts file —
    # only run this on trusted, locally produced transcripts.
    with open(f"{data_path}/{name_of_raw_transcripts}") as file:
        transcripts = eval(file.read())

    # Collect every receipt image below data_path, in os.walk order so the
    # list lines up positionally with the transcript list.
    imgs = []
    for root, dirs, files in os.walk(data_path):
        for file_name in files:
            if file_name.endswith('.png'):
                print(os.path.join(root, file_name))
                imgs.append({"path": os.path.join(root, file_name)})

    list_transcripts_evaled = []
    objects = []
    for i, t in enumerate(transcripts):
        # The model emits JSON-ish dicts containing "null"; map it to -1 so
        # the literal parses. SECURITY: eval() again — trusted input only.
        content = eval(t["content"].replace('null', '-1'))
        try:
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        except Exception:
            # Presumably an API rate limit — back off briefly and retry once.
            print("sleeping a bit innit")
            sleep(randint(30, 40))
            obk = save_logs(imgs[i]["path"], f"P{i+1}.png", folder_to_save_processed_reciepts)
        objects.append(obk)
        print("uploaded image!")
        try:
            list_transcripts_evaled.append({
                "path": imgs[i]["path"],
                "name_of_supplier": content["name_of_supplier"],
                "amount": content["amount"],
                "currency": content["currency"],
                "date": content["date"],
            })
        except KeyError as exc:
            # Malformed transcript: skip it rather than abort the whole run.
            print(f"skipping transcript {i}: missing field {exc}")

    urls = ["https://drive.google.com/file/d/" + ob["id"] for ob in objects]

    df_app = pd.DataFrame(list_transcripts_evaled)
    df_app['amount'] = _to_float_series(df_app['amount'])
    df_app["date"] = pd.to_datetime(df_app['date'], format="%d/%m/%Y", errors='coerce')
    df_app["url"] = urls
    df_app = df_app.drop_duplicates(["name_of_supplier", "amount", "date"]).reset_index(drop=True)

    print(data_path)
    print("NAME OF CSV: ", name_of_csv)
    # Bank export: the first 3 rows are a header banner, not data.
    df_opp_app = pd.read_csv(f"{data_path}/{name_of_csv}", skiprows=3)
    df_opp_app['Débit'] = _to_float_series(df_opp_app['Débit'])
    df_opp_app['Crédit'] = _to_float_series(df_opp_app['Crédit'])
    df_opp_app["Date"] = pd.to_datetime(df_opp_app['Date'], format="%d/%m/%Y", errors='coerce')

    # Join bank debits to receipt amounts by exact amount equality.
    merged_df_app = pd.merge(df_opp_app, df_app, left_on='Débit', right_on='amount',
                             how='left', suffixes=('_ops', '_df'))

    # Fuzzy-match each operation label against upper-cased supplier names.
    choices = [r.upper() for r in df_app['name_of_supplier'].tolist()]
    merged_df_app['fuzzy_matched_supplier'] = merged_df_app.apply(
        lambda row: fuzzy_match(row, choices, fuzz.WRatio, 80), axis=1)
    merged_df_app = merged_df_app.drop_duplicates(subset=["Date", "Valeur", "Libellé d'opération", "Débit"])

    # Receipts whose supplier never appeared in the merged result.
    df_residuals_app = df_app[~df_app['name_of_supplier'].isin(merged_df_app['name_of_supplier'])]

    # NOTE(review): the fuzzy-match result is dropped here, so it never
    # reaches the output workbook — confirm whether it should be kept.
    merged_df_app = merged_df_app.drop(
        columns=["name_of_supplier", "currency", "date", "path", "fuzzy_matched_supplier"])
    # Reassign (not inplace on a slice) to avoid mutating a view of df_app.
    df_residuals_app = df_residuals_app.drop(columns=["path"])

    # Render URLs as clickable Excel HYPERLINK formulas; blank when missing.
    as_link = lambda x: f'=HYPERLINK("{x}", "Voir Facture")' if pd.notna(x) else ''
    merged_df_app['url'] = merged_df_app['url'].apply(as_link)
    df_residuals_app['url'] = df_residuals_app['url'].apply(as_link)

    with pd.ExcelWriter(name_of_output) as writer:
        merged_df_app.to_excel(writer, sheet_name='Données réconciliées', index=False)
        df_residuals_app.to_excel(writer, sheet_name='Résidus et transactions introuvables', index=False)

    id_output = save_logs(name_of_output, name_of_output, folder_to_save_reconciled_data)

    return id_output