andreeabodea committed on
Commit
b7c096f
·
verified ·
1 Parent(s): bc1bd61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -253
app.py CHANGED
@@ -85,256 +85,4 @@ for file in files:
85
  print(results_dict)
86
 
87
  json_string = json.dumps(results_dict, indent=4)
88
- print(json_string)
89
-
90
- """
91
- def extract_section_text(pdf_path, start_section, end_section=None):
92
- Extract text from a specific section of a PDF.
93
-
94
- :param pdf_path: Path to the PDF file.
95
- :param start_section: The title of the section to start extracting text.
96
- :param end_section: The title of the section to stop extracting text (optional).
97
- :return: Extracted text from the specified section.
98
- text = ""
99
- section_started = False
100
- with fitz.open(pdf_path) as doc: # Open the PDF
101
- for page in doc: # Iterate through each page
102
- page_text = page.get_text("text") # Extract text from the current page
103
- if start_section in page_text and not section_started:
104
- # Start section found
105
- section_started = True
106
- text += page_text
107
- elif section_started:
108
- if end_section and end_section in page_text:
109
- # End section found, stop reading further
110
- break
111
- else:
112
- # Continue adding text from the section
113
- text += page_text
114
-
115
- # Optional: refine text extraction, if necessary
116
- if section_started:
117
- # If the start section is in the middle of the page, trim the text before it
118
- start_index = text.find(start_section)
119
- text = text[start_index:]
120
-
121
- if end_section:
122
- # If an end section is specified, trim the text after it
123
- end_index = text.find(end_section)
124
- if end_index != -1:
125
- text = text[:end_index]
126
-
127
- return text
128
-
129
- # create function to read pdf and extract appendix 1 with results matrix
130
- def get_appendix(pdf):
131
- #for each page, check whether it contains Anlage 1 and Anlage 2 to get relevant pages
132
- start_page = []
133
- end_page = []
134
- for page in range(len(pdf.pages)):
135
- if len(pdf.pages[page].search("Anlage 1: Wirkungsmatrix", return_chars=False, case = False)) > 0: # FOR PROJECTS
136
- # if len(pdf.pages[page].search("A1 - Wirkungsmatrix", return_chars=False, case=False)) > 0: # FOR PROGRAMS
137
- start_page.append(page)
138
- if len(pdf.pages[page].search("Anlage 2: Wirkungslogik", return_chars=False, case = False)) > 0: # FOR PROJECTS
139
- # if len(pdf.pages[page].search("A2 - Daten", return_chars=False, case = False)) > 0: # FOR PROGRAMS
140
- end_page.append(page)
141
- # return results
142
- return start_page, end_page
143
-
144
- # create function to parse table from results_matrix and transform to dataframe
145
- def extract_tables_from_pdf(start_page, end_page):
146
-
147
- # for each page in appendix
148
- for page in range(max(start_page), max(end_page)):
149
-
150
- try:
151
- # extract table(s)
152
- table = pdf.pages[page].extract_tables()[0]
153
- except IndexError:
154
- break
155
-
156
- print(table)
157
-
158
- # for each row of the table...
159
- for row_num in range(len(table)):
160
- row = table[row_num]
161
-
162
- # ...remove the line breakers from the wrapped texts
163
- cleaned_row = [item.replace("-\n", "") if item is not None and "-\n" in item
164
- else "None" if item is None
165
- else item for item in row]
166
-
167
- cleaned_row = [item.replace("\n", " ") if item is not None and "\n" in item
168
- else "None" if item is None
169
- else item for item in cleaned_row]
170
-
171
- # append row to results_matrix_list
172
- results_matrix_list.append(cleaned_row)
173
-
174
- return results_matrix_list
175
-
176
- # define function to extract programm-infos
177
- def extract_programm(table_rows_list, file_name):
178
- # define empty lists to save results
179
- programmziel = []
180
- pz_indikator = []
181
- basiswert = []
182
- zielwert = []
183
- istwert = []
184
-
185
- # for each row in results matrix (list), extract elements
186
- for row in table_rows_list:
187
- for i in row:
188
- if "Programmziel " in i:
189
- programmziel.append(i)
190
- else:
191
- pass
192
- if "Programmzielindikator" in i:
193
- pz_indikator.append(i)
194
- else:
195
- pass
196
-
197
- # extract values from impact indicators
198
- for indikator in pz_indikator:
199
- if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
200
- index1 = indikator.index("Basiswert:")
201
- index2 = indikator.index("Zielwert:")
202
- basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
203
- elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
204
- basiswert.append(indikator.split("Basiswert:")[1])
205
- else:
206
- basiswert.append("")
207
- if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
208
- index1 = indikator.index("Zielwert:")
209
- index2 = indikator.index("Istwert:")
210
- zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
211
- elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
212
- zielwert.append(indikator.split("Zielwert:")[1])
213
- else:
214
- zielwert.append("")
215
- if "Istwert:" in indikator:
216
- istwert.append(indikator.split("Istwert:")[1])
217
- else:
218
- istwert.append("")
219
-
220
- # create dataframes for each tier (programm, modul, output)
221
- programm = p
222
-
223
- # extract values from outcome indicators
224
- for indikator in mz_indikator:
225
- if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
226
- index1 = indikator.index("Basiswert:")
227
- index2 = indikator.index("Zielwert:")
228
- basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
229
- elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
230
- basiswert.append(indikator.split("Basiswert:")[1])
231
- else:
232
- basiswert.append("")
233
- if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
234
- index1 = indikator.index("Zielwert:")
235
- index2 = indikator.index("Istwert:")
236
- zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
237
- elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
238
- zielwert.append(indikator.split("Zielwert:")[1])
239
- else:
240
- zielwert.append("")
241
- if "Istwert:" in indikator:
242
- istwert.append(indikator.split("Istwert:")[1])
243
- else:
244
- istwert.append("")
245
-
246
- # create dataframes for each tier (programm, modul, output)
247
- outcome = pd.DataFrame.from_dict({"ziel":modulziel, "indikator":mz_indikator,"basiswert": basiswert,
248
- "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(mz_indikator)},
249
- orient="index")
250
- outcome = outcome.transpose()
251
-
252
- return outcome
253
-
254
- # define function for outputs
255
- def extract_outputs(table_rows_list,file_name):
256
- # define empty lists to save results
257
- output = []
258
- output_indikator = []
259
- basiswert = []
260
- zielwert = []
261
- istwert = []
262
-
263
- # for each row in results matrix (list), extract elements
264
- for row in table_rows_list:
265
- for i in row:
266
- if "Output " in i:
267
- output.append(i)
268
- else:
269
- pass
270
- if "Outputindikator" in i:
271
- output_indikator.append(i)
272
- else:
273
- pass
274
-
275
- # extract values from output indicators
276
- for indikator in output_indikator:
277
- if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
278
- index1 = indikator.index("Basiswert:")
279
- index2 = indikator.index("Zielwert:")
280
- basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
281
- elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
282
- basiswert.append(indikator.split("Basiswert:")[1])
283
- else:
284
- basiswert.append("")
285
- if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
286
- index1 = indikator.index("Zielwert:")
287
- index2 = indikator.index("Istwert:")
288
- zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
289
- elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
290
- zielwert.append(indikator.split("Zielwert:")[1])
291
- else:
292
- zielwert.append("")
293
- if "Istwert:" in indikator:
294
- istwert.append(indikator.split("Istwert:")[1])
295
- else:
296
- istwert.append("")
297
-
298
- # create dataframes for each tier (programm, modul, output)
299
- output = pd.DataFrame.from_dict({"output":output, "indikator":output_indikator, "basiswert": basiswert,
300
- "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(output_indikator)},
301
- orient = "index")
302
- output = output.transpose()
303
- return output
304
-
305
- # apply functions to files
306
- #Define global dataframes to store results from all files
307
- programme = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
308
- outcomes = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
309
- outputs = pd.DataFrame(columns = ["output", "indikator", "basiswert", "zielwert", "istwert", "datei"])
310
-
311
-
312
-
313
- print("...and extract table and store as list")
314
- results_matrix_list = extract_tables_from_pdf(start_page, end_page)
315
-
316
- print("...extract programm information")
317
- programm = extract_programm(results_matrix_list, file)
318
-
319
- print("...extract modul information")
320
- outcome = extract_modul(results_matrix_list, file)
321
-
322
- print("...extract outputs")
323
- output = extract_outputs(results_matrix_list, file)
324
-
325
- print("...add results from extract functions to global dataframe")
326
- programme = pd.concat([programme, programm], ignore_index=True)
327
- outcomes = pd.concat([outcomes, outcome], ignore_index=True)
328
- outputs = pd.concat([outputs, output], ignore_index=True)
329
-
330
- # write results to csv file
331
- programme.to_csv("/Users/andreeabodea/programme.csv", sep="|", index=False, decimal=",")
332
- outcomes.to_csv("/Users/andreeabodea/module_outcomes.csv", sep="|", index=False, decimal=",")
333
- outputs.to_csv("/Users/andreeabodea/module_outputs.csv", sep="|", index=False, decimal=",")
334
-
335
- print(programme)
336
- print(outcomes)
337
- print(outputs)
338
-
339
-
340
- """
 
85
  print(results_dict)
86
 
87
  json_string = json.dumps(results_dict, indent=4)
88
+ print(json_string)