andreeabodea committed on
Commit
fd73c59
·
verified ·
1 Parent(s): 306c9b5

Uploaded code for extraction of the text from the sections of a PDF monitoring report of a project

Browse files
Files changed (1) hide show
  1. extraction_project_report.py +340 -0
extraction_project_report.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import pdfplumber
4
+ import re
5
+ import fitz # PyMuPDF
6
+ import json
7
+
8
# Gather the file names of every PDF stored in the input directory;
# the extraction loop further down iterates over this list.
files = list(
    filter(
        lambda name: name.endswith(".pdf"),
        os.listdir("/Users/andreeabodea/"),
    )
)
print(files)
10
+
11
def get_section(path, wanted_section, next_section):
    """Extract the text of one section of a PDF report.

    The section is located page-wise. The last page on which
    ``wanted_section`` is found is the first page extracted, and the last
    page on which ``next_section`` is found is the (exclusive) stop page.
    Using the *last* hit presumably skips earlier mentions of the heading
    such as a table of contents -- TODO confirm against real reports.

    Parameters:
    - path (str): The file path to the PDF file.
    - wanted_section (str): Heading of the section to start extracting at.
    - next_section (str): Heading of the section to stop extracting at.

    Returns:
    - text (str): The text of the selected pages joined into one line
      (newlines replaced by spaces), with the German characters
      ä, ö, ü and ß transliterated to ae, oe, ue and ss.

    Raises:
    - ValueError: If either heading cannot be found on any page.
    """
    print(wanted_section)

    # The context manager releases the PDF handle even if extraction fails.
    with pdfplumber.open(path) as doc:
        start_pages = []
        end_pages = []

        # Find all the pages on which the two headings occur.
        for page_num, page in enumerate(doc.pages):
            if page.search(wanted_section, return_chars=False, case=False):
                start_pages.append(page_num)
            if page.search(next_section, return_chars=False, case=False):
                end_pages.append(page_num)

        # Fail with a message that names the missing heading instead of the
        # opaque "max() arg is an empty sequence" (same exception type).
        if not start_pages:
            raise ValueError(
                f"Section heading {wanted_section!r} not found in {path!r}"
            )
        if not end_pages:
            raise ValueError(
                f"Section heading {next_section!r} not found in {path!r}"
            )

        first_page = max(start_pages)
        end_page = max(end_pages)
        print(first_page)
        print(end_page)

        # The stop page is exclusive, but always keep at least the page the
        # wanted heading was found on; otherwise a section that starts and
        # ends on the same page would come back empty.
        stop_page = max(end_page, first_page + 1)

        # extract_text() may yield None for a page without a text layer.
        page_texts = [
            doc.pages[page_num].extract_text() or ""
            for page_num in range(first_page, stop_page)
        ]

    flat_text = " ".join(page_texts).replace("\n", " ")

    # Apply all four replacements in a single pass. The earlier loop
    # restarted from the untouched text on every iteration, so only the
    # last replacement (ß -> ss) was ever visible in the result.
    transliteration = str.maketrans(
        {
            "\u00e4": "ae",
            "\u00f6": "oe",
            "\u00fc": "ue",
            "\u00df": "ss",
        }
    )
    return flat_text.translate(transliteration)
49
+
50
# One entry per report section: (key in the result dict, heading that starts
# the section, heading that ends it). Some search headings are deliberately
# shorter than the key they are stored under.
_SECTIONS = (
    (
        "2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm",
        "2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm",
        "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls",
    ),
    (
        "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls",
        "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls",
        "3. Entwicklungen im Interventionsbereich",
    ),
    (
        "3. Entwicklungen im Interventionsbereich",
        "3. Entwicklungen im Interventionsbereich",
        "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren",
    ),
    (
        "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren",
        "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren",
        "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums",
    ),
    (
        "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums",
        "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums",
        "4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit",
    ),
    (
        "4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit des Vorhabens",
        "4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit",
        "4.4 Laufzeit und Zeitplan",
    ),
    (
        "4.4 Laufzeit und Zeitplan",
        "4.4 Laufzeit und Zeitplan",
        "4.5 Entstandene Kosten und Kostenverschiebungen",
    ),
    (
        "4.5 Entstandene Kosten und Kostenverschiebungen",
        "4.5 Entstandene Kosten und Kostenverschiebungen",
        "4.6 Bewertung der Wirkungen und Risiken",
    ),
    (
        "4.6 Bewertung der Wirkungen und Risiken",
        "4.6 Bewertung der Wirkungen und Risiken",
        "5. Übergeordnete Empfehlungen",
    ),
    (
        "5.1 Empfehlungen und Merkposten für den Politik- und Schwerpunktdialog",
        "5.1 Empfehlungen und Merkposten für den Politik- und Schwerpunktdialog",
        "5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige EZ-Programme",
    ),
    (
        "5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige EZ-Programme interessant sein könnten",
        "5.2 Lernerfahrungen",
        "6. Testat",
    ),
    (
        "6. Testat (TZ)",
        "6. Testat",
        "Anlage 1: Wirkungsmatrix des Moduls",
    ),
)

# Extract every configured section from each PDF report and print the result
# both as a dict and as indented JSON.
for file in files:

    print("for each pdf file...")
    path = os.path.join("/Users/andreeabodea/", file)
    print(path)

    # get_section() opens the PDF itself, so no handle is opened here; the
    # previous extra pdfplumber.open(path) was never used and never closed.
    results_dict = {}
    for section_key, start_heading, end_heading in _SECTIONS:
        # Each section is extracted exactly once (section 2.1 used to be
        # extracted twice, repeating a full scan of the PDF).
        results_dict[section_key] = get_section(path, start_heading, end_heading)

    # NOTE(review): indentation was lost in the recovered source; the two
    # prints and the JSON dump are kept inside the loop so that every report
    # is emitted -- confirm this matches the intended behavior.
    print(results_dict)

    json_string = json.dumps(results_dict, indent=4)
    print(json_string)
89
+
90
+ """
91
+ def extract_section_text(pdf_path, start_section, end_section=None):
92
+ Extract text from a specific section of a PDF.
93
+
94
+ :param pdf_path: Path to the PDF file.
95
+ :param start_section: The title of the section to start extracting text.
96
+ :param end_section: The title of the section to stop extracting text (optional).
97
+ :return: Extracted text from the specified section.
98
+ text = ""
99
+ section_started = False
100
+ with fitz.open(pdf_path) as doc: # Open the PDF
101
+ for page in doc: # Iterate through each page
102
+ page_text = page.get_text("text") # Extract text from the current page
103
+ if start_section in page_text and not section_started:
104
+ # Start section found
105
+ section_started = True
106
+ text += page_text
107
+ elif section_started:
108
+ if end_section and end_section in page_text:
109
+ # End section found, stop reading further
110
+ break
111
+ else:
112
+ # Continue adding text from the section
113
+ text += page_text
114
+
115
+ # Optional: refine text extraction, if necessary
116
+ if section_started:
117
+ # If the start section is in the middle of the page, trim the text before it
118
+ start_index = text.find(start_section)
119
+ text = text[start_index:]
120
+
121
+ if end_section:
122
+ # If an end section is specified, trim the text after it
123
+ end_index = text.find(end_section)
124
+ if end_index != -1:
125
+ text = text[:end_index]
126
+
127
+ return text
128
+
129
+ # create function to read pdf and extract appendix 1 with results matrix
130
+ def get_appendix(pdf):
131
+ #for each page, check whether it contains Anlage 1 and Anlage 2 to get relevant pages
132
+ start_page = []
133
+ end_page = []
134
+ for page in range(len(pdf.pages)):
135
+ if len(pdf.pages[page].search("Anlage 1: Wirkungsmatrix", return_chars=False, case = False)) > 0: # FOR PROJECTS
136
+ # if len(pdf.pages[page].search("A1 - Wirkungsmatrix", return_chars=False, case=False)) > 0: # FOR PROGRAMS
137
+ start_page.append(page)
138
+ if len(pdf.pages[page].search("Anlage 2: Wirkungslogik", return_chars=False, case = False)) > 0: # FOR PROJECTS
139
+ # if len(pdf.pages[page].search("A2 - Daten", return_chars=False, case = False)) > 0: # FOR PROGRAMS
140
+ end_page.append(page)
141
+ # return results
142
+ return start_page, end_page
143
+
144
+ # create function to parse table from results_matrix and transform to dataframe
145
+ def extract_tables_from_pdf(start_page, end_page):
146
+
147
+ # for each page in appendix
148
+ for page in range(max(start_page), max(end_page)):
149
+
150
+ try:
151
+ # extract table(s)
152
+ table = pdf.pages[page].extract_tables()[0]
153
+ except IndexError:
154
+ break
155
+
156
+ print(table)
157
+
158
+ # for each row of the table...
159
+ for row_num in range(len(table)):
160
+ row = table[row_num]
161
+
162
+ # ...remove the line breaks from the wrapped texts
163
+ cleaned_row = [item.replace("-\n", "") if item is not None and "-\n" in item
164
+ else "None" if item is None
165
+ else item for item in row]
166
+
167
+ cleaned_row = [item.replace("\n", " ") if item is not None and "\n" in item
168
+ else "None" if item is None
169
+ else item for item in cleaned_row]
170
+
171
+ # append row to results_matrix_list
172
+ results_matrix_list.append(cleaned_row)
173
+
174
+ return results_matrix_list
175
+
176
+ # define function to extract programm-infos
177
+ def extract_programm(table_rows_list, file_name):
178
+ # define empty lists to save results
179
+ programmziel = []
180
+ pz_indikator = []
181
+ basiswert = []
182
+ zielwert = []
183
+ istwert = []
184
+
185
+ # for each row in results matrix (list), extract elements
186
+ for row in table_rows_list:
187
+ for i in row:
188
+ if "Programmziel " in i:
189
+ programmziel.append(i)
190
+ else:
191
+ pass
192
+ if "Programmzielindikator" in i:
193
+ pz_indikator.append(i)
194
+ else:
195
+ pass
196
+
197
+ # extract values from impact indicators
198
+ for indikator in pz_indikator:
199
+ if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
200
+ index1 = indikator.index("Basiswert:")
201
+ index2 = indikator.index("Zielwert:")
202
+ basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
203
+ elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
204
+ basiswert.append(indikator.split("Basiswert:")[1])
205
+ else:
206
+ basiswert.append("")
207
+ if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
208
+ index1 = indikator.index("Zielwert:")
209
+ index2 = indikator.index("Istwert:")
210
+ zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
211
+ elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
212
+ zielwert.append(indikator.split("Zielwert:")[1])
213
+ else:
214
+ zielwert.append("")
215
+ if "Istwert:" in indikator:
216
+ istwert.append(indikator.split("Istwert:")[1])
217
+ else:
218
+ istwert.append("")
219
+
220
+ # create dataframes for each tier (programm, modul, output)
221
+ programm = p
222
+
223
+ # extract values from outcome indicators
224
+ for indikator in mz_indikator:
225
+ if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
226
+ index1 = indikator.index("Basiswert:")
227
+ index2 = indikator.index("Zielwert:")
228
+ basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
229
+ elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
230
+ basiswert.append(indikator.split("Basiswert:")[1])
231
+ else:
232
+ basiswert.append("")
233
+ if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
234
+ index1 = indikator.index("Zielwert:")
235
+ index2 = indikator.index("Istwert:")
236
+ zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
237
+ elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
238
+ zielwert.append(indikator.split("Zielwert:")[1])
239
+ else:
240
+ zielwert.append("")
241
+ if "Istwert:" in indikator:
242
+ istwert.append(indikator.split("Istwert:")[1])
243
+ else:
244
+ istwert.append("")
245
+
246
+ # create dataframes for each tier (programm, modul, output)
247
+ outcome = pd.DataFrame.from_dict({"ziel":modulziel, "indikator":mz_indikator,"basiswert": basiswert,
248
+ "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(mz_indikator)},
249
+ orient="index")
250
+ outcome = outcome.transpose()
251
+
252
+ return outcome
253
+
254
+ # define function for outputs
255
+ def extract_outputs(table_rows_list,file_name):
256
+ # define empty lists to save results
257
+ output = []
258
+ output_indikator = []
259
+ basiswert = []
260
+ zielwert = []
261
+ istwert = []
262
+
263
+ # for each row in results matrix (list), extract elements
264
+ for row in table_rows_list:
265
+ for i in row:
266
+ if "Output " in i:
267
+ output.append(i)
268
+ else:
269
+ pass
270
+ if "Outputindikator" in i:
271
+ output_indikator.append(i)
272
+ else:
273
+ pass
274
+
275
+ # extract values from output indicators
276
+ for indikator in output_indikator:
277
+ if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
278
+ index1 = indikator.index("Basiswert:")
279
+ index2 = indikator.index("Zielwert:")
280
+ basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
281
+ elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
282
+ basiswert.append(indikator.split("Basiswert:")[1])
283
+ else:
284
+ basiswert.append("")
285
+ if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
286
+ index1 = indikator.index("Zielwert:")
287
+ index2 = indikator.index("Istwert:")
288
+ zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
289
+ elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
290
+ zielwert.append(indikator.split("Zielwert:")[1])
291
+ else:
292
+ zielwert.append("")
293
+ if "Istwert:" in indikator:
294
+ istwert.append(indikator.split("Istwert:")[1])
295
+ else:
296
+ istwert.append("")
297
+
298
+ # create dataframes for each tier (programm, modul, output)
299
+ output = pd.DataFrame.from_dict({"output":output, "indikator":output_indikator, "basiswert": basiswert,
300
+ "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(output_indikator)},
301
+ orient = "index")
302
+ output = output.transpose()
303
+ return output
304
+
305
+ # apply functions to files
306
+ #Define global dataframes to store results from all files
307
+ programme = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
308
+ outcomes = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
309
+ outputs = pd.DataFrame(columns = ["output", "indikator", "basiswert", "zielwert", "istwert", "datei"])
310
+
311
+
312
+
313
+ print("...and extract table and store as list")
314
+ results_matrix_list = extract_tables_from_pdf(start_page, end_page)
315
+
316
+ print("...extract programm information")
317
+ programm = extract_programm(results_matrix_list, file)
318
+
319
+ print("...extract modul information")
320
+ outcome = extract_modul(results_matrix_list, file)
321
+
322
+ print("...extract outputs")
323
+ output = extract_outputs(results_matrix_list, file)
324
+
325
+ print("...add results from extract functions to global dataframe")
326
+ programme = pd.concat([programme, programm], ignore_index=True)
327
+ outcomes = pd.concat([outcomes, outcome], ignore_index=True)
328
+ outputs = pd.concat([outputs, output], ignore_index=True)
329
+
330
+ # write results to csv file
331
+ programme.to_csv("/Users/andreeabodea/programme.csv", sep="|", index=False, decimal=",")
332
+ outcomes.to_csv("/Users/andreeabodea/module_outcomes.csv", sep="|", index=False, decimal=",")
333
+ outputs.to_csv("/Users/andreeabodea/module_outputs.csv", sep="|", index=False, decimal=",")
334
+
335
+ print(programme)
336
+ print(outcomes)
337
+ print(outputs)
338
+
339
+
340
+ """