Spaces:

spark-ds549
/

F24-Racist-Deeds

Sleeping

App Files Files Community

F24-Racist-Deeds / modules /last_year /racism_checker_old_pipeline.py

jacob-stein

Migrate Flask backend

97208ad about 2 months ago

raw

history blame

2.57 kB

	from modules.racist_text_query import racist_text_query
	from modules.bigotry_dict import bigotry_dict
	from modules.OCR import tiff_to_ocr
	from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
	from modules.locate import locate
	from modules.pagenum import crop_image
	import os
	import pandas as pd

	def _fail_safe_page_numbers(ocr_text):
	    """Return the lines of *ocr_text* that consist solely of digits."""
	    return [line for line in ocr_text.split("\n") if line.isdigit()]


	def racism_threshold(file_dir: str) -> None:
	    """Scan *file_dir* for TIFF deeds, flag racist ones and write a report.

	    For every file whose name ends in ``.tif`` or ``.tiff`` (matched
	    case-insensitively, in sorted order) the function:

	    1. extracts the text with ``tiff_to_ocr``;
	    2. runs ``racist_chatgpt_analysis`` and ``racist_text_query`` (the
	       latter with ``bigotry_dict``) on that text;
	    3. calls ``locate`` on the text to obtain the three values used for
	       the page-number, date and book-number columns;
	    4. writes a cropped copy named ``cropped_<file name>`` into the
	       ``deed page number`` sub-folder of *file_dir* via ``crop_image``;
	    5. OCRs the cropped copy and collects its digit-only lines as
	       fail-safe page numbers.

	    A file is added to the report when either check in step 2 is truthy;
	    otherwise only a "Not Racist" line is printed. The report is written
	    to ``Racist Deeds.csv`` and ``Racist Deeds.xlsx`` inside *file_dir*
	    with a 1-based index.

	    Args:
	        file_dir: Directory containing the deed images. The cropped-image
	            sub-folder and both report files are created inside it.

	    Returns:
	        None. Results are delivered through the files written to disk and
	        the progress lines printed to stdout.
	    """
	    # Folder that receives the cropped images. exist_ok avoids the
	    # check-then-create race of a separate os.path.exists() test.
	    cropped_images_dir = os.path.join(file_dir, 'deed page number')
	    os.makedirs(cropped_images_dir, exist_ok=True)

	    data = []
	    # os.listdir() returns entries in arbitrary order; sorting keeps the
	    # report rows stable from one run to the next.
	    for images in sorted(os.listdir(file_dir)):
	        # Compare case-insensitively so ".TIF" / ".TIFF" scans are not
	        # silently skipped.
	        if not images.lower().endswith((".tif", ".tiff")):
	            continue

	        image_path = os.path.join(file_dir, images)

	        # Run OCR on the full image and evaluate the text with both checks.
	        text = tiff_to_ocr(image_path)
	        result1 = racist_chatgpt_analysis(text)
	        result2 = racist_text_query(text, bigotry_dict)

	        a, b, c = locate(text)

	        # Crop the image and save it in the dedicated folder.
	        cropped_image_path = os.path.join(cropped_images_dir, "cropped_" + images)
	        crop_image(image_path, cropped_image_path)

	        # The link is emitted as-is; spaces in the path are not
	        # percent-encoded.
	        hyperlink_formula = f'file://{cropped_image_path}'

	        # Fail-safe page number detection on the cropped image.
	        fail_safe_page = _fail_safe_page_numbers(tiff_to_ocr(cropped_image_path))

	        if result1 or result2:
	            print(images, a, b, c)
	            if fail_safe_page:
	                # NOTE(review): this appends the whole fail-safe list as a
	                # single nested element; `extend` may have been intended.
	                # Confirm against consumers of the report before changing.
	                a.append(fail_safe_page)
	            # Assumes locate() returned non-empty sequences for the date
	            # and book number -- TODO confirm.
	            data.append([images, a, b[0], c[0], hyperlink_formula])
	        else:
	            print(images + " : Not Racist")

	    # Include the hyperlink in the DataFrame columns.
	    df = pd.DataFrame(
	        data,
	        columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', "Page Link"],
	    )
	    df.index += 1  # number report rows from 1 instead of 0
	    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
	    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)

	# Script entry point: runs the pipeline on the literal path 'folderpath'
	# (looks like a placeholder to be replaced with a real directory).
	# NOTE(review): the call is unguarded, so it also executes whenever this
	# module is imported; consider wrapping it in `if __name__ == "__main__":`.
	racism_threshold('folderpath')