# SCOTUS / utils/cleaning.py
# NOTE: the following lines were Hugging Face file-viewer page chrome captured
# by the scrape, not Python source — kept here as a comment for provenance:
#   raminass's picture / "Update utils/cleaning.py" / commit 823ce87
#   raw | history | blame — 6.29 kB
import subprocess
import sys
import re
import pandas as pd

# Bootstrap the third-party `eyecite` dependency: try the normal import and,
# if it is missing, install it with pip into the running interpreter.
# NOTE(review): installing a package at import time is a heavy side effect;
# the `finally` clause runs on both paths, so `find` and `clean` are imported
# once the package is available.
try:
    import eyecite
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
finally:
    from eyecite import find, clean
# @title
def full_case(citation, text):
    """Remove a full case citation and its metadata fragments from *text*.

    Strips, in order: the matched citation string, any "(... year)"
    parenthetical, the pin cite, the explanatory parenthetical, the
    "Plaintiff v. Defendant" case name, the "court year" publisher string,
    and any extra metadata text.
    """
    meta = citation.metadata
    result = text.replace(citation.matched_text(), "")
    if meta.year:
        # Drop any parenthetical whose content ends with the citation year.
        result = re.sub(r"\([^)]*{}\)".format(meta.year), "", result)
    if meta.pin_cite:
        result = result.replace(meta.pin_cite, "")
    if meta.parenthetical:
        result = result.replace(f"({meta.parenthetical})", "")
    if meta.plaintiff:
        result = result.replace(f"{meta.plaintiff} v. {meta.defendant}", "")
    # Publisher string is "court year" when both exist, else whichever is set.
    publisher_date = " ".join(part for part in (meta.court, meta.year) if part)
    if publisher_date:
        result = result.replace(publisher_date, "")
    if meta.extra:
        result = result.replace(meta.extra, "")
    return result
def supra_case(citation, text):
    """Remove a "supra" citation and its metadata fragments from *text*.

    Strips the matched citation, then the pin cite, the explanatory
    parenthetical, and the guessed antecedent case name, in that order.
    """
    meta = citation.metadata
    result = text.replace(citation.matched_text(), "")
    for fragment in (
        meta.pin_cite,
        f"({meta.parenthetical})" if meta.parenthetical else None,
        meta.antecedent_guess,
    ):
        if fragment:
            result = result.replace(fragment, "")
    return result
def short_case(citation, text):
    """Remove a short-form case citation and its metadata fragments from *text*.

    Strips the matched citation, the explanatory parenthetical, any
    "(... year)" parenthetical, and the guessed antecedent case name.

    Bug fix: the year-parenthetical regex was built but never applied — the
    `re.sub` call was missing (compare full_case), leaving the pattern as
    dead code. It is now applied, matching the behavior of full_case.
    """
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.year:
        # Matches any parenthetical ending with the citation year, e.g. "(1973)".
        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
        text = re.sub(pattern, "", text)  # was computed but never used
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text
def id_case(citation, text):
    """Remove an "Id." citation, its parenthetical, and pin cite from *text*."""
    meta = citation.metadata
    out = text.replace(citation.matched_text(), "")
    if meta.parenthetical:
        out = out.replace(f"({meta.parenthetical})", "")
    if meta.pin_cite:
        out = out.replace(meta.pin_cite, "")
    return out
def unknown_case(citation, text):
    """Remove an unrecognized citation (and its parenthetical) from *text*."""
    stripped = text.replace(citation.matched_text(), "")
    parenthetical = citation.metadata.parenthetical
    return stripped.replace(f"({parenthetical})", "") if parenthetical else stripped
def full_law_case(citation, text):
    """Remove a statute/law citation (and its parenthetical) from *text*.

    Same stripping logic as unknown_case; kept separate so the dispatch
    table maps each eyecite citation class to its own handler.
    """
    meta = citation.metadata
    remainder = text.replace(citation.matched_text(), "")
    if meta.parenthetical:
        remainder = remainder.replace("({})".format(meta.parenthetical), "")
    return remainder
def full_journal_case(citation, text):
    """Remove a journal-article citation and its metadata from *text*.

    Strips the matched citation, any "(... year)" parenthetical, the pin
    cite, and the explanatory parenthetical.
    """
    meta = citation.metadata
    cleaned = text.replace(citation.matched_text(), "")
    if meta.year:
        # Drop any parenthetical whose content ends with the publication year.
        cleaned = re.sub(r"\([^)]*{}\)".format(meta.year), "", cleaned)
    if meta.pin_cite:
        cleaned = cleaned.replace(meta.pin_cite, "")
    if meta.parenthetical:
        cleaned = cleaned.replace(f"({meta.parenthetical})", "")
    return cleaned
def all_commas(text: str) -> str:
    """Collapse each run of consecutive commas into a single comma."""
    return re.sub(r",{2,}", ",", text)
def all_dots(text: str) -> str:
    """Collapse each run of consecutive periods (e.g. ellipses) into one."""
    return re.sub(r"\.{2,}", ".", text)
# Dispatch table: eyecite citation class name -> handler that strips that
# citation (and its metadata fragments) from the text.  remove_citations()
# looks handlers up by `citation.__class__.__name__`.
functions_dict = {
    "FullCaseCitation": full_case,
    "SupraCitation": supra_case,
    "ShortCaseCitation": short_case,
    "IdCitation": id_case,
    "UnknownCitation": unknown_case,
    "FullLawCitation": full_law_case,
    "FullJournalCitation": full_journal_case,
}
# @title
def remove_citations(input_text):
    """Strip all legal citations found by eyecite from *input_text*.

    Normalizes the text, removes every citation via the type-specific
    handler in `functions_dict`, then cleans up leftover whitespace,
    repeated punctuation, and stray section markers.
    """
    # Normalize the raw text before citation lookup.
    text = clean.clean_text(
        input_text, ["html", "inline_whitespace", "underscores"]
    )
    # Dispatch each detected citation to its class-specific remover.
    for cite in find.get_citations(text):
        handler = functions_dict[cite.__class__.__name__]
        text = handler(cite, text)
    # Collapse whitespace and runs of commas/periods left behind.
    text = clean.clean_text(
        text,
        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
    )
    text = clean.clean_text(text, ["inline_whitespace", "all_whitespace"])
    # Drop roman-numeral section markers such as "*123 II\n".
    text = re.sub(r"\*?\d*\s*I+\n", "", text)
    # NOTE(review): this deletes the punctuation mark as well ("word ." ->
    # "word"), not just the stray space — behavior preserved as-is; confirm
    # whether r"\s+([,.])" -> r"\1" was intended.
    text = re.sub(r"\s[,.]", "", text)
    return text
def split_text(text, chunk_size=430, stride=420):
    """Split whitespace-separated *text* into overlapping word chunks.

    A chunk of up to *chunk_size* words starts every *stride* words. The
    defaults preserve the original hard-coded constants (430-word chunks
    every 420 words), which give consecutive chunks a 10-word overlap —
    presumably intentional context overlap for downstream chunked
    classification; confirm. The constants are now parameters so callers
    can choose other chunkings; existing callers are unaffected.

    Args:
        text: input string; words are defined by str.split() whitespace.
        chunk_size: maximum number of words per chunk.
        stride: number of words between consecutive chunk starts.

    Returns:
        list[str]: space-joined word chunks ([] for empty/whitespace text).
    """
    words = text.split()
    return [
        " ".join(words[start : start + chunk_size])
        for start in range(0, len(words), stride)
    ]
# @title
def chunk_text_to_paragraphs(text):
    """Split *text* on newlines and strip surrounding whitespace per line.

    NOTE(review): despite the original "Split by empty line" comment, this
    splits on every "\\n", so blank lines appear as empty strings in the
    returned list.
    """
    return [line.strip() for line in text.split("\n")]
# @title
def split_data(data, id2label, label2id):
    """Explode each opinion row of *data* into labeled text chunks.

    For every row, `clean_text` is split into word chunks (split_text) and
    each chunk of at least 1000 characters becomes one output row carrying
    the row's metadata plus the integer label from *label2id*.

    Args:
        data: DataFrame with columns author_name, category, case_name,
            absolute_url, clean_text.
        id2label: unused here; kept for interface compatibility with callers.
        label2id: mapping author_name -> integer class label.

    Returns:
        DataFrame with columns author_name, label, category, case_name,
        url, text (empty, with int label dtype, when no chunk qualifies).

    Performance fix: frames are collected in a list and concatenated once,
    instead of the original quadratic pd.concat inside the loop.
    """
    data_dict = {
        "author_name": [],
        "label": [],
        "category": [],
        "case_name": [],
        "url": [],
        "text": [],
    }
    seed = pd.DataFrame(data_dict)
    seed["label"] = seed["label"].astype(int)
    frames = [seed]  # seed keeps column order/dtype identical to the original
    for _, row in data.iterrows():
        for chunk in split_text(row["clean_text"]):
            if len(chunk) < 1000:
                # Skip short tail chunks, as the original did.
                continue
            frames.append(
                pd.DataFrame(
                    {
                        "author_name": [row["author_name"]],
                        "label": [label2id[row["author_name"]]],
                        "category": [row["category"]],
                        "case_name": [row["case_name"]],
                        "url": [row["absolute_url"]],
                        "text": [chunk],
                    }
                )
            )
    return pd.concat(frames)
def chunk_data(data):
    """Split the string *data* into word chunks and wrap them in a DataFrame.

    Each chunk from split_text becomes one row with a placeholder label of
    200 (no author known at inference time) and the chunk text. Unlike
    split_data, short chunks are NOT filtered out (the length check was
    already commented out in the original).

    Performance fix: frames are collected and concatenated once instead of
    the original quadratic pd.concat inside the loop.
    """
    seed = pd.DataFrame({"text": []})  # seed keeps column order identical
    frames = [seed]
    for chunk in split_text(data):
        frames.append(pd.DataFrame({"label": [200], "text": [chunk]}))
    return pd.concat(frames)