|
import subprocess |
|
import sys |
|
import re |
|
import pandas as pd |
|
|
|
# eyecite is required by the citation helpers in this module.  When it is
# missing, install it with the running interpreter's pip and import it again.
try:
    import eyecite
except ImportError:
    import importlib

    # check_call raises CalledProcessError on a failed install; letting it
    # propagate reports the real cause instead of a follow-up ImportError.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
    # A package installed while the interpreter is running may be invisible
    # to the import system until its finder caches are invalidated.
    importlib.invalidate_caches()
    import eyecite

from eyecite import find, clean
|
|
|
|
|
|
|
def full_case(citation, text):
    """Strip a full case citation and its related fragments from ``text``.

    Every removal is a substitution over the whole of ``text``, so each
    fragment is deleted wherever it occurs, not only next to this citation.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``year``, ``pin_cite``, ``parenthetical``,
            ``plaintiff``, ``defendant``, ``court`` and ``extra``.
        text: The text to clean.

    Returns:
        ``text`` with the citation fragments removed.
    """
    meta = citation.metadata
    cleaned = text.replace(citation.matched_text(), "")

    if meta.year:
        # Remove any "(... <year>)" group, i.e. a parenthetical ending in the year.
        cleaned = re.sub(r"\([^)]*{}\)".format(meta.year), "", cleaned)
    if meta.pin_cite:
        cleaned = cleaned.replace(meta.pin_cite, "")
    if meta.parenthetical:
        cleaned = cleaned.replace(f"({meta.parenthetical})", "")
    if meta.plaintiff:
        case_name = f"{meta.plaintiff} v. {meta.defendant}"
        cleaned = cleaned.replace(case_name, "")

    # "<court> <year>" built from whichever of the two parts is present.
    court_and_year = " ".join(filter(None, (meta.court, meta.year)))
    if court_and_year:
        cleaned = cleaned.replace(court_and_year, "")
    if meta.extra:
        cleaned = cleaned.replace(meta.extra, "")
    return cleaned
|
|
|
|
|
def supra_case(citation, text):
    """Strip a supra citation and its related fragments from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``pin_cite``, ``parenthetical`` and
            ``antecedent_guess``.
        text: The text to clean.

    Returns:
        ``text`` with every occurrence of the collected fragments removed.
    """
    meta = citation.metadata

    # Collect the fragments in removal order, then delete them one by one.
    fragments = [citation.matched_text()]
    if meta.pin_cite:
        fragments.append(meta.pin_cite)
    if meta.parenthetical:
        fragments.append(f"({meta.parenthetical})")
    if meta.antecedent_guess:
        fragments.append(meta.antecedent_guess)

    for fragment in fragments:
        text = text.replace(fragment, "")
    return text
|
|
|
|
|
def short_case(citation, text):
    """Strip a short-form case citation and its related fragments from ``text``.

    Every removal is a substitution over the whole of ``text``, so each
    fragment is deleted wherever it occurs.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical``, ``year`` and
            ``antecedent_guess``.
        text: The text to clean.

    Returns:
        ``text`` with the citation fragments removed.
    """
    meta = citation.metadata
    text = text.replace(citation.matched_text(), "")
    if meta.parenthetical:
        text = text.replace(f"({meta.parenthetical})", "")
    if meta.year:
        # The year pattern used to be built and then discarded; apply it so
        # "(... <year>)" groups are removed the same way full_case and
        # full_journal_case remove them.
        pattern = r"\([^)]*{}\)".format(meta.year)
        text = re.sub(pattern, "", text)
    if meta.antecedent_guess:
        text = text.replace(meta.antecedent_guess, "")
    return text
|
|
|
|
|
def id_case(citation, text):
    """Strip an "Id." style citation and its related fragments from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical`` and ``pin_cite``.
        text: The text to clean.

    Returns:
        ``text`` with the matched citation, the bracketed parenthetical and
        the pin cite removed wherever they occur.
    """
    meta = citation.metadata
    remainder = text.replace(citation.matched_text(), "")

    # Falsy entries (missing metadata) are skipped; order matches removal order.
    candidates = (
        f"({meta.parenthetical})" if meta.parenthetical else None,
        meta.pin_cite,
    )
    for fragment in candidates:
        if fragment:
            remainder = remainder.replace(fragment, "")
    return remainder
|
|
|
|
|
def unknown_case(citation, text):
    """Strip an unrecognised citation and its parenthetical from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical``.
        text: The text to clean.

    Returns:
        ``text`` without the matched citation and, when the metadata has a
        parenthetical, without that parenthetical in round brackets.
    """
    remainder = text.replace(citation.matched_text(), "")
    note = citation.metadata.parenthetical
    return remainder.replace(f"({note})", "") if note else remainder
|
|
|
|
|
def full_law_case(citation, text):
    """Strip a full law citation and its parenthetical from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical``.
        text: The text to clean.

    Returns:
        ``text`` with every occurrence of the matched citation removed and,
        if present, the parenthetical in round brackets removed as well.
    """
    without_citation = text.replace(citation.matched_text(), "")
    parenthetical = citation.metadata.parenthetical
    if not parenthetical:
        return without_citation
    return without_citation.replace(f"({parenthetical})", "")
|
|
|
|
|
def full_journal_case(citation, text):
    """Strip a full journal citation and its related fragments from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``year``, ``pin_cite`` and ``parenthetical``.
        text: The text to clean.

    Returns:
        ``text`` with the citation fragments removed wherever they occur.
    """
    meta = citation.metadata
    result = text.replace(citation.matched_text(), "")

    if meta.year:
        # Remove any "(... <year>)" group, i.e. a parenthetical ending in the year.
        year_group = r"\([^)]*{}\)".format(meta.year)
        result = re.sub(year_group, "", result)

    if meta.pin_cite:
        result = result.replace(meta.pin_cite, "")

    if meta.parenthetical:
        bracketed = f"({meta.parenthetical})"
        result = result.replace(bracketed, "")

    return result
|
|
|
|
|
def all_commas(text: str) -> str:
    """Collapse every run of consecutive commas in ``text`` into one comma."""
    collapsed = re.sub(pattern=r"\,+", repl=",", string=text)
    return collapsed
|
|
|
|
|
def all_dots(text: str) -> str:
    """Collapse every run of consecutive periods in ``text`` into one period."""
    collapsed = re.sub(pattern=r"\.+", repl=".", string=text)
    return collapsed
|
|
|
|
|
# Dispatch table: citation class name -> function that strips that kind of
# citation from a text.  Each handler is called as handler(citation, text)
# and returns the cleaned text.
functions_dict = {
    "FullCaseCitation": full_case,
    "SupraCitation": supra_case,
    "ShortCaseCitation": short_case,
    "IdCitation": id_case,
    "UnknownCitation": unknown_case,
    "FullLawCitation": full_law_case,
    "FullJournalCitation": full_journal_case,
}
|
|
|
|
|
|
|
def remove_citations(input_text):
    """Return ``input_text`` with the citations found by eyecite removed.

    The text is first cleaned with eyecite's ``clean_text``, then every
    citation returned by ``find.get_citations`` is handed to the handler
    registered for its class in ``functions_dict``.  Afterwards the text is
    cleaned again and two regex passes remove leftover residue.

    Args:
        input_text: Raw text to process (the first clean pass includes the
            "html" step).

    Returns:
        The cleaned text with citation fragments removed.
    """
    plain_text = clean.clean_text(
        input_text, ["html", "inline_whitespace", "underscores"]
    )

    for citation in find.get_citations(plain_text):
        # A citation class without an entry in functions_dict used to raise
        # KeyError and abort the whole document; fall back to the generic
        # handler, which removes the matched text and its parenthetical.
        handler = functions_dict.get(type(citation).__name__, unknown_case)
        plain_text = handler(citation, plain_text)

    # Collapse the whitespace and the runs of commas/periods left behind by
    # the removals above.
    plain_text = clean.clean_text(
        plain_text,
        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
    )
    plain_text = clean.clean_text(plain_text, ["inline_whitespace", "all_whitespace"])

    # Drop an optional "*", optional digits, optional whitespace and a run of
    # capital "I" characters that ends a line.
    # NOTE(review): presumably page markers / roman-numeral headings - confirm.
    plain_text = re.sub(r"\*?\d*\s*I+\n", "", plain_text)

    # Delete each comma or period that directly follows a whitespace
    # character, together with that whitespace character.
    plain_text = re.sub(r"\s[,.]", "", plain_text)
    return plain_text
|
|
|
|
|
def split_text(text, chunk_size=430, stride=420):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    A new chunk starts every ``stride`` words and holds up to ``chunk_size``
    words, so with the defaults consecutive chunks share 10 words.  The last
    chunk may be shorter than ``chunk_size``.

    Args:
        text: The text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk.
        stride: Number of words between the starts of consecutive chunks.

    Returns:
        A list of chunks, each the chunk's words joined by single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is smaller than 1.
    """
    if chunk_size < 1 or stride < 1:
        raise ValueError("chunk_size and stride must be positive integers")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]
|
|
|
|
|
|
|
def chunk_text_to_paragraphs(text):
    """Split ``text`` on newlines and strip whitespace around each piece.

    Blank lines are kept and appear as empty strings in the result.

    Args:
        text: The text to split.

    Returns:
        A list with one stripped string per newline-separated piece.
    """
    return list(map(str.strip, text.split("\n")))
|
|
|
|
|
|
|
def split_data(data, id2label, label2id):
    """Explode each opinion in ``data`` into one row per text chunk.

    Each row's ``clean_text`` is split with ``split_text``; chunks shorter
    than 1000 characters are skipped.  Every kept chunk becomes one output
    row carrying the source row's author, category, case name and URL.

    Args:
        data: DataFrame with the columns ``clean_text``, ``author_name``,
            ``category``, ``case_name`` and ``absolute_url``.
        id2label: Unused; kept so existing callers keep working.
        label2id: Mapping from ``author_name`` to the label stored in the
            ``label`` column.

    Returns:
        A DataFrame with the columns ``author_name``, ``label``,
        ``category``, ``case_name``, ``url`` and ``text``.  The row index
        is not reset, so index labels repeat.

    Raises:
        KeyError: If a row's ``author_name`` is missing from ``label2id``.
    """
    columns = ["author_name", "label", "category", "case_name", "url", "text"]
    empty = pd.DataFrame({name: [] for name in columns})
    empty["label"] = empty["label"].astype(int)

    # Collect the per-chunk frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies every earlier row on each pass.
    frames = [empty]
    for _, row in data.iterrows():
        for chunk in split_text(row["clean_text"]):
            if len(chunk) < 1000:
                continue
            frames.append(
                pd.DataFrame(
                    {
                        "author_name": row["author_name"],
                        "label": [label2id[row["author_name"]]],
                        "category": row["category"],
                        "case_name": row["case_name"],
                        "url": [row["absolute_url"]],
                        "text": [chunk],
                    }
                )
            )

    if len(frames) == 1:
        return empty
    return pd.concat(frames)
|
|
|
|
|
def chunk_data(data):
    """Split the text ``data`` into chunks and return them as a DataFrame.

    Args:
        data: The text to split; it is passed to ``split_text`` unchanged.

    Returns:
        A DataFrame with one row per chunk: the chunk in ``text`` and the
        constant 200 in ``label``.  When there are no chunks, an empty
        DataFrame with only a ``text`` column is returned.  The row index
        is not reset, so index labels repeat.
    """
    # NOTE(review): 200 looks like a placeholder label - confirm its meaning.
    frames = [pd.DataFrame({"text": []})]
    # Build every per-chunk frame first and concatenate once; calling
    # pd.concat inside the loop re-copies every earlier row on each pass.
    frames.extend(
        pd.DataFrame({"label": [200], "text": [chunk]}) for chunk in split_text(data)
    )

    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames)
|
|