raminass committed
Commit 8ddc567 · 1 Parent(s): 7268745

Upload folder using huggingface_hub

README.md CHANGED

@@ -1,12 +1,6 @@
 ---
-title: SCOTUS
-emoji: 😻
-colorFrom: gray
-colorTo: yellow
-sdk: gradio
-sdk_version: 3.45.1
+title: scotus
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 3.45.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
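
Applied to the file, the hunk above leaves README.md holding only the Space config front matter; the free-text pointer to the Spaces config reference is dropped:

    ---
    title: scotus
    app_file: app.py
    sdk: gradio
    sdk_version: 3.45.2
    ---
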
app.py CHANGED

@@ -1,36 +1,47 @@
 import gradio as gr
-from transformers import pipeline, TextClassificationPipeline
+from transformers import pipeline
 from utils import *
 
 pipe = pipeline(model="raminass/scotus-v10", top_k=13, padding=True, truncation=True)
 
-def average_text(text, model):
-    # result = classifier(df_train[(df_train.case_name==case) & (df_train.category=='per_curiam')]['clean_text'].to_list())
-    result = model(text)
-    pred = {}
-    for c in result:
-        for d in c:
-            if d['label'] not in pred:
-                pred[d['label']] = [round(d['score'],2)]
-            else:
-                pred[d['label']].append(round(d['score'],2))
-    sumary = {k:round(sum(v)/len(v),2) for k,v in pred.items()}
-    result = [[{k: round(v, 2) if k=='score' else v for k, v in dct.items()} for dct in lst ] for lst in result]
-    return dict(sorted(sumary.items(), key=lambda x: x[1],reverse=True)), result
+max_textboxes = 100
 
+
+# https://www.gradio.app/guides/controlling-layout
 def greet(opinion):
-    result = average_text(chunk_data(remove_citations(opinion))['text'].to_list(),pipe)
-    # print(f"average prediction:")
-    # display(result[0])
-    # print(f"paragraph prediction:")
-    # display(result[1])
-    return result[0]
+    chunks = chunk_data(remove_citations(opinion))["text"].to_list()
+    result = average_text(chunks, pipe)
+    k = len(chunks)
+    wrt_boxes = []
+    for i in range(k):
+        wrt_boxes.append(gr.Textbox(chunks[i], visible=True))
+        wrt_boxes.append(gr.Label(value=result[1][i], visible=True))
+    return (
+        [result[0]]
+        + wrt_boxes
+        + [gr.Textbox(visible=False), gr.Label(visible=False)] * (max_textboxes - k)
+    )
+
 
 with gr.Blocks() as demo:
     opinion = gr.Textbox(label="Opinion")
-    output = gr.Textbox(label="Result")
+    op_level = gr.outputs.Label(num_top_classes=13, label="Overall")
     greet_btn = gr.Button("Predict")
-    greet_btn.click(fn=greet, inputs=opinion, outputs=output, api_name="SCOTUS")
+    textboxes = []
+    for i in range(max_textboxes):
+        t = gr.Textbox(f"Textbox {i}", visible=False, label=f"Paragraph {i+1} Text")
+        par_level = gr.Label(
+            num_top_classes=5, label=f"Paragraph {i+1} Prediction", visible=False
+        )
+        textboxes.append(t)
+        textboxes.append(par_level)
+
+    greet_btn.click(
+        fn=greet,
+        inputs=opinion,
+        outputs=[op_level] + textboxes,
+    )
+
 
 if __name__ == "__main__":
     demo.launch()
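
The rewritten app follows the fixed-pool pattern from the Gradio guide linked in the code: pre-build max_textboxes hidden Textbox/Label pairs at build time, then have the handler return one update per registered output, toggling visibility and padding the tail so the list length always matches. A minimal, self-contained sketch of that pattern (smaller pool, gr.update rather than component constructors; the names here are illustrative, not taken from the Space):

    import gradio as gr

    MAX_SLOTS = 5  # the Space uses max_textboxes = 100

    def show_words(text):
        # Exactly MAX_SLOTS updates: the first k slots become visible, the rest hide.
        words = text.split()
        k = min(len(words), MAX_SLOTS)
        updates = [gr.update(value=words[i], visible=True) for i in range(k)]
        updates += [gr.update(visible=False)] * (MAX_SLOTS - k)
        return updates

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Input")
        btn = gr.Button("Split")
        slots = [gr.Textbox(visible=False, label=f"Slot {i+1}") for i in range(MAX_SLOTS)]
        btn.click(fn=show_words, inputs=inp, outputs=slots)

    if __name__ == "__main__":
        demo.launch()

One caveat in greet as committed: if an opinion yields more than max_textboxes chunks, max_textboxes - k goes negative, the padding list is empty, and the returned list would be longer than the registered outputs.
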
utils/.DS_Store ADDED
Binary file (6.15 kB)
 
utils/__init__.py CHANGED

@@ -3,14 +3,39 @@ import pandas as pd
 import numpy as np
 import json
 
-with open('utils/id2label.json', 'r') as j:
-    id2label = json.loads(j.read())
+with open("utils/id2label.json", "r") as j:
+    id2label = json.loads(j.read())
 
-with open('utils/label2id.json', 'r') as j:
-    label2id = json.loads(j.read())
+with open("utils/label2id.json", "r") as j:
+    label2id = json.loads(j.read())
 
+
-def find_case_by_name(df, name):
-    return display(HTML(df[df['case_name'].str.contains(name)].iloc[:,:-1].to_html(render_links=True, escape=False)))
-
-def head_df(df):
-    return display(HTML(df.iloc[:,:-1].head().to_html(render_links=True, escape=False)))
+def average_text(text, model):
+    # result = classifier(df_train[(df_train.case_name==case) & (df_train.category=='per_curiam')]['clean_text'].to_list())
+    result = model(text)
+    pred = {}
+    for c in result:
+        for d in c:
+            if d["label"] not in pred:
+                pred[d["label"]] = [round(d["score"], 2)]
+            else:
+                pred[d["label"]].append(round(d["score"], 2))
+    sumary = {k: round(sum(v) / len(v), 2) for k, v in pred.items()}
+    result = [{dct["label"]: round(dct["score"], 2) for dct in lst} for lst in result]
+    return dict(sorted(sumary.items(), key=lambda x: x[1], reverse=True)), result
+
+
+# def find_case_by_name(df, name):
+#     return display(
+#         HTML(
+#             df[df["case_name"].str.contains(name)]
+#             .iloc[:, :-1]
+#             .to_html(render_links=True, escape=False)
+#         )
+#     )
+
+
+# def head_df(df):
+#     return display(
+#         HTML(df.iloc[:, :-1].head().to_html(render_links=True, escape=False))
+#     )
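
average_text moves here from app.py and now returns a pair: an opinion-level dict mapping label to mean score (sorted descending) and a per-chunk list of label-to-score dicts, which is the second return value app.py feeds to the paragraph Labels. A hedged sketch of that contract with a stub standing in for the transformers pipeline; the justice names are placeholders, not the model's actual label set, and importing utils assumes the repo root as the working directory so the JSON label maps load:

    from utils import average_text  # assumes utils/id2label.json etc. are present

    def stub_model(chunks):
        # Mimics a top_k text-classification pipeline: one list of
        # {"label", "score"} dicts per input chunk.
        return [
            [{"label": "JusticeA", "score": 0.7}, {"label": "JusticeB", "score": 0.3}]
            for _ in chunks
        ]

    overall, per_chunk = average_text(["chunk one", "chunk two"], stub_model)
    print(overall)    # {'JusticeA': 0.7, 'JusticeB': 0.3}, mean score per label
    print(per_chunk)  # one {label: score} dict per chunk
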
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.54 kB)
 
utils/__pycache__/cleaning.cpython-310.pyc ADDED
Binary file (4.81 kB)
 
utils/cleaning.py CHANGED

@@ -1,123 +1,148 @@
 import subprocess
 import sys
 import re
-import pandas as pd
+import pandas as pd
 
 try:
     import eyecite
 except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
 finally:
     from eyecite import find, clean
 
+
 # @title
 def full_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.year:
-        pattern = r'\([^)]*{}\)'.format(citation.metadata.year) # Matches any word that ends with "year"
-        text = re.sub(pattern, '', text)
+        pattern = r"\([^)]*{}\)".format(
+            citation.metadata.year
+        )  # Matches any word that ends with "year"
+        text = re.sub(pattern, "", text)
     if citation.metadata.pin_cite:
-        text = text.replace(citation.metadata.pin_cite, "")
+        text = text.replace(citation.metadata.pin_cite, "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.plaintiff:
-        text = text.replace(f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", "")
-    publisher_date = " ".join(i for i in (citation.metadata.court, citation.metadata.year) if i)
+        text = text.replace(
+            f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", ""
+        )
+    publisher_date = " ".join(
+        i for i in (citation.metadata.court, citation.metadata.year) if i
+    )
     if publisher_date:
-        text = text.replace(f"{publisher_date}", "")
+        text = text.replace(f"{publisher_date}", "")
     if citation.metadata.extra:
-        text = text.replace(citation.metadata.extra, "")
+        text = text.replace(citation.metadata.extra, "")
     return text
 
+
 def supra_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.pin_cite:
-        text = text.replace(citation.metadata.pin_cite, "")
+        text = text.replace(citation.metadata.pin_cite, "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.antecedent_guess:
-        text = text.replace(citation.metadata.antecedent_guess, "")
+        text = text.replace(citation.metadata.antecedent_guess, "")
     return text
 
+
 def short_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.year:
-        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
+        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
     if citation.metadata.antecedent_guess:
-        text = text.replace(citation.metadata.antecedent_guess, "")
+        text = text.replace(citation.metadata.antecedent_guess, "")
     return text
 
+
 def id_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     if citation.metadata.pin_cite:
-        text = text.replace(citation.metadata.pin_cite, "")
+        text = text.replace(citation.metadata.pin_cite, "")
     return text
 
+
 def unknown_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 
+
 def full_law_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 
+
 def full_journal_case(citation, text):
     text = text.replace(citation.matched_text(), "")
     if citation.metadata.year:
-        pattern = r'\([^)]*{}\)'.format(citation.metadata.year) # Matches any word that ends with "year"
-        text = re.sub(pattern, '', text)
+        pattern = r"\([^)]*{}\)".format(
+            citation.metadata.year
+        )  # Matches any word that ends with "year"
+        text = re.sub(pattern, "", text)
     if citation.metadata.pin_cite:
-        text = text.replace(citation.metadata.pin_cite, "")
+        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
-        text = text.replace(f"({citation.metadata.parenthetical})", "")
+        text = text.replace(f"({citation.metadata.parenthetical})", "")
     return text
 
+
 def all_commas(text: str) -> str:
     return re.sub(r"\,+", ",", text)
 
+
 def all_dots(text: str) -> str:
     return re.sub(r"\.+", ".", text)
 
+
 functions_dict = {
-    'FullCaseCitation': full_case,
-    'SupraCitation': supra_case,
-    'ShortCaseCitation': short_case,
-    'IdCitation': id_case,
-    'UnknownCitation': unknown_case,
-    'FullLawCitation': full_law_case,
-    'FullJournalCitation': full_journal_case,
+    "FullCaseCitation": full_case,
+    "SupraCitation": supra_case,
+    "ShortCaseCitation": short_case,
+    "IdCitation": id_case,
+    "UnknownCitation": unknown_case,
+    "FullLawCitation": full_law_case,
+    "FullJournalCitation": full_journal_case,
 }
 
+
 # @title
 def remove_citations(input_text):
-    #clean text
-    plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
-    #remove citations
-    found_citations = find.get_citations(plain_text)
-    for citation in found_citations:
-        plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
-    #clean text
-    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores','all_whitespace', all_commas, all_dots])
-    plain_text = clean.clean_text(plain_text, ['inline_whitespace','all_whitespace'])
-    pattern = r"\*?\d*\s*I+\n"
-    plain_text = re.sub(pattern, '', plain_text)
-    pattern = r"\s[,.]"
-    plain_text = re.sub(pattern, '', plain_text)
-    return plain_text
+    # clean text
+    plain_text = clean.clean_text(
+        input_text, ["html", "inline_whitespace", "underscores"]
+    )
+    # remove citations
+    found_citations = find.get_citations(plain_text)
+    for citation in found_citations:
+        plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
+    # clean text
+    plain_text = clean.clean_text(
+        plain_text,
+        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
+    )
+    plain_text = clean.clean_text(plain_text, ["inline_whitespace", "all_whitespace"])
+    pattern = r"\*?\d*\s*I+\n"
+    plain_text = re.sub(pattern, "", plain_text)
+    pattern = r"\s[,.]"
+    plain_text = re.sub(pattern, "", plain_text)
+    return plain_text
+
 
 def split_text(text):
     words = text.split()
     chunks = []
     for i in range(0, len(words), 420):
-        chunks.append(' '.join(words[i:i+430]))
+        chunks.append(" ".join(words[i : i + 430]))
     return chunks
 
 
@@ -130,37 +155,46 @@ def chunk_text_to_paragraphs(text):
 
     return paragraphs
 
+
 # @title
 def split_data(data, id2label, label2id):
-
-    data_dict = {'author_name': [],
-                 'label': [],
-                 'category': [],
-                 'case_name': [],
-                 'url': [],
-                 'text': []}
-    opinions_split = pd.DataFrame(data_dict)
-    opinions_split['label'] = opinions_split['label'].astype(int)
-    for index, row in data.iterrows():
-        # chunks = chunk_text_to_paragraphs(row['text'])
-        chunks = split_text(row['clean_text'])
-        for chunk in chunks:
-            if len(chunk)<1000:
-                continue
-            tmp = pd.DataFrame({'author_name': row['author_name'],'label': [label2id[row['author_name']]],
-                                'category': row['category'],'case_name': row['case_name'],
-                                'url': [row['absolute_url']], 'text': [chunk]})
-            opinions_split = pd.concat([opinions_split, tmp])
-    return opinions_split
+    data_dict = {
+        "author_name": [],
+        "label": [],
+        "category": [],
+        "case_name": [],
+        "url": [],
+        "text": [],
+    }
+    opinions_split = pd.DataFrame(data_dict)
+    opinions_split["label"] = opinions_split["label"].astype(int)
+    for index, row in data.iterrows():
+        # chunks = chunk_text_to_paragraphs(row['text'])
+        chunks = split_text(row["clean_text"])
+        for chunk in chunks:
+            if len(chunk) < 1000:
+                continue
+            tmp = pd.DataFrame(
+                {
+                    "author_name": row["author_name"],
+                    "label": [label2id[row["author_name"]]],
+                    "category": row["category"],
+                    "case_name": row["case_name"],
+                    "url": [row["absolute_url"]],
+                    "text": [chunk],
+                }
+            )
+            opinions_split = pd.concat([opinions_split, tmp])
+    return opinions_split
 
+
 def chunk_data(data):
-
-    data_dict = {'text': []}
-    opinions_split = pd.DataFrame(data_dict)
-    chunks = split_text(data)
-    for chunk in chunks:
-        if len(chunk)<1000:
-            continue
-        tmp = pd.DataFrame({'label': [200],'text': [chunk]})
-        opinions_split = pd.concat([opinions_split, tmp])
-    return opinions_split
+    data_dict = {"text": []}
+    opinions_split = pd.DataFrame(data_dict)
+    chunks = split_text(data)
+    for chunk in chunks:
+        if len(chunk) < 1000:
+            continue
+        tmp = pd.DataFrame({"label": [200], "text": [chunk]})
+        opinions_split = pd.concat([opinions_split, tmp])
+    return opinions_split
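
Two behaviours worth noting when reading this file: split_text steps through the words 420 at a time but slices i : i + 430, so consecutive chunks overlap by 10 words, and chunk_data then drops any chunk shorter than 1000 characters. Also, short_case builds a year pattern but never applies it with re.sub, both before and after this commit, so the reformatting is behaviour-preserving there. A small sanity check of the chunking (assumes the repo root as the working directory; importing utils.cleaning will pip-install eyecite if it is missing):

    from utils.cleaning import split_text

    words = [f"w{i}" for i in range(900)]
    chunks = split_text(" ".join(words))

    print(len(chunks))             # 3 windows, starting at words 0, 420 and 840
    print(len(chunks[0].split()))  # 430: each window takes up to 430 words
    print(chunks[0].split()[-10] == chunks[1].split()[0])  # True: 10-word overlap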