LLMReferenceLetterBias / ls_classifier.py
emmatliu's picture
Upload 6 files
a269338 verified
# Run example: `python3 classifier.py --task formality`
import pandas as pd
from transformers import pipeline
from collections import Counter
# (label tracked, other labels)
task_label_mapping = {
"sentiment": ("POSITIVE", "NEGATIVE"),
# "sentiment": ("positive", "neutral", "negative"),
"formality": ("formal", "informal"),
}
# Define a function to perform sentiment analysis on each row of the dataframe
def predict(text, classifier, task, output_type="csv", is_sentencelevel=True):
if is_sentencelevel:
labels = []
scores = []
text = text
sentences = text.split(".")
for sentence in sentences:
if len(sentence) >= 800:
continue
result = classifier((sentence + "."))[0]
labels.append(result["label"])
scores.append(result["score"])
confidence = sum(scores) / len(scores)
if output_type == "csv":
mapping = Counter(labels)
label_tracked, other_label = task_label_mapping[task]
return (
mapping[label_tracked]
/ (mapping[label_tracked] + mapping[other_label]),
confidence,
)
# Get the most common word
return max(set(labels), key=labels.count), confidence
result = classifier(text)[0]
return result["label"], result["score"]
def compute_sentiment_and_formality(df,hallucination=False):
if hallucination:
INPUT = 'hallucination'
else:
INPUT = 'text'
# https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you
classifier_sentiment = pipeline("sentiment-analysis")
# https://huggingface.co/s-nlp/xlmr_formality_classifier
classifier_formality = pipeline(
"text-classification", "s-nlp/roberta-base-formality-ranker"
)
# Apply the sentiment analysis function to each row of the dataframe
sentiment_outputs = None
formality_outputs = None
formality_outputs = df[INPUT].apply(
(lambda x: predict(x, classifier_formality, "formality"))
)
sentiment_outputs = df[INPUT].apply(
(lambda x: predict(x, classifier_sentiment, "sentiment"))
)
if sentiment_outputs is not None:
df["per_pos"] = [s[0] for s in sentiment_outputs]
df["con_pos"] = [s[1] for s in sentiment_outputs]
if formality_outputs is not None:
df["per_for"] = [s[0] for s in formality_outputs]
df["con_for"] = [s[1] for s in formality_outputs]
return df