# LLMReferenceLetterBias / agentic_classifier.py
# emmatliu's picture
# Upload 6 files
# a269338 verified
# raw
# history blame
# 2.79 kB
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
def run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping, is_sentencelevel=True):
    """Run ``classifier`` over the ``INPUT`` column of ``df``, one result per row.

    Args:
        df: DataFrame holding the texts to classify.
        INPUT: Name of the column in ``df`` that contains the text.
        TASK: Key selecting the entry of ``label_mapping`` (and, in
            sentence-level mode, of ``task_label_mapping``) to use.
        classifier: Callable that takes a string and returns a sequence whose
            first item is a mapping with ``"label"`` and ``"score"`` keys.
        label_mapping: ``{TASK: {class_id: label_name}}``.
        rev_map: Maps the classifier's ``"label"`` value to the class id used
            as a key of ``label_mapping[TASK]``.
        task_label_mapping: ``{TASK: (label_tracked, other_label)}``. Only
            read in sentence-level mode.
        is_sentencelevel: If true, split each text on ``"."`` and classify the
            fragments one by one; otherwise classify the whole text at once.

    Returns:
        A list with one tuple per row, in row order.

        * Sentence-level mode: ``(ratio, confidence)`` where ``ratio`` is
          ``count(label_tracked) / (count(label_tracked) + count(other_label))``
          over the classified fragments and ``confidence`` is the mean
          classifier score. Either value is ``nan`` when it is undefined
          (no fragment was classified, or neither label occurred).
        * Whole-text mode: ``(label_name, score)`` for the text.
    """
    inferences = []
    # Iterate the column directly; tqdm still picks up the total from len().
    for text in tqdm(df[INPUT], ascii=True):
        if not is_sentencelevel:
            output = classifier(text)[0]
            inferences.append(
                (label_mapping[TASK][rev_map[output["label"]]], output["score"])
            )
            continue

        labels = []
        scores = []
        for sentence in text.split("."):
            # str.split(".") leaves an empty fragment after a trailing period
            # (and between consecutive periods); classifying a bare "." would
            # only add noise to the ratio and the mean score.
            if not sentence.strip():
                continue
            # Over-long fragments are dropped, presumably to stay within the
            # model's input limit -- TODO confirm the 800-character cutoff.
            if len(sentence) >= 800:
                continue
            output = classifier((sentence + ".").lower())[0]
            labels.append(label_mapping[TASK][rev_map[output["label"]]])
            scores.append(output["score"])

        counts = Counter(labels)
        label_tracked, other_label = task_label_mapping[TASK]
        tracked_total = counts[label_tracked] + counts[other_label]
        # Report NaN instead of raising ZeroDivisionError so that one
        # unclassifiable row does not abort the whole run.
        ratio = counts[label_tracked] / tracked_total if tracked_total else float("nan")
        confidence = sum(scores) / len(scores) if scores else float("nan")
        inferences.append((ratio, confidence))
    return inferences
# TODO: remove this random-value placeholder once the model checkpoint is fixed.
def compute_agentic_communal(df, hallucination=False):
    """Placeholder that fills ``per_ac`` and ``con_ac`` with random numbers.

    No classifier is run: each of the two columns receives one
    ``np.random.rand`` draw per row, ``per_ac`` first and ``con_ac`` second.

    Args:
        df: DataFrame to annotate. It is modified in place.
        hallucination: Accepted for interface compatibility; currently unused.

    Returns:
        The same ``df`` object, with the ``per_ac`` and ``con_ac`` columns
        added (or overwritten).
    """
    row_count = len(df)
    for column in ("per_ac", "con_ac"):
        df[column] = np.random.rand(row_count)
    return df
# NOTE: the real implementation is kept below, commented out, until the model checkpoint is clarified.
# def compute_agentic_communal(df,hallucination=False):
# model_path = "./checkpoints/checkpoint-48" #
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
# rev_map = {v: k for k, v in model.config.id2label.items()}
# if hallucination:
# INPUT = "hallucination"
# else:
# INPUT = "TEXT" # need to tell users what this should be called TODO: change this to the correct column name
# TASK = "ac_classifier"
# task_label_mapping = {
# # Track percentage agentic / percentage agentic + percentage communal
# "ac_classifier": ("agentic", "communal"),
# }
# label_mapping = {
# "ac_classifier": {
# 0: "communal",
# 1: "agentic",
# }
# }
# inferences = run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping)
# df["per_ac"] = [i[0] for i in inferences]
# df["con_ac"] = [i[1] for i in inferences]
# return df