File size: 2,412 Bytes
a269338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694a2f7
 
 
 
 
a269338
694a2f7
 
 
 
a269338
694a2f7
 
 
 
 
 
 
 
 
 
 
a269338
694a2f7
 
 
a269338
694a2f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

def run_inference(df, INPUT, TASK, classifier, label_mapping, rev_map, task_label_mapping, is_sentencelevel=True):
    """Classify the ``INPUT`` column of ``df`` and return one result per row.

    Args:
        df: DataFrame holding the text to classify.
        INPUT: Name of the column in ``df`` that holds the text.
        TASK: Key used to look up entries in ``label_mapping`` and
            ``task_label_mapping``.
        classifier: Callable taking a string and returning a sequence whose
            first element is a mapping with ``"label"`` and ``"score"`` keys.
        label_mapping: ``{TASK: {key: label_name}}``; the inner key is
            whatever ``rev_map`` yields for a classifier label.
        rev_map: Maps the classifier's ``"label"`` value to the inner key of
            ``label_mapping[TASK]``.
        task_label_mapping: ``{TASK: (label_tracked, other_label)}``; only
            read in sentence-level mode.
        is_sentencelevel: When true, split each text on ``"."`` and classify
            the fragments separately; otherwise classify the whole text once.

    Returns:
        A list with one tuple per row of ``df``:

        * sentence-level: ``(ratio, confidence)`` where ``ratio`` is
          ``count(label_tracked) / (count(label_tracked) + count(other_label))``
          over the classified fragments and ``confidence`` is the mean
          classifier score. Either value is ``nan`` when it has nothing to
          be computed from (e.g. every fragment was skipped as too long).
        * otherwise: ``(label_name, score)`` for the whole text.
    """
    inferences = []
    # Iterate the column directly instead of materialising each row via iloc.
    for text in tqdm(df[INPUT], total=len(df), ascii=True):
        if not is_sentencelevel:
            output = classifier(text)[0]
            inferences.append(
                (label_mapping[TASK][rev_map[output["label"]]], output["score"])
            )
            continue

        labels = []
        scores = []
        # NOTE(review): blank fragments (e.g. the one after a trailing ".")
        # are still classified as "." to keep results identical to earlier
        # runs -- confirm whether they should be skipped instead.
        for sentence in text.split("."):
            # Over-long fragments are skipped rather than sent to the model.
            if len(sentence) >= 800:
                continue
            output = classifier((sentence + ".").lower())[0]
            labels.append(label_mapping[TASK][rev_map[output["label"]]])
            scores.append(output["score"])

        label_tracked, other_label = task_label_mapping[TASK]
        counts = Counter(labels)
        denominator = counts[label_tracked] + counts[other_label]

        # A row can end up with no scored fragment (all fragments too long);
        # report NaN instead of raising ZeroDivisionError and losing the run.
        confidence = sum(scores) / len(scores) if scores else float("nan")
        ratio = counts[label_tracked] / denominator if denominator else float("nan")
        inferences.append((ratio, confidence))

    return inferences

def compute_agentic_communal(df, hallucination=False):
    """Score each row of ``df`` with the language-agency classifier.

    Loads the ``emmatliu/language-agency-classifier`` tokenizer and model,
    wraps them in a text-classification pipeline and runs ``run_inference``
    (in its default sentence-level mode) over one text column of ``df``.

    Args:
        df: DataFrame to score. It is modified in place.
        hallucination: When true the ``"hallucination"`` column is scored,
            otherwise the ``"text"`` column.

    Returns:
        The same ``df`` with two added columns: ``"per_ac"`` (first element
        of each inference tuple) and ``"con_ac"`` (second element).
    """
    checkpoint = "emmatliu/language-agency-classifier"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    # Invert id2label so a predicted label name can be turned back into its id.
    rev_map = {name: idx for idx, name in model.config.id2label.items()}

    input_column = "hallucination" if hallucination else "text"
    task = "ac_classifier"

    # Class id -> human-readable label for this task.
    label_mapping = {
        "ac_classifier": {
            0: "communal",
            1: "agentic",
        }
    }
    # Track percentage agentic / percentage agentic + percentage communal
    task_label_mapping = {
        "ac_classifier": ("agentic", "communal"),
    }

    results = run_inference(
        df,
        input_column,
        task,
        classifier,
        label_mapping,
        rev_map,
        task_label_mapping,
    )
    df["per_ac"] = [result[0] for result in results]
    df["con_ac"] = [result[1] for result in results]

    return df