File size: 3,808 Bytes
981fd76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from transformers import  Pipeline, AutoTokenizer
import torch
from bs4 import BeautifulSoup
import re

class MyPipeline(Pipeline):
    """Prompt-based sentiment pipeline: asks an LLM whether `search_person`
    is portrayed Positively, Negatively, or Neutrally in `context`.

    Input (to preprocess): dict with keys "context" (HTML or plain text)
    and "search_person" (the name being asked about).
    Output (from postprocess): one of 'Positive', 'Negative', 'Neutral'.
    """

    # Shared by all instances; loaded once at class-definition time.
    # NOTE(review): this performs network I/O on import — consider lazy
    # initialization if import-time downloads become a problem.
    tokenizer = AutoTokenizer.from_pretrained("abacusai/Llama-3-Smaug-8B")

    def _sanitize_parameters(self, **kwargs):
        """Route 'context' and 'search_person' kwargs to preprocess().

        Returns the (preprocess, forward, postprocess) kwargs triple the
        Pipeline base class expects; forward/postprocess take none.
        """
        preprocess_kwargs = {}
        if "context" in kwargs:
            preprocess_kwargs["context"] = kwargs["context"]
        if "search_person" in kwargs:
            preprocess_kwargs["search_person"] = kwargs["search_person"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, **kwargs):
        """Clean the context, build the multiple-choice prompt, tokenize it.

        Strips <a> tags and parenthesised asides from the context, appends
        the fixed question, and — if the combined text exceeds the
        tokenizer's model_max_length — repeatedly drops 500 characters from
        the front of the context until the prompt fits.

        Returns the tokenizer's encoding (PyTorch tensors).
        """
        tokenizer = MyPipeline.tokenizer
        context = inputs["context"]
        search_person = inputs["search_person"]

        def create_prompt(context, search_person):
            def clean_text(text):
                # Remove hyperlinks entirely, then parenthesised asides.
                soup = BeautifulSoup(text, 'html.parser')
                for link in soup.find_all('a'):
                    link.decompose()
                text = re.sub(r'\([^)]*\)', '', soup.get_text())
                return text

            def prepare_question(search_person):
                q = f"""
                Based on the information provided in the context, what is the most likely perception of {search_person}?
                Pick one answer option.

                Answer options:

                Positive: {search_person} is portrayed in a favorable light, and the context suggests that she is a caring and responsible parent.
                Negative: {search_person} is portrayed in an unfavorable light, and the context suggests that she is a neglectful and/or abusive parent.
                Neutral: The context does not provide enough information to make a determination about the character or actions of {search_person}, or it presents a balanced and unbiased view of her.
                """
                return q

            context = clean_text(context)
            question = prepare_question(search_person)
            # BUG FIX: the original trimmed the context only once, which
            # could still leave the prompt over the limit; loop until it
            # fits (or the context is exhausted).
            while context and len(tokenizer.tokenize(context + ' ' + question)) > tokenizer.model_max_length:
                context = context[500:]

            prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n"
            return prompt_template

        prompt = create_prompt(context, search_person)
        return tokenizer(prompt, return_tensors='pt')

    def _forward(self, model_inputs):
        """Generate the model's answer; fall back to CPU on RuntimeError.

        Returns {'out_text': <decoded continuation, prompt stripped>}.
        """
        tokenizer = MyPipeline.tokenizer

        def _to(device):
            # Move both model and inputs to `device`, returning the inputs.
            self.model = self.model.to(device)
            return {k: v.to(device) for k, v in model_inputs.items()}

        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model_inputs = _to(device)
        except RuntimeError:
            # Transfer failed (most likely CUDA OOM) — use the CPU.
            model_inputs = _to("cpu")

        def _generate():
            with torch.no_grad():
                return self.model.generate(
                    **model_inputs,
                    max_new_tokens=20,
                    pad_token_id=tokenizer.eos_token_id,
                )

        try:
            outputs = _generate()
        except RuntimeError:
            # BUG FIX: generation itself is where CUDA OOM actually occurs
            # (the original only guarded the tensor transfer); retry once
            # on CPU before giving up.
            model_inputs = _to("cpu")
            outputs = _generate()

        # Decode only the newly generated tokens, not the echoed prompt.
        generated_tokens = outputs[0, len(model_inputs['input_ids'][0]):]
        out_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        return {'out_text': out_text}

    def postprocess(self, model_outputs):
        """Map the generated text to a label; 'Neutral' when no match.

        Checked in priority order: 'Positive' wins over 'Negative' if both
        substrings appear (matches the original elif chain).
        """
        out_text = model_outputs['out_text']
        for label in ('Positive', 'Negative'):
            if label in out_text:
                return label
        # The original's explicit 'Neutral' branch and its else both
        # returned 'Neutral', so a single default suffices.
        return 'Neutral'

# Initialize the model and tokenizer