"""Custom Hugging Face Pipeline that asks a causal LM whether a person is
portrayed Positively, Negatively, or Neutrally in a given HTML context."""

import re

import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, Pipeline


class MyPipeline(Pipeline):
    # Loaded once at class-definition time and shared by every instance.
    tokenizer = AutoTokenizer.from_pretrained("abacusai/Llama-3-Smaug-8B")

    def _sanitize_parameters(self, **kwargs):
        # Route "context" and "search_person" call kwargs to preprocess();
        # _forward() and postprocess() take no extra parameters.
        preprocess_kwargs = {}
        if "context" in kwargs:
            preprocess_kwargs["context"] = kwargs["context"]
        if "search_person" in kwargs:
            preprocess_kwargs["search_person"] = kwargs["search_person"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, **kwargs):
        tokenizer = MyPipeline.tokenizer
        # `inputs` is a dict; values passed as call kwargs (routed here by
        # _sanitize_parameters) take precedence over the dict entries.
        context = kwargs["context"] if "context" in kwargs else inputs["context"]
        search_person = kwargs["search_person"] if "search_person" in kwargs else inputs["search_person"]

        def create_prompt(context, search_person):
            def clean_text(text):
                # Drop hyperlinks, then strip any parenthesized asides.
                soup = BeautifulSoup(text, "html.parser")
                for link in soup.find_all("a"):
                    link.decompose()
                return re.sub(r"\([^)]*\)", "", soup.get_text())

            def prepare_question(search_person):
                return f"""
Based on the information provided in the context, what is the most likely perception of {search_person}?
Pick one answer option.

Answer options:

Positive: {search_person} is portrayed in a favorable light, and the context suggests that she is a caring and responsible parent.
Negative: {search_person} is portrayed in an unfavorable light, and the context suggests that she is a neglectful and/or abusive parent.
Neutral: The context does not provide enough information to make a determination about the character or actions of {search_person}, or it presents a balanced and unbiased view of her.
"""

            context = clean_text(context)
            question = prepare_question(search_person)
            # Crude truncation: drop characters from the front of the context
            # until the full prompt fits the model's maximum input length.
            while context and len(tokenizer.tokenize(context + " " + question)) > tokenizer.model_max_length:
                context = context[500:]

            return f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n"

        prompt = create_prompt(context, search_person)
        return tokenizer(prompt, return_tensors="pt")

    def _forward(self, model_inputs):
        tokenizer = MyPipeline.tokenizer
        try:
            # Prefer the GPU when available; a RuntimeError (e.g. running out
            # of GPU memory) triggers the CPU fallback below.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = self.model.to(device)
            model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        except RuntimeError:
            self.model = self.model.to("cpu")
            model_inputs = {k: v.to("cpu") for k, v in model_inputs.items()}

        with torch.no_grad():
            # Llama 3 defines no pad token, so reuse EOS to avoid a warning.
            outputs = self.model.generate(
                **model_inputs,
                max_new_tokens=20,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Keep only the newly generated tokens, not the echoed prompt.
        generated_tokens = outputs[0, model_inputs["input_ids"].shape[1]:]
        out_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        return {"out_text": out_text}

    def postprocess(self, model_outputs):
        # Map the generation onto one of the three labels, checked in a fixed
        # order; fall back to "Neutral" when no label appears in the output.
        out_text = model_outputs["out_text"]
        for label in ("Positive", "Negative", "Neutral"):
            if label in out_text:
                return label
        return "Neutral"
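

# A minimal usage sketch, assuming the standard custom-pipeline registration
# API; the task name "perception-classification" and the example inputs are
# illustrative assumptions, not part of the original code.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, pipeline
    from transformers.pipelines import PIPELINE_REGISTRY

    PIPELINE_REGISTRY.register_pipeline(
        "perception-classification",
        pipeline_class=MyPipeline,
        pt_model=AutoModelForCausalLM,
    )
    pipe = pipeline("perception-classification", model="abacusai/Llama-3-Smaug-8B")
    label = pipe({"context": "<p>Some article text</p>", "search_person": "Jane Doe"})
    print(label)  # "Positive", "Negative", or "Neutral"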