"""Gradio demo for prompt-learning-based disclosure sentiment detection.

Each input line is classified as positive / neutral / negative by a
prompt-tuned RoBERTa checkpoint. Chinese checkpoints are fed English text, so
their input is first machine-translated with a zh->en model.
"""
from functools import lru_cache

import gradio as gr
import pandas as pd
import torch
from openprompt import PromptDataLoader
from openprompt import PromptForClassification
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate
from openprompt.prompts import ManualVerbalizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import pipeline

LM_DICTIONARY_PATH = "LoughranMcDonald_MasterDictionary_2020.csv"
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-zh-en"
HUB_NAMESPACE = "HYCCC/"
TEMPLATE_TEXT = '{"placeholder":"text_a"} Shares are {"mask"}.'
CLASSES = ['positive', 'neutral', 'negative']
BATCH_SIZE = 4
MAX_SEQ_LENGTH = 512

# Selectable checkpoint name -> PLM family name expected by ``load_plm``.
MODEL_TYPES = {
    "RoBERTa_Chinese_AnnualReport_tuned": "roberta",
    "RoBERTa_Chinese_FinancialNews_tuned": "roberta",
    "RoBERTa_English_AnnualReport_tuned": "roberta",
    "RoBERTa_English_FinancialNews_tuned": "roberta",
}


def readLMwords():
    """Load the word lists used as verbalizer label words.

    Reads the Loughran-McDonald master dictionary CSV from the working
    directory.

    Returns:
        A ``(positive, negative, uncertainty)`` tuple. Each element is a list
        of the lower-cased ``Word`` values of the rows whose corresponding
        column is non-zero.
    """
    alldata = pd.read_csv(LM_DICTIONARY_PATH)

    def words_flagged(column):
        flagged = alldata.loc[alldata[column] != 0, "Word"]
        # read_csv turns tokens such as "NULL" or "NA" into missing values by
        # default; a NaN must never be handed to the verbalizer as a word.
        return flagged.dropna().str.lower().tolist()

    return (
        words_flagged("Positive"),
        words_flagged("Negative"),
        words_flagged("Uncertainty"),
    )


@lru_cache(maxsize=1)
def _load_translator():
    """Load the zh->en translation tokenizer and model once and reuse them."""
    tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)
    return tokenizer, model


def _translate_to_english(sentences):
    """Translate a list of Chinese sentences to English, preserving order."""
    tokenizer, model = _load_translator()
    # truncation=True keeps over-long lines within the model's input limit
    # instead of failing inside generate().
    batch = tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    )
    generated = model.generate(**batch)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in generated]


# maxsize=1: only the most recently used checkpoint stays resident, so repeat
# requests are fast without holding every model in memory at once.
@lru_cache(maxsize=1)
def _load_prompt_model(model_name):
    """Build the prompt classifier and its data-loading pieces for a model.

    Returns:
        ``(prompt_model, tokenizer, template, wrapper_class)`` for the
        checkpoint ``HUB_NAMESPACE + model_name``.
    """
    plm, tokenizer, _model_config, wrapper_class = load_plm(
        MODEL_TYPES[model_name], HUB_NAMESPACE + model_name
    )
    # The dictionary's "uncertainty" words serve as the neutral label words.
    positive, negative, neutral = readLMwords()
    template = ManualTemplate(text=TEMPLATE_TEXT, tokenizer=tokenizer)
    verbalizer = ManualVerbalizer(
        classes=CLASSES,
        label_words={
            "positive": positive,
            "neutral": neutral,
            "negative": negative,
        },
        tokenizer=tokenizer,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=template,
        verbalizer=verbalizer,
        freeze_plm=True,
    )
    prompt_model.eval()  # inference only
    return prompt_model, tokenizer, template, wrapper_class


def sentiment_analysis(sentence, model_name):
    """Classify the sentiment of each non-blank line of ``sentence``.

    Args:
        sentence: Text with one sentence per line.
        model_name: One of the keys of ``MODEL_TYPES``. Names containing
            ``"Chinese"`` cause the input to be translated to English before
            classification.

    Returns:
        A string with one ``"<label>, <original sentence>"`` line per input
        sentence, in input order.

    Raises:
        gr.Error: If no/unknown model is selected or no sentence is given, so
            the UI shows a readable message instead of a bare traceback.
    """
    if not model_name:
        raise gr.Error("Please select a model.")
    if model_name not in MODEL_TYPES:
        raise gr.Error(f"Unknown model: {model_name}")

    # Blank lines carry nothing to classify; skip them.
    raw_sentences = [
        line for line in (sentence or "").strip().split('\n') if line.strip()
    ]
    if not raw_sentences:
        raise gr.Error("Please enter at least one sentence.")

    if 'Chinese' in model_name:
        sentences = _translate_to_english(raw_sentences)
    else:
        sentences = raw_sentences

    # The label is a placeholder; it is never read at prediction time here.
    testdata = [
        InputExample(guid=i, text_a=text, label=0)
        for i, text in enumerate(sentences)
    ]

    prompt_model, tokenizer, template, wrapper_class = _load_prompt_model(
        model_name
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=template,
        tokenizer_wrapper_class=wrapper_class,
        batch_size=BATCH_SIZE,
        max_seq_length=MAX_SEQ_LENGTH,
    )

    predictions = []
    with torch.no_grad():  # no gradients needed for inference
        for inputs in test_dataloader:
            logits = prompt_model(inputs)
            predictions.extend(torch.argmax(logits, dim=-1).tolist())

    # Echo the original (untranslated) sentence next to its predicted label.
    return '\n'.join(
        f"{CLASSES[label]}, {raw}"
        for label, raw in zip(predictions, raw_sentences)
    )


demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[
        gr.TextArea(
            placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
            label="Sentence",
            lines=5,
            max_lines=10,
        ),
        gr.Radio(choices=list(MODEL_TYPES), label="Model Selection"),
    ],
    outputs=gr.TextArea(
        label="Sentiment", lines=5, show_copy_button=True, max_lines=10
    ),
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)

if __name__ == "__main__":
    demo.launch()