import gradio as gr
import pandas as pd
import torch
from openprompt import PromptDataLoader, PromptForClassification
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def readLMwords():
    """Load positive, negative, and uncertainty word lists from the
    Loughran-McDonald master dictionary (2020 release)."""
    alldata = pd.read_csv("LoughranMcDonald_MasterDictionary_2020.csv")
    positive = list(alldata[alldata["Positive"] != 0]["Word"].str.lower())
    negative = list(alldata[alldata["Negative"] != 0]["Word"].str.lower())
    uncertainty = list(alldata[alldata["Uncertainty"] != 0]["Word"].str.lower())
    return positive, negative, uncertainty
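
# Optional sanity check (a sketch; assumes the Loughran-McDonald CSV sits next to this file):
# pos, neg, unc = readLMwords()
# print(len(pos), len(neg), len(unc))
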
def sentiment_analysis(sentence, model_name):
    model_name = "HYCCC/" + model_name
    raw_sentences = sentence.strip().split('\n')

    # Prompt template and label space for the OpenPrompt classifier.
    template = '{"placeholder":"text_a"} Shares are {"mask"}.'
    classes = ['positive', 'neutral', 'negative']
    # The Loughran-McDonald uncertainty list supplies the label words for "neutral".
    positive, negative, uncertainty = readLMwords()
    label_words = {
        "positive": positive,
        "neutral": uncertainty,
        "negative": negative,
    }
    # All selectable checkpoints are RoBERTa-based, so load_plm uses the "roberta" wrapper.
    type_dic = {
        "HYCCC/RoBERTa_Chinese_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_Chinese_FinancialNews_tuned": "roberta",
        "HYCCC/RoBERTa_English_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_English_FinancialNews_tuned": "roberta",
    }

    if 'Chinese' in model_name:
        # Translate Chinese input to English with MarianMT before classification.
        trans_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
        trans_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
        translated_tokens = trans_model.generate(
            **trans_tokenizer(raw_sentences, return_tensors="pt", padding=True)
        )
        sentences = [
            trans_tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens
        ]
    else:
        sentences = raw_sentences

    # Wrap each sentence as an OpenPrompt InputExample (the label is a placeholder).
    testdata = [
        InputExample(guid=i, text_a=s, label=0) for i, s in enumerate(sentences)
    ]

    plm, tokenizer, model_config, WrapperClass = load_plm(type_dic[model_name], model_name)
    promptTemplate = ManualTemplate(
        text=template,
        tokenizer=tokenizer,
    )
    promptVerbalizer = ManualVerbalizer(
        classes=classes,
        label_words=label_words,
        tokenizer=tokenizer,
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=promptTemplate,
        tokenizer_wrapper_class=WrapperClass,
        batch_size=4,
        max_seq_length=512,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=promptTemplate,
        verbalizer=promptVerbalizer,
        freeze_plm=True,
    )

    # Run inference batch by batch and collect the predicted class indices.
    result = []
    with torch.no_grad():
        for step, inputs in enumerate(test_dataloader):
            logits = prompt_model(inputs)
            result.extend(torch.argmax(logits, dim=-1).tolist())
    # Pair each prediction with the original (untranslated) sentence.
    output = '\n'.join(f"{classes[res]}, {raw_sentences[i]}" for i, res in enumerate(result))
    return output

demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[
        gr.TextArea(
            placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
            label="Sentence",
            lines=5,
            max_lines=10,
        ),
        gr.Radio(
            choices=[
                "RoBERTa_Chinese_AnnualReport_tuned",
                "RoBERTa_Chinese_FinancialNews_tuned",
                "RoBERTa_English_AnnualReport_tuned",
                "RoBERTa_English_FinancialNews_tuned",
            ],
            label="Model Selection",
        ),
    ],
    outputs=gr.TextArea(label="Sentiment", lines=5, show_copy_button=True, max_lines=10),
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)

demo.launch()
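
# Local smoke test (a sketch that bypasses the UI; "RoBERTa_English_AnnualReport_tuned"
# is one of the model names offered in the Radio component above):
# print(sentiment_analysis("Revenue grew strongly this quarter.",
#                          "RoBERTa_English_AnnualReport_tuned"))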