# utilities
import re
import pickle
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
# Load the Sentiment140 dataset (1.6M tweets; label 0 = negative, 4 = positive).
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
# Removing the unnecessary columns.
dataset = dataset[['sentiment', 'text']]
# Replacing label 4 with 1 to ease understanding.
dataset['sentiment'] = dataset['sentiment'].replace(4, 1)
# Storing data in lists.
text, sentiment = list(dataset['text']), list(dataset['sentiment'])
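# The full Sentiment140 training file is balanced, so no resampling is needed.
# Optional sanity check (illustrative):
# print(dataset['sentiment'].value_counts())  # expect ~800000 each for 0 and 1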
# Emoticon -> word mapping used by preprocess() below. The original script
# references `emojis` without defining it; this is an illustrative subset
# (keys are lowercase because tweets are lowercased before replacement).
emojis = {':)': 'smile', ':-)': 'smile', ';)': 'wink', ':d': 'smile',
          ':(': 'sad', ':-(': 'sad', ":'(": 'sad', ':p': 'raspberry',
          ':o': 'surprised', ':/': 'annoyed', ':|': 'neutral', '<3': 'heart'}

def preprocess(textdata):
    processedText = []
    # Create Lemmatizer.
    wordLemm = WordNetLemmatizer()
    # Defining regex patterns.
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = r'@[^\s]+'
    alphaPattern = r"[^a-zA-Z0-9]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    for tweet in textdata:
        tweet = tweet.lower()
        # Replace all URLs with 'URL'.
        tweet = re.sub(urlPattern, ' URL', tweet)
        # Replace all emoticons with 'EMOJI' plus their meaning.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])
        # Replace @USERNAME with 'USER'.
        tweet = re.sub(userPattern, ' USER', tweet)
        # Replace all non-alphanumeric characters with a space.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters with 2 letters.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        tweetwords = ''
        for word in tweet.split():
            # Stopword filtering is disabled; keep words longer than one character.
            if len(word) > 1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word + ' ')
        processedText.append(tweetwords)
    return processedText
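# The sklearn imports at the top imply a classical TF-IDF training pipeline
# that this Space never runs (inference below uses VADER instead). A minimal
# sketch of what that pipeline could look like, with illustrative
# hyperparameters; it is commented out so the app still starts quickly:
#
# processedtext = preprocess(text)
# X_train, X_test, y_train, y_test = train_test_split(
#     processedtext, sentiment, test_size=0.05, random_state=0)
# vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
# X_train = vectoriser.fit_transform(X_train)
# X_test = vectoriser.transform(X_test)
# model = LogisticRegression(C=2, max_iter=1000, n_jobs=-1)  # or LinearSVC / BernoulliNB
# model.fit(X_train, y_train)
# print(classification_report(y_test, model.predict(X_test)))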
import gradio as gr
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download required NLTK resources (wordnet is needed by WordNetLemmatizer).
nltk.download('vader_lexicon')
nltk.download('wordnet')
# Load the pre-trained sentiment intensity analyzer.
sia = SentimentIntensityAnalyzer()
def get_sentiment(tweet):
    # Preprocess the tweet.
    processed_tweet = preprocess([tweet])
    # Get the sentiment scores using the VADER sentiment analyzer.
    sentiment_score = sia.polarity_scores(processed_tweet[0])
    # Determine the sentiment label based on the compound score.
    compound_score = sentiment_score['compound']
    if compound_score >= 0.05:
        sentiment = 'Positive'
    elif compound_score <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    return sentiment
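# Illustrative spot checks (expected labels assume VADER's standard lexicon
# and the thresholds above):
# get_sentiment('I love this movie! :)')      # -> 'Positive'
# get_sentiment('This weather is terrible.')  # -> 'Negative'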
# Create a Gradio interface. Note: with a single text input, each example
# must be its own inner list.
iface = gr.Interface(
    fn=get_sentiment,
    inputs='text',
    outputs='text',
    title='Tweet Sentiment Analyzer',
    description='Enter a tweet with text, an emoticon, or both, and get the sentiment prediction.',
    examples=[['I love this movie!'], ['This weather is terrible.']],
    theme='soft'
)
# Launch the interface.
iface.launch(share=True)