# Source: "Upload app.py" by merve (HF staff), commit 143d008, 2.22 kB.
# (Hugging Face file-viewer chrome removed: merve's picture / raw / history / blame)
import streamlit as st
import pandas as pd
import re
import nltk
from PIL import Image
import os
import numpy as np
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import datasets
from datasets import load_dataset
import sklearn
from sklearn.preprocessing import LabelEncoder
# Open the "merve/poetry" dataset in streaming mode and build a DataFrame
# from its "train" split.
dataset = load_dataset("merve/poetry", streaming=True)
train_split = dataset["train"]
df = pd.DataFrame.from_dict(train_split)

# Directory used to locate files that sit next to the app: the script's own
# directory when __file__ is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Download the NLTK stop-word corpus and keep the English list.
nltk.download("stopwords")
stop = stopwords.words('english')
# standardizing dataset by removing special characters and lowercasing
def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase it.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; what remains is lowercased.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but never read;
            digits are always kept.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: "\d" and "\s" are not valid escapes in a normal string
    # literal and trigger a warning on recent Python versions.
    cleaned = re.sub(r'[^a-zA-Z\d\s]', '', text)
    return cleaned.lower()
st.write("Poetry dataset, content column cleaned from special characters and lowercased")
# A set gives constant-time membership tests; the plain list would be scanned
# once for every word of every poem.
stop_words = set(stop)


def _drop_stop_words(text):
    """Return *text* with its whitespace-separated stop words removed."""
    # Compare in lowercase: this filter runs before the text is lowercased,
    # so a case-sensitive test would let capitalised stop words through.
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


# Remove stop words first, then strip special characters and lowercase.
df.content = df.content.apply(_drop_stop_words).apply(standardize)
st.dataframe(df)
# Count how often each token occurs in the cleaned content column and chart
# the first 50 entries of the resulting counts.
# NOTE(review): the caption says "including stopwords", but the content column
# has already been filtered against the NLTK list above — confirm the wording.
st.write("Most appearing words including stopwords")
token_table = df.content.str.split(expand=True)  # one column per token position
words = token_table.unstack().value_counts()
st.bar_chart(words.iloc[:50])
# Turn off Streamlit's 'deprecation.showPyplotGlobalUse' option
# (presumably the warning shown when st.pyplot() is called without a figure).
st.set_option('deprecation.showPyplotGlobalUse', False)
# Read the word-cloud mask into an array. The context manager closes the file
# once the pixel data has been copied, instead of leaving the handle open.
with Image.open(os.path.join(d, "poet.png")) as mask_image:
    mask = np.array(mask_image)
# distributions of poem types according to ages and authors
st.write("Distributions of poem types according to ages and authors")
le = LabelEncoder()
# Replace the author column with the integer codes produced by the encoder.
df.author = le.fit_transform(df.author)
# Keep the grid returned by catplot and pass its figure to Streamlit explicitly
# instead of relying on pyplot's global "current figure".
grid = sns.catplot(x="age", y="author", hue="type", data=df)
st.pyplot(grid.figure)
# most appearing words other than stop words
import matplotlib.pyplot as plt
def word_cloud(content, title):
    """Draw a word cloud of the labels in ``content.index`` in the Streamlit app.

    Args:
        content: Object whose ``index.values`` are the words to draw; only the
            index is read, not the values.
        title: Text shown as the figure title.
    """
    wc = WordCloud(
        background_color="white",
        max_words=200,
        contour_width=3,
        stopwords=STOPWORDS,
        mask=mask,  # module-level array loaded from "poet.png"
        max_font_size=50,
    )
    # NOTE(review): each index label is passed exactly once, so the counts held
    # in `content` presumably do not influence word sizes — confirm whether
    # generate_from_frequencies was intended.
    wc.generate(" ".join(content.index.values))

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_title(title, fontsize=20)
    ax.imshow(
        wc.recolor(colormap='magma', random_state=42),
        cmap=plt.cm.gray,
        interpolation="bilinear",
        alpha=0.98,
    )
    ax.axis('off')

    # Pass the figure explicitly rather than relying on pyplot's global state,
    # then close it so repeated reruns do not accumulate open figures.
    st.pyplot(fig)
    plt.close(fig)
# Caption, then the word cloud built from the index of the `words` counts.
st.write("Most appearing words excluding stopwords")
word_cloud(content=words, title="Word Cloud")