In [1]:
import os

import pandas as pd
import textstat
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob

from util import escape_tags_and_content, escape_tags, escape_strings, escape_links, escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces

In [2]:
def gerar_metricas(project_name, dataset="neo"):
    """Cross-validate three story-point estimators for one project and save their MAE.

    The issue text (title + description) is cleaned, then three models are
    scored with the same repeated 10-fold cross-validation (30 repeats, fixed
    seed, i.e. 300 scores per model):

    * MbR   - ``DummyRegressor(strategy="mean")`` baseline;
    * NEOSP - ``StandardScaler`` + ``SVR`` on readability/sentiment features;
    * TFIDF - ``SelectKBest(f_regression)`` + ``StandardScaler`` + ``SVR`` on
      TF-IDF features.

    Each model's absolute errors are written, one value per line and without
    header or index, to ``metricas/metricas_<project_name>_<MODEL>.csv``.

    Parameters
    ----------
    project_name : str
        Project identifier used to build the input and output file names.
    dataset : {"neo", "tawos"}, default "neo"
        Which data source to read. ``"neo"`` reproduces the original behaviour
        (JSON file, columns renamed); ``"tawos"`` reads the deep-se CSV.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    issues = _load_issues(project_name, dataset)
    y = issues["storypoint"]
    context = _build_context(issues)

    # The feature groups are kept as separate frames instead of being joined
    # and sliced by position: this avoids index misalignment and column-name
    # clashes between TF-IDF tokens and the other columns.
    neosp_features = _extract_neosp_features(context)
    tfidf_features = _extract_tfidf_features(context)

    # An integer random_state makes every cross_val_score call below use the
    # same 300 train/test splits.
    rkf = RepeatedKFold(n_splits=10, n_repeats=30, random_state=2652124)

    # Create the output directory up front so a long run cannot fail at write time.
    os.makedirs("metricas", exist_ok=True)

    # Mean baseline: the features are ignored by DummyRegressor, a single
    # column is passed only because cross_val_score needs an X.
    _save_mae(
        DummyRegressor(strategy="mean"),
        neosp_features[["gunning_fog_"]],
        y,
        rkf,
        "metricas/metricas_{}_MbR.csv".format(project_name),
    )

    # NEOSP: readability + sentiment features.
    _save_mae(
        make_pipeline(StandardScaler(), svm.SVR()),
        neosp_features,
        y,
        rkf,
        "metricas/metricas_{}_NEOSP_SVR.csv".format(project_name),
    )

    # TF-IDF: keep the 50 best terms, or all of them when the vocabulary is
    # smaller than 50 (SelectKBest raises if k exceeds the feature count).
    n_selected = min(50, tfidf_features.shape[1])
    _save_mae(
        make_pipeline(SelectKBest(f_regression, k=n_selected), StandardScaler(), svm.SVR()),
        tfidf_features,
        y,
        rkf,
        "metricas/metricas_{}_TFIDF.csv".format(project_name),
    )


def _load_issues(project_name, dataset):
    """Read one project's issues into a frame with title/description/storypoint columns."""
    if dataset == "neo":
        path = os.path.join("database", "neo", "json", "{}.json".format(project_name))
        issues = pd.read_json(path)
        # Map the NEODATASET column names onto the names used by the rest of the code.
        return issues.rename(columns={"id": "issuekey", "created_at": "created", "weight": "storypoint"})
    if dataset == "tawos":
        # NOTE(review): assumes the TAWOS CSV already has title, description
        # and storypoint columns - confirm against the actual files.
        path = os.path.join("database", "tawos", "deep", "{}_deep-se.csv".format(project_name))
        return pd.read_csv(path)
    raise ValueError("dataset must be 'neo' or 'tawos', got {!r}".format(dataset))


def _build_context(issues):
    """Return the cleaned, stop-word-free 'title description' text of every issue."""
    # Missing values become empty strings so that a missing description does
    # not wipe out the title, and a space keeps the last title word from
    # merging with the first description word.
    title = issues["title"].fillna("").astype(str)
    description = issues["description"].fillna("").astype(str)
    context = title + " " + description

    # Pre-processing helpers from util, applied in this order.
    cleaning_steps = (
        escape_tags_and_content,
        escape_tags,
        escape_strings,
        escape_links,
        escape_hex_character_codes,
        escape_punctuation_boundaries,
        escape_odd_spaces,
    )
    for clean in cleaning_steps:
        context = context.apply(clean)

    # Stop-word removal (case-sensitive, as before); a set gives O(1) lookups.
    stop_words = set(stopwords.words('english'))
    return context.apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )


def _extract_neosp_features(context):
    """Compute the 11 NEOSP readability and sentiment features for each text."""
    readability_metrics = (
        ("gunning_fog_", textstat.gunning_fog),
        ("flesch_reading_ease_", textstat.flesch_reading_ease),
        ("flesch_kincaid_grade_", textstat.flesch_kincaid_grade),
        ("smog_index_", textstat.smog_index),
        ("coleman_liau_index_", textstat.coleman_liau_index),
        ("automated_readability_index_", textstat.automated_readability_index),
        ("dale_chall_readability_score_", textstat.dale_chall_readability_score),
        ("difficult_words_", textstat.difficult_words),
        ("linsear_write_formula_", textstat.linsear_write_formula),
    )
    features = pd.DataFrame(index=context.index)
    for column, metric in readability_metrics:
        features[column] = context.apply(metric)

    # Run the sentiment analysis once per text and read both scores from it.
    sentiments = [TextBlob(text).sentiment for text in context]
    features["polarity_"] = [sentiment.polarity for sentiment in sentiments]
    features["subjectivity_"] = [sentiment.subjectivity for sentiment in sentiments]
    return features


def _extract_tfidf_features(context):
    """Return a dense TF-IDF frame (one column per term) aligned with ``context``."""
    # NOTE(review): the vectorizer is fitted on the complete corpus before
    # cross-validation, so IDF weights also see the test folds. Kept as-is to
    # preserve the published numbers; consider moving it inside the pipeline.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(context)
    return pd.DataFrame(
        data=tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=context.index,
    )


def _save_mae(model, X, y, cv, output_path):
    """Cross-validate ``model`` and write its per-fold MAE to ``output_path``."""
    scores = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_absolute_error")
    # scikit-learn reports the negated MAE; flip the sign before saving.
    pd.DataFrame(-scores, columns=["MAE"]).to_csv(output_path, index=False, header=False)

In [3]:
import pandas as pd

#LIBRARIES_TAWOS = ["ALOY", "APSTUD", "CLI", "CLOV", "COMPASS", "CONFCLOUD", "CONFSERVER", "DAEMON", "DM", "DNN", "DURACLOUD", "EVG", "FAB", 
 #"MDL", "MESOS" ,"MULE", "NEXUS", "SERVER", "STL", "TIDOC", "TIMOB", "TISTUD", "XD"]

#LIBRARIES_NEO = ["7764", "250833", "734943", "2009901", "2670515", "3828396","3836952", 
# "4456656", "5261717", "6206924", "7071551", "7128869","7603319", 
# "7776928", "10152778","10171263", "10171270", "10171280", "10174980", 
# "12450835","12584701","12894267", "14052249","14976868", "15502567",
# "19921167", "21149814", "23285197", "28419588","28644964", "28847821"]

#for project_name in LIBRARIES_NEO:
# gerar_metricas(project_name)