In [5]:
from sklearn.dummy import DummyRegressor
from nltk.corpus import stopwords
from textblob import TextBlob
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
import pandas as pd
from util import escape_tags_and_content, escape_tags, escape_strings, escape_links, escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces
from sklearn.model_selection import RepeatedKFold
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import numpy as np
import util

In [6]:
def gerar_metricas(project_name):
    """Grid-search RBF-SVR hyper-parameters for one project's story points.

    Reads ``database/tawos/deep/<project_name>_deep-se.csv``, builds a cleaned
    text field from each issue's title and description, derives eleven
    readability and sentiment features from that text (nine ``textstat``
    scores plus TextBlob polarity and subjectivity) and runs a 10-fold
    ``GridSearchCV`` over ``C``, ``epsilon`` and ``gamma`` scored by negative
    mean squared error.

    Only those eleven features are fed to the search; no bag-of-words matrix
    is built because the search never consumed one.

    Args:
        project_name: Prefix of the CSV file to load, e.g. ``"ALOY"``.

    Returns:
        dict: The best parameter combination found; it is also printed.
    """
    # Load the data. Forward slashes resolve on Windows and POSIX alike,
    # unlike a backslash-separated literal.
    df = pd.read_csv("database/tawos/deep/{}_deep-se.csv".format(project_name))

    # Build the text column. Missing values become empty strings so a blank
    # title/description does not turn the whole row into NaN, and the space
    # keeps the last title word from fusing with the first description word.
    context = (
        df["title"].fillna("").astype(str)
        + " "
        + df["description"].fillna("").astype(str)
    )

    # Pre-processing: run the project's escape_* helpers over the whole
    # column, one helper after another, in this fixed order.
    cleaning_steps = (
        escape_tags_and_content,
        escape_tags,
        escape_strings,
        escape_links,
        escape_hex_character_codes,
        escape_punctuation_boundaries,
        escape_odd_spaces,
    )
    for clean in cleaning_steps:
        context = context.apply(clean)

    # Remove stop-words (case-sensitive match). A set gives constant-time
    # membership tests instead of scanning a list for every word.
    stop = set(stopwords.words('english'))
    context = context.apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop)
    )

    # Readability features: one column per textstat function, named after the
    # function with a trailing underscore.
    textstat_metrics = (
        "gunning_fog",
        "flesch_reading_ease",
        "flesch_kincaid_grade",
        "smog_index",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
    )
    features = pd.DataFrame(
        {name + "_": context.apply(getattr(textstat, name)) for name in textstat_metrics}
    )

    # Sentiment features: analyse each text once and read both scores from it.
    sentiments = [TextBlob(text).sentiment for text in context]
    features["polarity_"] = [sentiment.polarity for sentiment in sentiments]
    features["subjectivity_"] = [sentiment.subjectivity for sentiment in sentiments]

    y = df["storypoint"]

    grid = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [1.1, 5.4, 170, 1001],
            'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],
            'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5],
        },
        cv=10, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    # The features are passed by name (the frame holds nothing else), so the
    # search does not depend on how many columns the CSV happens to have.
    grid.fit(features, y)

    # Print the best parameters from all possible combinations.
    print(grid.best_params_)
    return grid.best_params_

In [7]:
import pandas as pd

# Projects whose hyper-parameters are tuned below. Only the first two are
# enabled; the others stay listed (commented out) so they can be re-enabled.
LIBRARIES = [
    "ALOY",
    "CLI",
    # "APSTUD",
    # "CLOV", "COMPASS", "CONFCLOUD", "CONFSERVER", "DAEMON", "DM", "DNN", "DURACLOUD", "EVG", "FAB",
    # "MDL", "MESOS", "MULE", "NEXUS", "SERVER", "STL", "TIDOC", "TIMOB", "TISTUD", "XD",
]

# Run the grid search once per enabled project.
for lp in LIBRARIES:
    gerar_metricas(lp)

{'C': 5.4, 'epsilon': 0.2, 'gamma': 5}
{'C': 1.1, 'epsilon': 2, 'gamma': 0.7001}
