{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sklearn.dummy import DummyRegressor\n", "from nltk.corpus import stopwords\n", "from textblob import TextBlob\n", "import textstat\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import svm\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.feature_selection import SelectKBest\n", "import pandas as pd\n", "from util import escape_tags_and_content, escape_tags, escape_strings, escape_links, escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import RepeatedKFold\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.feature_selection import f_classif, f_regression" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def gerar_metricas(project_name):\n", "\n", " ########## SE FOR TAWOS DESCOMENTAR\n", " # df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n", " \n", " ########## SE FOR NEODATASET, SE FOR TAWOS REMOVER\n", " df = pd.read_json(\"database\\\\neo\\\\json\\\\{}.json\".format(project_name))\n", " \n", " ########## SE FOR NEODATASET, SE FOR TAWOS REMOVER\n", " df = df.rename(columns={ \"id\": \"issuekey\", \"created_at\": \"created\", \"weight\": \"storypoint\"})\n", " \n", " # criação de uma nova coluna\n", " df[\"context\"] = df[\"title\"] + df[\"description\"]\n", "\n", " # pré-processamento\n", " df['context'] = df['context'].astype(str)\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n", "\n", " # removendo stop-words\n", " stop = stopwords.words('english')\n", " df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n", "\n", " # SE FOR TAWOS renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n", " #df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n", " \n", " ########## SE FOR NEODATASET, SE FOR TAWOS REMOVER\n", " df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n", " \n", " y = df[\"storypoint_\"]\n", " df = df.drop(columns=['storypoint_'])\n", " \n", " ########## SE FOR NEODATASET, SE FOR TAWOS REMOVER\n", " df = df[[\"issuekey_\", \"created_\", \"title_\", \"description_\", \"context_\"]]\n", "\n", " # 5º coluna -> extração das features para o neosp\n", " df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n", " df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n", " df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n", " df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n", " df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n", " df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n", " df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n", " df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n", " df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n", " df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n", " df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n", " # 16º colunas\n", "\n", " # Extração das features para o TFIDF\n", " vectorizer = TfidfVectorizer()\n", " X_vec = vectorizer.fit_transform(df[\"context_\"])\n", "\n", " df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n", "\n", " # Juntando as features do neosp com o tfidf\n", " df = df.join(df_vec)\n", " X = df\n", "\n", " rkf = RepeatedKFold(n_splits=10, n_repeats=30, random_state=2652124)\n", " \n", " list_results_MAE_MbR, list_results_MAE_NEOSP, list_results_TFIDF_MbR = list(), list(), list()\n", " ### Dummy\n", " model = DummyRegressor(strategy=\"mean\")\n", " list_results_MAE_MbR = cross_val_score(model, X[X.columns[5:6]], y, cv = rkf, scoring=\"neg_mean_absolute_error\")\n", " df_results_MAE_MbR = pd.DataFrame(list_results_MAE_MbR, columns = [\"MAE\"])\n", " df_results_MAE_MbR = df_results_MAE_MbR.apply(lambda x: x*-1)\n", " df_results_MAE_MbR.to_csv(\"metricas/metricas_{}_MbR.csv\".format(project_name),index = False, header=False)\n", " \n", " ##### NEOSP\n", " model = make_pipeline(StandardScaler(), svm.SVR())\n", " list_results_MAE_NEOSP = cross_val_score(model, X[X.columns[5:16]], y, cv = rkf, scoring=\"neg_mean_absolute_error\")\n", " df_results_MAE_NEOSP = pd.DataFrame(list_results_MAE_NEOSP, columns = [\"MAE\"])\n", " df_results_MAE_NEOSP = df_results_MAE_NEOSP.apply(lambda x: x*-1)\n", " df_results_MAE_NEOSP.to_csv(\"metricas/metricas_{}_NEOSP_SVR.csv\".format(project_name), index = False, header=False)\n", " \n", " #### TFIDF\n", " model = make_pipeline(SelectKBest(f_regression, k=50), StandardScaler(), svm.SVR())\n", " list_results_TFIDF_MbR = cross_val_score(model, X[X.columns[16:]], y, cv = rkf, scoring=\"neg_mean_absolute_error\")\n", " df_results_MAE_TFIDF = pd.DataFrame(list_results_TFIDF_MbR, columns = [\"MAE\"])\n", " df_results_MAE_TFIDF = df_results_MAE_TFIDF.apply(lambda x: x*-1)\n", " df_results_MAE_TFIDF.to_csv(\"metricas/metricas_{}_TFIDF.csv\".format(project_name),index = False, header=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "#LIBRARIES_TAWOS = [\"ALOY\", \"APSTUD\", \"CLI\", \"CLOV\", \"COMPASS\", \"CONFCLOUD\", \"CONFSERVER\", \"DAEMON\", \"DM\", \"DNN\", \"DURACLOUD\", \"EVG\", \"FAB\", \n", " #\"MDL\", \"MESOS\" ,\"MULE\", \"NEXUS\", \"SERVER\", \"STL\", \"TIDOC\", \"TIMOB\", \"TISTUD\", \"XD\"]\n", "\n", "#LIBRARIES_NEO = [\"7764\", \"250833\", \"734943\", \"2009901\", \"2670515\", \"3828396\",\"3836952\", \n", "# \"4456656\", \"5261717\", \"6206924\", \"7071551\", \"7128869\",\"7603319\", \n", "# \"7776928\", \"10152778\",\"10171263\", \"10171270\", \"10171280\", \"10174980\", \n", "# \"12450835\",\"12584701\",\"12894267\", \"14052249\",\"14976868\", \"15502567\",\n", "# \"19921167\", \"21149814\", \"23285197\", \"28419588\",\"28644964\", \"28847821\"]\n", "\n", "#for project_name in LIBRARIES_NEO:\n", "# gerar_metricas(project_name)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }