{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from sklearn.dummy import DummyRegressor\n", "from nltk.corpus import stopwords\n", "from textblob import TextBlob\n", "import textstat\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn import svm\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.feature_selection import SelectKBest\n", "import pandas as pd\n", "from util import escape_tags_and_content, escape_tags, escape_strings, escape_links, escape_hex_character_codes, escape_punctuation_boundaries, escape_odd_spaces\n", "from sklearn.model_selection import RepeatedKFold\n", "from sklearn.svm import SVR\n", "from sklearn.model_selection import GridSearchCV\n", "import numpy as np\n", "import util" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def gerar_metricas(project_name):\n", "\n", " # carregando os dados\n", " df = pd.read_csv(\"database\\\\tawos\\\\deep\\\\{}_deep-se.csv\".format(project_name))\n", "\n", " # criação de uma nova coluna\n", " df[\"context\"] = df[\"title\"] + df[\"description\"]\n", "\n", " # pré-processamento\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags_and_content(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_tags(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_strings(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_links(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_hex_character_codes(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_punctuation_boundaries(x))\n", " df[\"context\"] = df[\"context\"].apply(lambda x: escape_odd_spaces(x))\n", "\n", " # removendo stop-words\n", " stop = stopwords.words('english')\n", " df['context'] = df['context'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))\n", "\n", " # renomeando as colunas porque senão dá um problema com a extração de features do NEOSP\n", " df = df.rename(columns={ \"issuekey\": \"issuekey_\", \"created\": \"created_\", \"description\": \"description_\", \"title\": \"title_\", \"context\": \"context_\", \"storypoint\": \"storypoint_\"})\n", " y = df[\"storypoint_\"]\n", " df = df.drop(columns=['storypoint_'])\n", "\n", " # 5º coluna -> extração das features para o neosp\n", " df[\"gunning_fog_\"] = df['context_'].apply(textstat.gunning_fog)\n", " df[\"flesch_reading_ease_\"] = df['context_'].apply(textstat.flesch_reading_ease)\n", " df[\"flesch_kincaid_grade_\"] = df['context_'].apply(textstat.flesch_kincaid_grade)\n", " df[\"smog_index_\"] = df['context_'].apply(textstat.smog_index)\n", " df[\"coleman_liau_index_\"] = df['context_'].apply(textstat.coleman_liau_index)\n", " df[\"automated_readability_index_\"] = df['context_'].apply(textstat.automated_readability_index)\n", " df[\"dale_chall_readability_score_\"] = df['context_'].apply(textstat.dale_chall_readability_score)\n", " df[\"difficult_words_\"] = df['context_'].apply(textstat.difficult_words)\n", " df[\"linsear_write_formula_\"] = df['context_'].apply(textstat.linsear_write_formula)\n", " df[\"polarity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.polarity)\n", " df[\"subjectivity_\"] = df[\"context_\"].apply(lambda x: TextBlob(x).sentiment.subjectivity)\n", " # 16º colunas\n", "\n", " # Extração das features para o TFIDF\n", " vectorizer = TfidfVectorizer()\n", " X_vec = vectorizer.fit_transform(df[\"context_\"])\n", "\n", " df_vec = pd.DataFrame(data = X_vec.toarray(), columns = vectorizer.get_feature_names_out())\n", "\n", " # Juntando as features do neosp com o tfidf\n", " df = df.join(df_vec)\n", " X = df\n", " \n", " grid = GridSearchCV(\n", " estimator=SVR(kernel='rbf'),\n", " param_grid={\n", " 'C': [1.1, 5.4, 170, 1001],\n", " 'epsilon': [0.0003, 0.007, 0.0109, 0.019, 0.14, 0.05, 8, 0.2, 3, 2, 7],\n", " 'gamma': [0.7001, 0.008, 0.001, 3.1, 1, 1.3, 5]\n", " }, \n", " cv=10, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)\n", "\n", " #print the best parameters from all possible combinations\n", " grid.fit(X[X.columns[5:16]], y)\n", " print(grid.best_params_)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'C': 5.4, 'epsilon': 0.2, 'gamma': 5}\n", "{'C': 1.1, 'epsilon': 2, 'gamma': 0.7001}\n" ] } ], "source": [ "import pandas as pd\n", "\n", "LIBRARIES = [\"ALOY\", \"CLI\"]\n", " #\"APSTUD\", \n", " #\"CLOV\", \"COMPASS\", \"CONFCLOUD\", \"CONFSERVER\", \"DAEMON\", \"DM\", \"DNN\", \"DURACLOUD\", \"EVG\", \"FAB\", \n", " #\"MDL\", \"MESOS\" ,\"MULE\", \"NEXUS\", \"SERVER\", \"STL\", \"TIDOC\", \"TIMOB\", \"TISTUD\", \"XD\"]\n", "\n", "for lp in LIBRARIES:\n", " gerar_metricas(lp)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }