{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "orig_nbformat": 2, "kernelspec": { "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2", "display_name": "Python 3.8.3 64-bit ('py38': conda)" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sys\n", "nb_dir = os.path.split(os.getcwd())[0]\n", "if nb_dir not in sys.path:\n", " sys.path.append(nb_dir)\n", "\n", "import numpy as np\n", "import pandas as pd\n", "# import modin.pandas as mpd\n", "import spacy\n", "from src.configs import ModelConfigs, Languages\n", "from src.utils import wordifier, TextPreprocessor, encode\n", "\n", "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n", "from tqdm import trange\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.utils import resample\n", "import multiprocessing as mp\n", "# import dask.dataframe as dask_df\n", "from stqdm import stqdm\n", "stqdm.pandas()\n", "\n", "from tqdm import trange\n", "\n", "import os\n", "# os.environ[\"MODIN_ENGINE\"] = \"ray\" # Modin will use Ray\n", "\n", "import vaex\n", "pd.set_option(\"display.max_colwidth\", None)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "path = \"../../../../Downloads/wordify_10000_copy.xlsx\"" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "df = pd.read_excel(path, dtype=str).dropna()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# df = pd.read_excel(\"../data/test_de.xlsx\")\n", "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n", "language = \"English\"\n", "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "prep = TextPreprocessor(\n", " language=\"English\", \n", " cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n", " lemmatizer_when=None,\n", ")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 9939/9939 [00:06<00:00, 1431.09it/s]\n" ] } ], "source": [ "df[\"p_text\"] = prep.fit_transform(df[\"text\"])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "clf = LogisticRegression(\n", " penalty=\"l1\",\n", " C=0.05,#ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n", " solver=\"liblinear\",\n", " multi_class=\"auto\",\n", " max_iter=500,\n", " class_weight=\"balanced\",\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU times: user 1.45 s, sys: 10.6 ms, total: 1.46 s\nWall time: 1.46 s\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(C=0.05, 
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "clf = LogisticRegression(\n", "    penalty=\"l1\",\n", "    C=0.05,  # ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n", "    solver=\"liblinear\",\n", "    multi_class=\"auto\",\n", "    max_iter=500,\n", "    class_weight=\"balanced\",\n", ")" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU times: user 1.45 s, sys: 10.6 ms, total: 1.46 s\nWall time: 1.46 s\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n", "                   solver='liblinear')" ] }, "metadata": {}, "execution_count": 22 } ], "source": [ "%%time\n", "clf.fit(X, y)" ] },
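{ "cell_type": "markdown", "metadata": {}, "source": [
 "The next cell runs a stability-selection-style loop: it repeatedly fits an L1-penalized logistic regression with a randomly drawn penalty strength on stratified bootstrap samples and counts how often each feature gets a positive or a negative coefficient. Its sample-size heuristic is easiest to see with concrete numbers; all values in the sketch below are illustrative (the real constants live in `ModelConfigs`)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Worked example of the sample-size heuristic (illustrative numbers, not\n",
 "# taken from this dataset or from ModelConfigs).\n",
 "toy_instances, toy_features = 10_000, 35_000\n",
 "\n",
 "# * 10 / 10 rounds the ratio up to one decimal place: 3.5 here\n",
 "toy_fraction = np.ceil((toy_features / toy_instances) * 10) / 10\n",
 "\n",
 "MIN_SELECTION, MAX_SELECTION = 10_000, 50_000  # assumed values\n",
 "toy_sample_size = min(\n",
 "    MAX_SELECTION,  # hard upper bound\n",
 "    max(MIN_SELECTION, int(toy_instances * toy_fraction)),  # 35_000\n",
 "    toy_instances,  # never request more rows than exist -> 10_000 wins\n",
 ")\n",
 "print(toy_fraction, toy_sample_size)  # 3.5 10000" ] },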
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "  6%|▌         | 28/500 [01:01<27:33,  3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", "  warnings.warn(\"Liblinear failed to converge, increase \"\n", " 31%|███       | 156/500 [06:18<13:54,  2.43s/it]\n" ] }, { "output_type": "error", "ename": "KeyboardInterrupt", "evalue": "", "traceback": [
 "---------------------------------------------------------------------------",
 "KeyboardInterrupt                         Traceback (most recent call last)",
 "<ipython-input-14> in <module>\n     39 # fit\n     40 try:\n---> 41     clf.fit(X[selection], y[selection])\n     42 except ValueError:\n     43     continue",
 "~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py in fit(self, X, y, sample_weight)\n-> 1356 self.coef_, self.intercept_, n_iter_ = _fit_liblinear(",
 "~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py in _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\n--> 966 raw_coef_, n_iter_ = liblinear.train_wrap(",
 "KeyboardInterrupt: " ] } ], "source": [
 "n_instances, n_features = X.shape\n",
 "n_classes = len(y_names)\n",
 "\n",
 "# NOTE: the * 10 / 10 trick rounds the fraction up to one decimal place\n",
 "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
 "\n",
 "sample_size = min(\n",
 "    # this is the maximum supported\n",
 "    ModelConfigs.MAX_SELECTION.value,\n",
 "    # at minimum you want MIN_SELECTION, but in general you want\n",
 "    # n_instances * sample_fraction\n",
 "    max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
 "    # however, if the previous value is bigger than the number of available\n",
 "    # instances, take the number of available instances\n",
 "    n_instances,\n",
 ")\n",
 "\n",
 "# TODO: might want to try subsampling features at each iteration\n",
 "\n",
 "# initialize score matrices (sign counts per class and feature)\n",
 "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
 "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
 "\n",
 "for _ in trange(ModelConfigs.NUM_ITERS.value):\n",
 "\n",
 "    # run randomized regression with a randomly drawn penalty strength\n",
 "    clf = LogisticRegression(\n",
 "        penalty=\"l1\",\n",
 "        C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n",
 "        solver=\"liblinear\",\n",
 "        multi_class=\"auto\",\n",
 "        max_iter=500,\n",
 "        class_weight=\"balanced\",\n",
 "    )\n",
 "\n",
 "    # stratified bootstrap: sample row indices to subsample the matrix\n",
 "    selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n",
 "\n",
 "    # fit; skip iterations where the bootstrap sample is degenerate\n",
 "    try:\n",
 "        clf.fit(X[selection], y[selection])\n",
 "    except ValueError:\n",
 "        continue\n",
 "\n",
 "    # record coefficient signs\n",
 "    if n_classes == 2:\n",
 "        # liblinear returns a single coefficient row for binary problems:\n",
 "        # positive weights vote for class 1, negative weights for class 0\n",
 "        pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
 "        neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
 "        pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
 "        neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
 "    else:\n",
 "        pos_scores += clf.coef_ > 0\n",
 "        neg_scores += clf.coef_ < 0" ] },
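{ "cell_type": "markdown", "metadata": {}, "source": [
 "Why the binary branch flips the comparisons: for a two-class problem scikit-learn's liblinear solver returns `coef_` with shape `(1, n_features)`, i.e. a single row, where positive weights vote for class 1 and negative weights for class 0. The cell below is a toy illustration of that shape (data made up)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Toy check: binary liblinear models expose one coefficient row, which is\n",
 "# why the loop above counts (coef_ < 0) as a positive score for class 0.\n",
 "X_toy = np.array([[0.0, 1.0], [0.0, 2.0], [3.0, 0.0], [4.0, 0.0]])\n",
 "y_toy = np.array([0, 0, 1, 1])\n",
 "\n",
 "toy_clf = LogisticRegression(penalty=\"l1\", solver=\"liblinear\").fit(X_toy, y_toy)\n",
 "print(toy_clf.coef_.shape)  # (1, 2): one row, even with two classes\n",
 "print(toy_clf.coef_)  # positive entries favor class 1, negative favor class 0\n",
 "# (with an L1 penalty some entries may be exactly zero)" ] },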
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# normalize counts to selection frequencies in [0, 1]\n",
 "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n",
 "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n",
 "\n",
 "# keep only features selected at least SELECTION_THRESHOLD of the time\n",
 "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n",
 "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n",
 "\n",
 "# prepare DataFrames of (word, selection frequency, label)\n",
 "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n",
 "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n",
 "\n",
 "posdf = pd.DataFrame(pos, columns=[\"word\", \"score\", \"label\"]).sort_values([\"label\", \"score\"], ascending=False)\n",
 "negdf = pd.DataFrame(neg, columns=[\"word\", \"score\", \"label\"]).sort_values([\"label\", \"score\"], ascending=False)" ] },
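{ "cell_type": "markdown", "metadata": {}, "source": [
 "A quick way to eyeball the result: pull the top-scoring indicative words per label from `posdf` (and symmetrically from `negdf`). Plain pandas, nothing project-specific." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# most stable positive indicators for each label (n is arbitrary)\n",
 "posdf.groupby(\"label\").head(10)" ] }
] }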