from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import streamlit as st
from pandas.core.frame import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

from .configs import InputTransformConfigs, ModelConfigs


def input_transform(text: pd.Series, labels: pd.Series, configs=InputTransformConfigs) -> Dict[str, np.ndarray]:
    """
    Encode text and labels as mathematical objects amenable to the training algorithm.

    Args:
        text: documents to vectorize with TF-IDF.
        labels: one label per document; encoded with a ``LabelEncoder``.
        configs: object exposing ``NGRAM_RANGE``, ``MIN_DF``, ``MAX_DF`` and
            ``SUBLINEAR`` members, each read through ``.value``.

    Returns:
        A dictionary with four entries:
        ``"X"`` the output of ``TfidfVectorizer.fit_transform``,
        ``"y"`` the encoded labels,
        ``"X_names"`` the feature names as an array, and
        ``"y_names"`` the label encoder classes.
    """
    tfidf_vectorizer = TfidfVectorizer(
        input="content",  # default: file already in memory
        encoding="utf-8",  # default
        decode_error="strict",  # default
        strip_accents=None,  # do nothing
        lowercase=False,  # do nothing
        preprocessor=None,  # do nothing - default
        tokenizer=None,  # default
        stop_words=None,  # do nothing
        analyzer="word",
        ngram_range=configs.NGRAM_RANGE.value,  # bounds come from configs (original note: maximum 3-ngrams)
        min_df=configs.MIN_DF.value,
        max_df=configs.MAX_DF.value,
        sublinear_tf=configs.SUBLINEAR.value,
    )
    label_encoder = LabelEncoder()

    X = tfidf_vectorizer.fit_transform(text.values)
    y = label_encoder.fit_transform(labels.values)

    return {
        "X": X,
        "y": y,
        "X_names": np.array(tfidf_vectorizer.get_feature_names_out()),
        "y_names": label_encoder.classes_,
    }


def _active_features(
    scores: np.ndarray, threshold: float, X_names: List[str], y_names: List[str]
) -> List[Tuple[str, float, str]]:
    """Return ``(feature name, score, class name)`` for every non-zero score at or above *threshold*."""
    # zero-out everything below the threshold so that `nonzero` only yields the active cells
    kept = np.where(scores >= threshold, scores, 0)
    return [(X_names[i], scores[c, i], y_names[c]) for c, i in zip(*kept.nonzero())]


def wordifier(
    X: np.ndarray, y: np.ndarray, X_names: List[str], y_names: List[str], configs=ModelConfigs
) -> Tuple[List[Tuple[str, float, str]], List[Tuple[str, float, str]]]:
    """
    Score features by how often randomized L1 logistic regressions give them a non-zero sign.

    For ``NUM_ITERS`` iterations a stratified bootstrap sample of the rows is
    drawn, a ``LogisticRegression`` with a randomly picked penalty from
    ``PENALTIES`` is fitted, and for every (class, feature) pair it is recorded
    whether the coefficient was positive or negative. Counts are divided by
    ``NUM_ITERS`` and only pairs whose score reaches ``SELECTION_THRESHOLD``
    are returned.

    Args:
        X: feature matrix with one row per instance (indexed as ``X[selection]``).
        y: encoded labels, one per instance.
        X_names: feature names, indexed by column of ``X``.
        y_names: class names, indexed by encoded label.
        configs: object exposing ``MAX_SELECTION``, ``MIN_SELECTION``,
            ``NUM_ITERS``, ``PENALTIES`` and ``SELECTION_THRESHOLD`` members,
            each read through ``.value``.

    Returns:
        A pair ``(pos, neg)`` of lists of ``(feature name, score, class name)``
        tuples: ``pos`` for positive coefficients, ``neg`` for negative ones.
    """
    n_instances, n_features = X.shape
    n_classes = len(y_names)

    num_iters = configs.NUM_ITERS.value
    penalties = configs.PENALTIES.value
    threshold = configs.SELECTION_THRESHOLD.value

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however if the previous one is bigger than the available instances
        # take the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # initialize coefficient matrices
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    pbar = st.progress(0)
    for i in range(num_iters):

        # run randomized regression
        clf = LogisticRegression(
            penalty="l1",
            C=penalties[np.random.randint(len(penalties))],
            solver="liblinear",
            multi_class="auto",
            max_iter=500,
            class_weight="balanced",
        )

        # sample indices to subsample matrix
        selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)

        try:
            clf.fit(X[selection], y[selection])
        except ValueError:
            # best effort: an iteration whose fit fails simply casts no votes
            fitted = False
        else:
            fitted = True

        if fitted:
            # record coefficient signs
            positive = clf.coef_ > 0.0
            negative = clf.coef_ < 0.0
            if n_classes == 2:
                # binary problems expose a single coefficient row that refers to
                # class 1; class 0 gets the mirrored signs
                pos_scores[1] += positive[0]
                neg_scores[1] += negative[0]
                pos_scores[0] += negative[0]
                neg_scores[0] += positive[0]
            else:
                pos_scores += positive
                neg_scores += negative

        # report completed iterations (also for failed fits) so that the bar
        # keeps moving and reaches 100% on the last iteration
        pbar.progress(round((i + 1) / num_iters, 1))

    # normalize
    pos_scores = pos_scores / num_iters
    neg_scores = neg_scores / num_iters

    # get only active features
    pos = _active_features(pos_scores, threshold, X_names, y_names)
    neg = _active_features(neg_scores, threshold, X_names, y_names)

    return pos, neg


def output_transform(pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]) -> DataFrame:
    """
    Combine positive and negative feature scores into a single DataFrame.

    Args:
        pos: ``(word, score, label)`` tuples for positively correlated features.
        neg: ``(word, score, label)`` tuples for negatively correlated features.

    Returns:
        A DataFrame with title-cased columns ``Word``, ``Score``, ``Label`` and
        ``Correlation``. Each half is sorted by label and score in descending
        order, positive rows first; the per-half index is kept as-is.
    """
    columns = ["word", "score", "label"]

    frames = []
    for rows, correlation in ((pos, "positive"), (neg, "negative")):
        frame = pd.DataFrame(rows, columns=columns).sort_values(["label", "score"], ascending=False)
        frame["correlation"] = correlation
        frames.append(frame)

    output = pd.concat(frames, ignore_index=False, axis=0)
    output.columns = output.columns.str.title()

    return output