waidhoferj committed
Commit aadb779 · 0 Parent(s)

first commit

.gitattributes ADDED
@@ -0,0 +1,6 @@
1
+ *.json filter=lfs diff=lfs merge=lfs -text
2
+ *.csv filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
6
+ *.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,135 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Model weights
132
+
133
+ figures
134
+ .DS_Store
135
+
README.md ADDED
@@ -0,0 +1,31 @@
1
+ ---
2
+ title: Major Matcher
3
+ emoji: 🎓
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ python_version: 3.10.8
8
+ sdk_version: 3.15.0
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
+ # Major Matcher
14
+
15
+ A tool for matching student interests to areas of study.
16
+
17
+ ## Getting Started
18
+
19
+ 1. Set up the Python environment:
20
+
21
+ ```
22
+ conda env create --file environment.yml
23
+ conda activate major-matcher
24
+ ```
25
+
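+ 2. Launch the Gradio demo locally (this should run the `demo().launch()` call at the bottom of `app.py`):
+
+ ```
+ python app.py
+ ```
+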
26
+ ## Project Layout
27
+
28
+ - `embeddings`: Sklearn-style transformers that encode natural language into latent embedding vectors.
29
+ - `classifiers`: Model architectures for classifying college majors.
30
+ - `test.py`: Evaluation and demo code for all models.
31
+ - `train.py`: Training loops for models.
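+ - `preprocessing`: Scripts that scrape the Cal Poly catalog and build the CSVs under `data/`.
+
+ As a minimal sketch of how these pieces fit together (assuming the CSVs read by `preprocessing/helper.py`, such as `course_sentences.csv`, are available on the working path), the embedders and classifiers follow the usual sklearn `fit`/`transform`/`predict` pattern:
+
+ ```
+ from embeddings.bert import BertSentenceEmbedder
+ from classifiers.mlp import MajorMlpClassifier
+ from preprocessing.helper import load_data
+
+ # Load (sentence, program-label) pairs and embed the sentences with BERT.
+ sentences, labels = load_data(num_majors=40)
+ embedder = BertSentenceEmbedder(device="cpu")
+ embeddings = embedder.transform(sentences)
+
+ # Train the MLP head on the embeddings, then suggest majors for a new interest.
+ classifier = MajorMlpClassifier(device="cpu", epochs=200)
+ classifier.fit(embeddings, labels)
+ print(classifier.predict(embedder.transform(["I really enjoy spending time with animals."])))
+ ```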
app.py ADDED
@@ -0,0 +1,55 @@
1
+ import gradio as gr
2
+ from classifiers.bert import BertClassifier
3
+ import os
4
+ import numpy as np
5
+ from functools import cache
6
+ from preprocessing.helper import get_recommendations
7
+
8
+ CONFIG_FILE = os.path.join("weights", "bert_classifier_deployment_weights")
9
+ N_SUGGESTIONS = 3
10
+
11
+
12
+ @cache
13
+ def get_model(config_path: str) -> BertClassifier:
14
+ bert_classifier = BertClassifier(device="cpu")  # CPU for the hosted Space; use "mps"/"cuda" locally if available
15
+ bert_classifier.load_weights(config_path)
16
+ return bert_classifier
17
+
18
+
19
+ def predict(interests: str) -> dict[str, float]:
20
+ bert_classifier = get_model(CONFIG_FILE)
21
+ probs = bert_classifier.predict_proba(interests)
22
+ labels = np.array(bert_classifier.labels)
23
+ results_mask = (-probs).argsort(-1)[:,:N_SUGGESTIONS]
24
+ suggested_majors = labels[results_mask][0].tolist()
25
+ confidences = probs[0][results_mask[0]]
26
+ confidences /= confidences.sum()
27
+ confidences = confidences.tolist()
28
+ return dict(zip(suggested_majors, confidences))
29
+
30
+
31
+ def demo():
32
+ title = "Major Matcher"
33
+ description = "Describe your interests and the model will suggest a compatible college major."
34
+ example_interests = [
35
+ "I really enjoy spending time with animals.",
36
+ "I like playing music and dancing.",
37
+ "A good book makes me happy."
38
+ ]
39
+
40
+ app = gr.Interface(
41
+ title=title,
42
+ description=description,
43
+ inputs=gr.TextArea(
44
+ label="Describe your interests",
45
+ placeholder="I really enjoy..."
46
+ ),
47
+ fn=predict,
48
+ outputs=gr.Label(label="Suggested Majors"),
49
+ examples=example_interests
50
+ )
51
+ return app
52
+
53
+
54
+ if __name__ == "__main__":
55
+ demo().launch()
classifiers/bert.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import pipeline
5
+ from sklearn.base import BaseEstimator, ClassifierMixin
6
+ import numpy as np
7
+ from typing import List, Tuple
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.utils.class_weight import compute_class_weight
10
+ from transformers import AutoTokenizer
11
+ from transformers import DataCollatorWithPadding
12
+ from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
13
+ from torch.utils.data import Dataset
14
+ from pathlib import Path
15
+ import json
16
+ from numpy.typing import NDArray
17
+
18
+ class BertClassifier(BaseEstimator, ClassifierMixin):
19
+ def __init__(self, seed=42, epochs=5, device="cpu"):
20
+ super().__init__()
21
+ self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
22
+ self.seed = seed
23
+ self.epochs = epochs
24
+ self.model = None
25
+ self.labels = None
26
+ self.device=device
27
+
28
+ def _get_classes(self, y: List[str]) -> Tuple[List[int], List[str]]:
29
+ labels = sorted(set(y))
30
+ ids = [i for i in range(len(labels))]
31
+ return ids, labels
32
+
33
+ def _compute_metrics(self,eval_pairs):
34
+ logits, labels = eval_pairs
35
+ n = 3
36
+ ordered_choices = (-logits).argsort(-1)[:,:n]
37
+ metrics = {}
38
+ metrics["top_n_accuracy"] = np.mean([label in choices for label, choices in zip(labels, ordered_choices)])
39
+ metrics["accuracy"] = np.mean(labels == ordered_choices[:,0])
40
+ return metrics
41
+
42
+
43
+
44
+ def load_weights(self, path:str):
45
+ self.model = AutoModelForSequenceClassification.from_pretrained(
46
+ path).to(self.device)
47
+ self.labels = list(self.model.config.label2id.keys())
48
+
49
+ def _tokenize(self, texts:List[str]) -> torch.Tensor:
50
+ return self.tokenizer(texts, padding=True,
51
+ truncation=True,
52
+ max_length=100,
53
+ return_tensors="pt").to(self.device)
54
+
55
+
56
+
57
+ def fit(self, X:List[str], y:List[str]):
58
+ ids, labels = self._get_classes(y)
59
+ self.labels = labels
60
+ id2label = dict(zip(ids,labels))
61
+ label2id = dict(zip(labels,ids))
62
+ X = self._tokenize(X)
63
+ dataset = [{"input_ids": text, "label": label2id[label]} for text, label in zip(X["input_ids"],y)]
64
+ train_ds, test_ds = train_test_split(dataset, shuffle=True, random_state=self.seed, train_size=0.85)
65
+ batch_size = 64
66
+
67
+ model = AutoModelForSequenceClassification.from_pretrained(
68
+ "distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id
69
+ ).to(self.device)
70
+ weights_path="weights/bert_classifier"
71
+ training_args = TrainingArguments(
72
+ output_dir=weights_path,
73
+ learning_rate=2e-5,
74
+ per_device_train_batch_size=batch_size,
75
+ per_device_eval_batch_size=batch_size,
76
+ num_train_epochs=self.epochs,
77
+ weight_decay=0.01,
78
+ evaluation_strategy="epoch",
79
+ save_strategy="epoch",
80
+ load_best_model_at_end=True,
81
+ push_to_hub=False,
82
+ use_mps_device=self.device=="mps"
83
+ )
84
+
85
+ class_weights = torch.Tensor()
86
+
87
+ trainer = WeightedTrainer(
88
+ class_ids=ids,
89
+ model=model,
90
+ args=training_args,
91
+ train_dataset=train_ds,
92
+ eval_dataset=test_ds,
93
+ tokenizer=self.tokenizer,
94
+ compute_metrics=self._compute_metrics
95
+ )
96
+
97
+ trainer.train()
98
+ model.eval()
99
+ self.model = model
100
+
101
+
102
+
103
+ def predict_proba(self, X:List[str]) -> NDArray:
104
+ if self.model is None:
105
+ raise Exception("Fit the model before inference.")
106
+ tokens = self._tokenize(X)
107
+ with torch.no_grad():
108
+ logits = self.model(**tokens).logits
109
+ return F.softmax(logits, -1).cpu().numpy()
110
+
111
+
112
+ def predict(self, X:List[str])-> List[str]:
113
+ preds = self.predict_proba(X)
114
+ return [self.labels[i] for i in preds.argmax(-1)]
115
+
116
+
117
+
118
+
119
+ class WeightedTrainer(Trainer):
120
+
121
+ def __init__(self,class_ids, train_dataset, *args, **kwargs):
122
+ super().__init__(train_dataset=train_dataset, *args,**kwargs)
123
+ y_train = [y["label"] for y in train_dataset]
124
+ class_weights = compute_class_weight("balanced", classes=class_ids, y=y_train).astype("float32")
125
+ class_weights = torch.from_numpy(class_weights).to(self.args.device.type)
126
+ self.criteria = nn.CrossEntropyLoss(weight=class_weights)
127
+
128
+ def compute_loss(self, model, inputs, return_outputs=False):
129
+ labels = inputs.get("labels")
130
+ # forward pass
131
+ outputs = model(**inputs)
132
+ logits = outputs.get("logits")
133
+ loss = self.criteria(logits.view(-1, self.model.config.num_labels), labels.view(-1))
134
+ return (loss, outputs) if return_outputs else loss
135
+
136
+
137
+
classifiers/mlp.py ADDED
@@ -0,0 +1,156 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from sklearn.base import BaseEstimator, ClassifierMixin
5
+ import numpy as np
6
+ from typing import List, Tuple
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.utils.class_weight import compute_class_weight
10
+ import json
11
+ import os
12
+
13
+ class MajorMlpClassifier(BaseEstimator, ClassifierMixin):
14
+ def __init__(self, device="cpu", seed=42, epochs=200, patience:int=None):
15
+ super().__init__()
16
+ self.device = device
17
+ self.seed = seed
18
+ self.model = None
19
+ self.epochs = epochs
20
+ self.patience = patience if patience is not None else epochs
21
+ self.class_labels = None
22
+
23
+
24
+ def _preprocess_features(self, X: np.ndarray) -> np.ndarray:
25
+ return torch.from_numpy(X).to(self.device)
26
+
27
+ def _preprocess_labels(self, y: List[str]) -> np.ndarray:
28
+ unique_labels = np.array(self._get_classes(y))
29
+ one_hot = np.array([
30
+ unique_labels == label
31
+ for label in y
32
+ ], dtype="float32")
33
+
34
+ return torch.from_numpy(one_hot).to(self.device)
35
+
36
+ def _get_classes(self, y: List[str]) -> List[str]:
37
+ return sorted(set(y))
38
+
39
+ def fit(self, X:np.ndarray, y:List[str]):
40
+ """
41
+ Args:
42
+ X: embeddings of shape (n_sentences, embedding_size)
43
+ y: program labels that match with each sentence
44
+ """
45
+ self.class_labels = np.array(self._get_classes(y))
46
+ class_weights = compute_class_weight("balanced", classes=self.class_labels, y=y).astype("float32")
47
+ class_weights = torch.from_numpy(class_weights).to(self.device)
48
+ X, y = self._preprocess_features(X), self._preprocess_labels(y)
49
+ x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=self.seed, shuffle=True)
50
+ should_stop = EarlyStopping(self.patience)
51
+ val_loss = np.inf
52
+ model = ProgramClassifierNetwork(x_train.shape[1], y_train.shape[1])
53
+ model = model.to(self.device)
54
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
55
+ criterion = nn.CrossEntropyLoss(weight=class_weights)
56
+ epoch = 0
57
+ while not should_stop.step(val_loss) and epoch < self.epochs:
58
+ preds = model(x_train)
59
+ loss = criterion(preds, y_train)
60
+ optimizer.zero_grad()
61
+ loss.backward()
62
+ optimizer.step()
63
+ with torch.no_grad():
64
+ val_preds = model(x_val)
65
+ val_loss = criterion(val_preds, y_val).item()
66
+ epoch += 1
67
+ model.eval()
68
+ self.model = model
69
+
70
+ def predict_proba(self, X:np.ndarray) -> np.ndarray:
71
+ X = self._preprocess_features(X)
72
+ if self.model is None:
73
+ raise Exception("Train model with fit() before predicting.")
74
+ with torch.no_grad():
75
+ logits = self.model(X)
76
+ return F.softmax(logits, dim=-1).cpu().numpy()
77
+
78
+ def predict(self, X:np.ndarray) -> List[str]:
79
+ """
80
+ Args:
81
+ X: embeddings of shape (n_sentences, embedding_size)
82
+ Returns:
83
+ predicted classes for each embedding
84
+ """
85
+ pred_i = self.predict_proba(X).argmax(-1)
86
+ return self.class_labels[pred_i]
87
+
88
+ def save_weights(self,path:str):
89
+ os.makedirs(path, exist_ok=True)
90
+ weights_path = os.path.join(path, "weights.pt")
91
+ config_path = os.path.join(path,"config.json")
92
+ torch.save(self.model.state_dict(), weights_path)
93
+ state = {
94
+ "device": self.device,
95
+ "seed": self.seed,
96
+ "epochs": self.epochs,
97
+ "patience": self.patience,
98
+ "class_labels": list(self.class_labels)
99
+ }
100
+ with open(config_path, "w") as f:
101
+ json.dump(state, f)
102
+
103
+
104
+ def load_weights(self, path:str):
105
+ weights_path = os.path.join(path, "weights.pt")
106
+ config_path = os.path.join(path,"config.json")
107
+ state_dict = torch.load(weights_path)
108
+ input_size = int(state_dict["input_size"].item())
109
+ n_classes = int(state_dict["n_classes"].item())
110
+ model = ProgramClassifierNetwork(input_size,n_classes).to(self.device)
111
+ model.load_state_dict(state_dict)
112
+ model.eval()
113
+ self.model = model
114
+ with open(config_path, "r") as f:
115
+ config = json.load(f)
116
+ config["class_labels"] = np.array(config["class_labels"]) if config["class_labels"] is not None else None
117
+ self.__dict__.update(config)
118
+
119
+
120
+
121
+
122
+
123
+ class ProgramClassifierNetwork(nn.Module):
124
+ def __init__(self, input_size:int, n_classes:int) -> None:
125
+ super().__init__()
126
+ self.input_size = nn.Parameter(torch.Tensor([input_size]), requires_grad=False)
127
+ self.n_classes = nn.Parameter(torch.Tensor([n_classes]), requires_grad=False)
128
+ self.classifier = nn.Sequential(
129
+ nn.BatchNorm1d(input_size),
130
+ nn.Linear(input_size, 512),
131
+ nn.ReLU(),
132
+ nn.Linear(512, 256),
133
+ nn.ReLU(),
134
+ nn.Linear(256, 128),
135
+ nn.ReLU(),
136
+ nn.Linear(128, n_classes),
137
+ )
138
+
139
+
140
+ def forward(self,x):
141
+ return self.classifier(x)
142
+
143
+ class EarlyStopping:
144
+ def __init__(self, patience=0):
145
+ self.patience = patience
146
+ self.last_measure = np.inf
147
+ self.consecutive_increase = 0
148
+
149
+ def step(self, val) -> bool:
150
+ if self.last_measure <= val:
151
+ self.consecutive_increase +=1
152
+ else:
153
+ self.consecutive_increase = 0
154
+ self.last_measure = val
155
+
156
+ return self.patience < self.consecutive_increase
data/course_sentences.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65fb25d9fdb21cf3bee8894bb5a364d1e82501994fa80c517ca8d6b449ef195c
3
+ size 366591
data/courses.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3b6db7c601b4dd5e618305d19c69aab4443201d3d3f9eaecba71848188a627
3
+ size 2057639
data/program_courses.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac10e70cb89c2bea821a06b5789acaf7f0bdf6fd25effacdb5f62439740c7a05
3
+ size 778110
data/program_descriptions.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73be736b1119baa9a4f7e4beeb40a91f57044a63a514fce5ca21c6a128185c76
3
+ size 15395
data/stopwords/course_prefixes.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12b360d88612aa019620e1ac37dbe370d56cc9f706dd57937a97b4167777dc1
3
+ size 445
data/stopwords/invalid_description_phrases.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3c1c6990d131626c39fe30ef03c2ef60f0800262441a1c11de6845f1d4bd36e
3
+ size 258
data/stopwords/other_words.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89c5d0e1b455e5f489f34e362e7b9b6ebdd3a78bcf5b88c43cbd6421093d82b
3
+ size 1417
embeddings/bert.py ADDED
@@ -0,0 +1,65 @@
1
+ import torch
2
+ from transformers import BertModel, BertTokenizer
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.neighbors import KNeighborsClassifier
7
+ class BertSentenceEmbedder(BaseEstimator, TransformerMixin):
8
+ def __init__(self, device="cpu",padding_length=50):
9
+ """
10
+ Args:
11
+ `device`: pytorch device for inference. Either 'cpu' or a specific type of GPU.
12
+ `padding_length`: The max sentence token length. Shorter sentences are padded to this length.
13
+ """
14
+ self._device = device
15
+ self._tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
16
+ model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
17
+ self._model = model.to(device)
18
+ self._model.eval()
19
+ self._padding_length = padding_length
20
+
21
+ def transform(self, X:list) -> np.ndarray:
22
+ """
23
+ Transforms sentences into embeddings
24
+
25
+ Args:
26
+ `X`: a dataset of sentences of shape (n_sentences,)
27
+ Returns:
28
+ Embeddings of the provided sentences of shape (n_sentences, embedding_dims)
29
+ """
30
+ tokens = self._tokenizer(
31
+ X,
32
+ return_token_type_ids=False,
33
+ return_attention_mask=False,
34
+ padding=True,
35
+ truncation=True,
36
+ max_length=self._padding_length,
37
+ return_tensors="pt"
38
+ )
39
+ tokens = tokens["input_ids"].to(self._device)
40
+ with torch.no_grad():
41
+ hidden_states = self._model(
42
+ input_ids=tokens,
43
+ output_hidden_states=True
44
+ )["hidden_states"]
45
+ embeddings = torch.cat(hidden_states[-4:], dim=-1)
46
+ embeddings = torch.mean(embeddings, dim=1)
47
+ return embeddings.cpu().numpy()
48
+
49
+
50
+
51
+ if __name__ == "__main__":
52
+ df = pd.read_csv("course_sentences.csv")
53
+ embedder = BertSentenceEmbedder("mps", padding_length=1000)
54
+ embeddings = embedder.transform(list(df["sentence"]))
55
+ labels = df["program"]
56
+ classifier = KNeighborsClassifier(n_neighbors=10)
57
+ classifier.fit(embeddings, labels)
58
+ num_suggestions = 10
59
+
60
+ prompt = "Covers methods currently available to address complexity, including systems thinking, model based systems engineering and life cycle governance."
61
+ embedding = embedder.transform([prompt])
62
+ probs = classifier.predict_proba(embedding)[0]
63
+ idx = np.argsort(-probs)[:num_suggestions]
64
+ label_map = np.array(sorted(set(labels)))
65
+ print(prompt, label_map[idx], probs[idx])
embeddings/word2vec.py ADDED
@@ -0,0 +1,99 @@
1
+
2
+ from typing import List
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ import numpy as np
5
+ from glob import iglob
6
+ import pandas as pd
7
+ from sklearn.neighbors import KNeighborsClassifier
8
+ from gensim.models import Word2Vec
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.corpus import stopwords
11
+ from functools import cache
12
+
13
+
14
+
15
+
16
+ class Word2VecEmbedder(BaseEstimator, TransformerMixin):
17
+ def __init__(self, vector_size=100):
18
+ self.model = None
19
+ self.stop_words = get_stopwords()
20
+ self.vector_size = vector_size
21
+
22
+ def _preprocess(self, text:str) -> List[str]:
23
+ words = word_tokenize(text)
24
+ only_keywords = [word for word in words
25
+ if word not in self.stop_words
26
+ and word.isalpha()]
27
+ return only_keywords
28
+
29
+ def fit(self, sentences:List[str]):
30
+ sentences = [self._preprocess(t) for t in sentences]
31
+ self.model = Word2Vec(sentences, vector_size=self.vector_size, window=5, min_count=1, workers=4)
32
+
33
+
34
+ def transform(self, X:List[str]) -> List[List[np.ndarray]]:
35
+ if self.model is None:
36
+ raise Exception("fit model before transforming")
37
+ sents = map(self._preprocess,X)
38
+ def get_embedding(word):
39
+ try:
40
+ return self.model.wv[word]
41
+ except:
42
+ return np.zeros((self.vector_size,))
43
+
44
+ return [[get_embedding(word) for word in sent] for sent in sents]
45
+
46
+ def latent_distance(self,text1:str, text2:str) -> float:
47
+ first_tokens, second_tokens = self.transform([text1])[0], self.transform([text2])[0]
48
+ sum_dist = 0.0
49
+ for t1 in first_tokens:
50
+ for t2 in second_tokens:
51
+ sum_dist += np.sum((t2-t1)**2)**0.5
52
+ return sum_dist / float(len(first_tokens) * len(second_tokens))
53
+
54
+ @cache
55
+ def get_stopwords():
56
+ words = set()
57
+ for stop_file in iglob("stopwords/*.txt"):
58
+ with open(stop_file, "r") as f:
59
+ words.update(l.lower() for l in f.readlines())
60
+ return set(stopwords.words('english')) | words
61
+
62
+
63
+
64
+ def test_latent_dist():
65
+ df = pd.read_csv("course_sentences.csv")
66
+ embedder = Word2VecEmbedder()
67
+ sentences = list(df["sentence"])
68
+ embedder.fit(sentences)
69
+
70
+ show_dist = lambda s1, s2: print(s1 + "\n", s2 + "\n", embedder.latent_distance(s1,s2))
71
+ show_dist(*(["This is the same sentence."] * 2))
72
+ show_dist("artificial intelligence is my passion", "I really enjoy computer science")
73
+ show_dist("artificial intelligence is my passion", "I really enjoy archeology")
74
+
75
+
76
+
77
+
78
+
79
+ def test_pipeline():
80
+ df = pd.read_csv("course_sentences.csv")
81
+ embedder = Word2VecEmbedder()
82
+ sentences = list(df["sentence"])
83
+ embedder.fit(sentences)
84
+ embeddings = embedder.transform(sentences)
85
+ labels = df["program"]
86
+ classifier = KNeighborsClassifier(n_neighbors=10)
87
+ classifier.fit(embeddings, labels)
88
+ num_suggestions = 10
89
+
90
+ prompt = "Covers methods currently available to address complexity, including systems thinking, model based systems engineering and life cycle governance."
91
+ embedding = embedder.transform([prompt])
92
+ probs = classifier.predict_proba(embedding)[0]
93
+ idx = np.argsort(-probs)[:num_suggestions]
94
+ label_map = np.array(sorted(set(labels)))
95
+ print(prompt, label_map[idx], probs[idx])
96
+
97
+
98
+ if __name__ == "__main__":
99
+ test_latent_dist()
environment.yml ADDED
@@ -0,0 +1,21 @@
1
+ name: major-matcher
2
+ channels:
3
+ - anaconda
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10
7
+ - pytorch
8
+ - torchvision
9
+ - matplotlib
10
+ - numpy
11
+ - pandas
12
+ - transformers
13
+ - ipykernel
14
+ - scikit-learn
15
+ - beautifulsoup4
16
+ - nltk
17
+ - requests
18
+ - seaborn
19
+ - pip:
20
+ - gradio
21
+ - wakepy
explore.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing/build_data_dict.py ADDED
@@ -0,0 +1,135 @@
1
+ import collections
2
+ import pandas as pd
3
+ import course_scraper
4
+ from nltk.corpus import stopwords
5
+ import string
6
+ import re
7
+
8
+ DESCRIPTION = "Description"
9
+ COURSE_PREFIX = "Course Prefix"
10
+
11
+ words_to_remove = ["lectures", "per", "two", "and/or", "``", "''", "laboratory", "course", "courses", "work",
12
+ "students", "units", "total", "selected", "may", "major", "'s", "quarter", "and/or", "report", "undergraduate", "format",
13
+ "laboratory", "limited", "topics", "fulfills", "including", "topic", "catalogs", "list", "earlier", "overview", "impact",
14
+ "required", "open", "study", "class", "grading", "credit/no", "individual", "kine", "new", "within", "offered",
15
+ "laboratories", "include", "use", "using", "used", "basic", "student", "current", "related", "practice",
16
+ "online", "examination", "formal", "quality", "one", "time", "must", "maximum", "hours", "effects"]
17
+ ge_areas = ["a", "b", "c", "d", "e", "f",
18
+ "area", "areas", "uscp", "upper-division"]
19
+ year = ["2017-19", "2019-20"]
20
+
21
+ stopwords_to_remove = ["ge", "credit", "class", "topics", "course", "following", "student", "units", "section", "study", "k", "unit", "week", "used",
22
+ "division", "catalogs", "graduate", "selected", "courses", "may", "majors", "format", "emphasis", "area", "hours", "emphasized",
23
+ "non", "based", "application", "applications", "classroom", "introduction", "students", "crosslisted", "focus", "methods", "completion",
24
+ "required", "implementation", "u", "better", "part", "fields", "completed", "taken", "well", "grade", "present", "basic", "etc"
25
+ "graduates", "variety", "context", "presented", "instruction", "quarter", "projects", "meet", "fulfills", "enroll", "enrollment",
26
+ "requirement", "studies", "surveys", "planning", "discussion", "assessment", "role", "field", "preparation", "principles", "evaluation",
27
+ "techniques", "selection", "practices", "concepts", "faculty", "theories", "issues", "paid", "usually", "quarters", "independent",
28
+ "fundamentals", "project", "senior"]
29
+
30
+
31
+ def generate_ge_prefixes():
32
+ letters = ["a", "b", "c", "d", "e", "f"]
33
+ numbers = list(range(1, 8))
34
+
35
+ pairs = []
36
+ for letter in letters:
37
+ for num in numbers:
38
+ pairs.append(letter + str(num))
39
+ return pairs
40
+
41
+
42
+ ge_prefixes = generate_ge_prefixes()
43
+ filter_set = set(stopwords.words('english'))
44
+ filter_set.update(string.punctuation, words_to_remove, stopwords_to_remove,
45
+ ge_areas, year, ge_prefixes)
46
+
47
+
48
+ def preprocess(text):
49
+ text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
50
+ output = re.sub(r'\d+', '', text_input)
51
+ return output.lower().strip()
52
+
53
+
54
+ def clean_text(text):
55
+ # add spaces and replace leading "and" or "&"
56
+ return re.sub('^(and|&)', '', text.replace('\xa0', " ")).strip()
57
+
58
+
59
+ def remove_stopwords(text):
60
+ filtered_words = [word.lower()
61
+ for word in text.split() if word.lower() not in filter_set]
62
+ return " ".join(filtered_words)
63
+
64
+
65
+ def build_word_course_dict():
66
+ df = pd.read_csv(course_scraper.FILE_NAME)
67
+ df[DESCRIPTION] = df[DESCRIPTION].map(preprocess)
68
+ df[DESCRIPTION] = df[DESCRIPTION].map(remove_stopwords)
69
+
70
+ word_course_dict = collections.defaultdict(list)
71
+
72
+ for index, row in df.iterrows():
73
+ description = row[DESCRIPTION]
74
+ prefix = row[COURSE_PREFIX]
75
+
76
+ prefixes = format_course_prefixes(prefix)
77
+
78
+ for word in description.split(' '):
79
+ word_course_dict[word] += prefixes
80
+
81
+ return word_course_dict
82
+
83
+
84
+ def format_course_prefixes(prefix_str: str):
85
+ # Returns a list of course prefixes
86
+ formatted_prefixes = []
87
+
88
+
89
+ # one course listed
90
+ if "/" not in prefix_str:
91
+ formatted_prefixes.append(prefix_str.replace(" ", "-"))
92
+ return formatted_prefixes
93
+
94
+ # multiple courses
95
+ split_prefixes = re.split('/| ', prefix_str)
96
+
97
+ course_number_count = len([e for e in split_prefixes if e.isdigit()])
98
+
99
+ if course_number_count == 1:
100
+ # crosslisted courses with different depts, same number (HIST/HNRS 335)
101
+ course_num = split_prefixes[-1]
102
+ for prefix in split_prefixes[:-1]:
103
+ formatted_prefixes.append(f'{prefix}-{course_num}')
104
+ return formatted_prefixes
105
+ else:
106
+ # crosslisted courses with different numbers (HNRS 304/ISLA 303)
107
+ for i in range(0,len(split_prefixes)-1,2):
108
+ prefix = split_prefixes[i]
109
+ course_num = split_prefixes[i+1]
110
+ formatted_prefixes.append(f'{prefix}-{course_num}')
111
+ return formatted_prefixes
112
+
113
+ def build_course_program_dict():
114
+ df = pd.read_csv("program_courses.csv")
115
+ df["Program"] = df["Program"].map(clean_text)
116
+
117
+ program_course_dict = collections.defaultdict(list)
118
+ for index, row in df.iterrows():
119
+ program = row["Program"]
120
+ course_prefix = row["Course Prefix"]
121
+
122
+ # handle mulitple prefixes EX. CPE/CSC 123
123
+ prefixes = format_course_prefixes(course_prefix)
124
+ # print(program, course_prefix, prefixes)
125
+ for prefix in prefixes:
126
+ program_course_dict[prefix] += [program]
127
+
128
+ return program_course_dict
129
+
130
+
131
+ if __name__ == "__main__":
132
+ d = build_word_course_dict()
133
+ program_course_dict = build_course_program_dict()
134
+ print(d)
135
+ # print(program_course_dict)
preprocessing/course_scraper.py ADDED
@@ -0,0 +1,213 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ from typing import List, Tuple
5
+
6
+ FILE_NAME: str = "courses.csv"
7
+ MATRIX_FILE_NAME: str = "catalog_matrix.csv"
8
+
9
+ BASE_COURSE_CATALOG_URL: str = "https://catalog.calpoly.edu/coursesaz/"
10
+ BASE_COLLEGES_DEPT_URL: str = "https://catalog.calpoly.edu/coursesaz/#courseprefixestext"
11
+
12
+
13
+ def getTextWithinParentheses(text: str):
14
+ return text[text.find('(')+1:text.find(')')]
15
+
16
+
17
+ def clean_link_text(text):
18
+ return text.replace(')', "").replace('(', "").strip()
19
+
20
+
21
+ def parse_college_html(college_html) -> List[Tuple[str, List[str]]]:
22
+ depts = []
23
+ college_dept_pairs = []
24
+ current_college = None
25
+ for child in college_html.children:
26
+ tag = child.name
27
+ text = child.text
28
+ # dept
29
+ if(tag == 'a'):
30
+ depts.append(clean_link_text(text))
31
+ # college
32
+ elif(tag == 'strong'):
33
+ if(current_college is not None):
34
+ college_dept_pairs.append([current_college, depts])
35
+ depts = []
36
+ current_college = text.strip()
37
+ college_dept_pairs.append((current_college, depts))
38
+ return college_dept_pairs
39
+
40
+
41
+ def parse_college_department_html(html) -> List[str]:
42
+ depts = []
43
+ for dept_html in html.children:
44
+ if(dept_html.name is not None):
45
+ dept = dept_html.text.split('(')[0].strip()
46
+ prefixes = [a.text.replace(')', "")
47
+ for a in dept_html.find_all("a")]
48
+ depts.append((dept, prefixes))
49
+ return depts
50
+
51
+
52
+ # iterates through all depts of a college
53
+ # {
54
+ # 'Biological Science': [BIO, BOT MCRO, MSCI],
55
+ # 'Chemistry and Biochemistry': [CHEM]
56
+ # ...
57
+ # }
58
+ def create_depts_dict(depts):
59
+ d = {}
60
+ for dept in depts:
61
+ d |= {
62
+ dept[0]: dept[1]
63
+ }
64
+ return d
65
+
66
+
67
+ def scrape_course_prefixes():
68
+ DEPT_PREFIXES_ID: str = "courseprefixestextcontainer"
69
+ course_page = requests.get(BASE_COLLEGES_DEPT_URL)
70
+ soup = BeautifulSoup(course_page.content, "html.parser")
71
+
72
+ colleges_html = soup.find(id=DEPT_PREFIXES_ID).div
73
+ college_dept_dict = {}
74
+ current_college = None
75
+ depts = []
76
+
77
+ for child in colleges_html.children:
78
+ tag = child.name
79
+ if (tag == "ul"): # nested departments
80
+ depts = parse_college_department_html(child)
81
+ elif (tag == 'p'): # college
82
+ college_dept_pairs = parse_college_html(child)
83
+ # more than one, so add all but the last to dictionary
84
+ if(len(college_dept_pairs) > 1):
85
+ # add current college and departments
86
+ college_dept_dict |= {
87
+ current_college[0]: {
88
+ current_college[0]: current_college[1]
89
+ }
90
+ }
91
+ college_dept_dict[current_college[0]
92
+ ] |= create_depts_dict(depts)
93
+ # add all other colleges except for the last one
94
+ for i in range(len(college_dept_pairs)-1):
95
+ # add colleges with no departments
96
+ college_dept_dict |= {
97
+ college_dept_pairs[i][0]: {
98
+ college_dept_pairs[i][0]: college_dept_pairs[i][1]
99
+ }
100
+ }
101
+ # set current college to last in pairs
102
+ current_college = college_dept_pairs[-1]
103
+ else:
104
+ if (current_college is not None):
105
+ # add current college and departments
106
+ college_dept_dict |= {
107
+ current_college[0]: {
108
+ current_college[0]: current_college[1]
109
+ }
110
+ }
111
+ # add college departments
112
+ college_dept_dict[current_college[0]
113
+ ] |= create_depts_dict(depts)
114
+ # set new current college
115
+ current_college = college_dept_pairs[0]
116
+
117
+ # add last current college
118
+ college_dept_dict |= {
119
+ current_college[0]: {
120
+ current_college[0]: current_college[1]
121
+ }
122
+ }
123
+ # add college departments
124
+ college_dept_dict[current_college[0]
125
+ ] |= create_depts_dict(depts)
126
+
127
+ return college_dept_dict
128
+
129
+
130
+ def extract_course_info(data, college, dept, prefix):
131
+ prefix = prefix.lower()
132
+ url = f'{BASE_COURSE_CATALOG_URL}/{prefix}'
133
+ page = requests.get(url)
134
+
135
+ # scrape data
136
+ soup = BeautifulSoup(page.content, "html.parser")
137
+ courses = soup.find_all("div", class_="courseblock")
138
+
139
+ if (college == dept):
140
+ dept = f'{dept} Dept'
141
+
142
+ for c in courses:
143
+ course_name: List[str] = c.find(
144
+ "p", class_="courseblocktitle").strong.contents[0].split(".")
145
+ course_num: str = course_name[0].replace(
146
+ "\xa0", "-").strip() # replace nonbreaking space
147
+ name: str = course_name[1].strip()
148
+ units: str = c.find("span", class_="courseblockhours").text.strip()
149
+ description: str = c.find(
150
+ "div", class_="courseblockdesc").p.text.strip()
151
+ data.append([college, dept, course_num,
152
+ name, units, description, college+dept, dept+course_num])
153
+
154
+
155
+ def scrape_courses(prefixes_dict):
156
+ data = []
157
+ for college in prefixes_dict.keys():
158
+ for dept in prefixes_dict[college]:
159
+ for prefix in prefixes_dict[college][dept]:
160
+ if(prefix):
161
+ extract_course_info(data, college, dept, prefix)
162
+ return data
163
+
164
+
165
+ def build_df(data):
166
+ column_names = ["College", "Dept", "Course Prefix",
167
+ "Course Name", "Units", "Description", "College+Dept", "Dept+CourseNum"]
168
+ df = pd.DataFrame(data, columns=column_names)
169
+ return df
170
+
171
+
172
+ def find_match(course_list, matrix):
173
+ for row in matrix.index:
174
+ for col in matrix.columns:
175
+ match = not(course_list[(course_list['College+Dept'] == row+col)
176
+ ].empty) or not(course_list[(course_list['Dept+CourseNum'] == row+col)].empty)
177
+ if(match):
178
+ matrix.loc[row, col] = 1
179
+ # print(row, col)
180
+ return
181
+
182
+
183
+ def build_adj_matrix(course_list: pd.DataFrame):
184
+ colleges = list(course_list["College"].unique())
185
+ depts = list(course_list["Dept"].unique())
186
+ courses = list(course_list["Course Prefix"].unique())
187
+
188
+ indices = [(1, college) for college in colleges] + \
189
+ [(2, dept) for dept in depts] + [(3, course) for course in courses]
190
+
191
+ multi_index = pd.MultiIndex.from_tuples(indices)
192
+ adj_matrix = pd.DataFrame(index=multi_index, columns=multi_index).fillna(0)
193
+
194
+ # grab necessary sections
195
+ colleges_to_depts = adj_matrix.loc[1, 2]
196
+ depts_to_courses = adj_matrix.loc[2, 3]
197
+
198
+ # mark matches
199
+ find_match(course_list, colleges_to_depts)
200
+ find_match(course_list, depts_to_courses)
201
+ return adj_matrix
202
+
203
+
204
+ if __name__ == "__main__":
205
+ prefixes_dict = scrape_course_prefixes()
206
+ courses = scrape_courses(prefixes_dict)
207
+ course_list = build_df(courses)
208
+ course_list.to_csv(FILE_NAME, index=False)
209
+
210
+ # course_list = pd.read_csv(FILE_NAME)
211
+
212
+ # adj_matrix = build_adj_matrix(course_list)
213
+ # adj_matrix.to_csv(MATRIX_FILE_NAME)
preprocessing/format_input_data.py ADDED
@@ -0,0 +1,48 @@
1
+
2
+ import pandas as pd
3
+ from nltk.tokenize import sent_tokenize
4
+ from build_data_dict import build_course_program_dict
5
+
6
+ OUTPUT_FILE = "course_sentences.csv"
7
+
8
+ invalid_phrases = [line.rstrip('\n') for line in open(
9
+ 'stopwords/invalid_description_phrases.txt')] # Load .txt file line by line
10
+
11
+ def is_valid_sentence(sentence):
12
+ if sentence == "":
13
+ return False
14
+ return all(phrase not in sentence.lower() for phrase in invalid_phrases)
15
+
16
+ if __name__ == "__main__":
17
+ courses_df = pd.read_csv("courses.csv")
18
+ course_program_dict = build_course_program_dict()
19
+
20
+ rows = []
21
+ for course, programs in course_program_dict.items():
22
+ # only capture unique courses
23
+ if (len(programs) > 1):
24
+ continue
25
+
26
+ course_row = courses_df.loc[courses_df['Course Prefix'] == course]
27
+
28
+ if(len(course_row["Description"].values) == 0):
29
+ continue
30
+
31
+ course_description = course_row["Description"].values[0]
32
+ sentences = sent_tokenize(course_description)
33
+ sentences = [sentence.strip() for sentence in sentences if is_valid_sentence(sentence)]
34
+
35
+
36
+ # if a course belongs to more than one program, use the department as the program
37
+ if len(programs) > 1:
38
+ dept = course_row["Dept"].values[0]
39
+ for sentence in sentences:
40
+ rows.append([sentence, course, dept])
41
+ continue
42
+ else:
43
+ for program in programs:
44
+ for sentence in sentences:
45
+ rows.append([sentence, course, program])
46
+
47
+ output_df = pd.DataFrame(rows, columns=["sentence", "course", "program"])
48
+ output_df.to_csv(OUTPUT_FILE, index=False)
preprocessing/helper.py ADDED
@@ -0,0 +1,57 @@
1
+ import re
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import Tuple, List
5
+ from sklearn.metrics import confusion_matrix
6
+ import seaborn as sns
7
+
8
+ PROGRAM = "Program"
9
+
10
+
11
+ def clean_text(text):
12
+ text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
13
+ output = re.sub(r'\d+', '', text_input)
14
+ return output.lower().strip()
15
+
16
+
17
+ def get_num_courses_per_program():
18
+ df = pd.read_csv('program_courses.csv')
19
+ return df.groupby([PROGRAM])[PROGRAM].count()
20
+
21
+
22
+ def load_data(num_majors=20, include_majors=[]) -> Tuple[List[str], np.ndarray]:
23
+ """
24
+ Loads and preprocesses `course_sentences` data.
25
+ """
26
+ courses = pd.read_csv("course_sentences.csv").drop(["course"], axis=1).dropna()
27
+ descriptions = pd.read_csv("program_descriptions.csv").rename(columns={"description": "sentence"}).dropna()
28
+ df = pd.concat([courses, descriptions], axis=0, ignore_index=True)
29
+ majors = list(df.groupby("program").count().sort_values(by=["sentence"], ascending=False).index)
30
+ majors = include_majors + majors
31
+ majors = majors[:num_majors]
32
+ df = df[df["program"].isin(majors)]
33
+ sentences = list(df["sentence"])
34
+ labels = np.array(df["program"])
35
+
36
+ return sentences, labels
37
+
38
+ def plot_confusion_matrix(y_true:List[str], y_pred:List[str], classes:List[str]):
39
+ """Plots a confusion matrix"""
40
+ cm = confusion_matrix(y_true, y_pred, labels=classes)
41
+ cm_df=pd.DataFrame(data=cm, index=classes, columns=classes)
42
+ sns.heatmap(cm_df, annot=True)
43
+
44
+
45
+
46
+ def get_recommendations(probs:np.ndarray, labels:List[str], n=5) -> List[List[str]]:
47
+ """
48
+ Args:
49
+ `probs`: predictions array of shape (n_inputs,n_classes)
50
+ `labels`: class labels of shape (n_classes,)
51
+ `n`: number of recommendations
52
+ Returns:
53
+ Top labels based on a probability distribution
54
+ """
55
+ np_labels = np.array(labels)
56
+ return np_labels[(-probs).argsort(-1)[:,:n]]
57
+
test.py ADDED
@@ -0,0 +1,84 @@
1
+ from classifiers.mlp import MajorMlpClassifier
2
+ from embeddings.bert import BertSentenceEmbedder
3
+ from sklearn.neighbors import KNeighborsClassifier
4
+ from sklearn.neural_network import MLPClassifier
5
+ from classifiers.bert import BertClassifier
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Tuple
9
+ from sklearn.model_selection import train_test_split, cross_val_score
10
+ from sklearn.metrics import classification_report
11
+ from preprocessing.helper import load_data, get_recommendations, plot_confusion_matrix
12
+ import matplotlib.pyplot as plt
13
+ import os
14
+
15
+ device = "mps"
16
+
17
+
18
+ def evaluate(load_weights=False):
19
+ """
20
+ Performs basic train/test split evaluation.
21
+ """
22
+ os.makedirs("figures", exist_ok=True)
23
+ sentences, labels = load_data(num_majors=40)
24
+ embedder = BertSentenceEmbedder(device, padding_length=1000)
25
+
26
+ seed = 2
27
+ x_train, x_test, y_train, y_test = train_test_split(
28
+ sentences, labels, random_state=seed, shuffle=True, train_size=0.8
29
+ )
30
+ train_embeddings = embedder.transform(x_train)
31
+ test_embeddings = embedder.transform(x_test)
32
+ knn = KNeighborsClassifier()
33
+ mlp = MajorMlpClassifier(device)
34
+ bert_classifier = BertClassifier(
35
+ device=device,
36
+ epochs=25,
37
+ )
38
+
39
+ if load_weights:
40
+ mlp.load_weights("weights/major_classifier")
41
+ bert_classifier.load_weights("weights/bert_classifier_deployment_weights")
42
+ else:
43
+ bert_classifier.fit(x_train, y_train)
44
+ mlp.fit(train_embeddings, y_train)
45
+ knn.fit(train_embeddings, y_train)
46
+ class_labels = np.array(bert_classifier.labels)
47
+
48
+ def report(name, classifier, x, y, n=3):
49
+ probs = classifier.predict_proba(x)
50
+ ordered_choices = class_labels[(-probs).argsort(-1)[:, :n]]
51
+ preds = ordered_choices[:, 0]
52
+ print(name)
53
+ print(
54
+ f"Top {n} accuracy",
55
+ np.mean([label in choices for label, choices in zip(y, ordered_choices)]),
56
+ )
57
+ print(classification_report(y, preds))
58
+ plot_confusion_matrix(y, preds, class_labels)
59
+ plt.savefig(f"figures/{name}_cm.png")
60
+ plt.clf()
61
+
62
+ report("bert_classifier", bert_classifier, x_test, y_test)
63
+ report("KNN", knn, test_embeddings, y_test)
64
+ report("major_mlp", mlp, test_embeddings, y_test)
65
+
66
+
67
+ def demo():
68
+ """
69
+ Interact with a model on the command line.
70
+ """
71
+ bert_classifier = BertClassifier(device="mps")
72
+ weights_path = os.path.join("weights", "bert_classifier_deployment_weights")
73
+ bert_classifier.load_weights(weights_path)
74
+ while True:
75
+ command = input("Describe your ideal major: ")
76
+ if command.lower() == "q" or command.lower() == "quit":
77
+ break
78
+ probs = bert_classifier.predict_proba(command)
79
+ labels = bert_classifier.labels
80
+ print(get_recommendations(probs, labels, n=3)[0])
81
+
82
+
83
+ if __name__ == "__main__":
84
+ evaluate()
train.py ADDED
@@ -0,0 +1,34 @@
1
+ from classifiers.bert import BertClassifier
2
+ from classifiers.mlp import MajorMlpClassifier
3
+ from embeddings.bert import BertSentenceEmbedder
4
+ import pickle
5
+ from preprocessing.helper import load_data
6
+
7
+
8
+ def train_bert_classifier(
9
+ device="cpu",
10
+ n_classes=40,
11
+ include_majors=[],
12
+ epochs=25
13
+ ):
14
+ sentences, labels = load_data(num_majors=n_classes, include_majors=include_majors)
15
+ bert_classifier = BertClassifier(device=device, epochs=epochs)
16
+ bert_classifier.fit(sentences, labels)
17
+
18
+
19
+ def train_major_classifier(
20
+ device="cpu",
21
+ n_classes=40,
22
+ include_majors=[],
23
+ epochs=200
24
+ ):
25
+ sentences, labels = load_data(num_majors=n_classes, include_majors=include_majors)
26
+ embedder = BertSentenceEmbedder(device, padding_length=1000)
27
+ embeddings = embedder.transform(sentences)
28
+ mlp = MajorMlpClassifier(device, epochs=epochs)
29
+ mlp.fit(embeddings,labels)
30
+ mlp.save_weights("weights/major_classifier")
31
+
32
+
33
+ if __name__ == "__main__":
34
+ train_major_classifier(device="mps", include_majors=["Computer Science", "Computer Engineering"])
weights/bert_classifier_deployment_weights/config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537432f763d5b9dcbe88df36b8b8e05196a0a2369cd24056fced3b578e1a0da3
3
+ size 3231
weights/bert_classifier_deployment_weights/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0191eb7aa75c118406fed2834d7ffed70c6c5a92afeee8f9962c7e193d11b75
3
+ size 535931845
weights/bert_classifier_deployment_weights/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0699112fa18348be8faf08746a35633e3aad9b3d2c4a1f3e77cf530d098bf53f
3
+ size 267970989
weights/bert_classifier_deployment_weights/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ded3012689b6a3d27eba5707e4bd84c34d9b7ae843c6db5691d7fb371280cc8
3
+ size 13553
weights/bert_classifier_deployment_weights/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1010d34b4b970964c787c0e304bc7e95971298d58fc528d14dc0f6e8fa7e18c8
3
+ size 627
weights/bert_classifier_deployment_weights/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
3
+ size 125
weights/bert_classifier_deployment_weights/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e21aa12acbe7b5d48e4778418cf4976257693629e5e35fdb814273573bc31a4
3
+ size 711649
weights/bert_classifier_deployment_weights/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a6acd3261769a08abd6fa132a83c8b8a2f3ae30f15fe5c0154dbb2a2bc9e4b3
3
+ size 360
weights/bert_classifier_deployment_weights/trainer_state.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9775b1c63a50d33ae4a9bd72253706bdf5183b042c9f21790fb4b40939b57ea2
3
+ size 7812
weights/bert_classifier_deployment_weights/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01fe6269350128c348b6e9ab911302385ab3aa80616ece458d16e5e5f5cd202f
3
+ size 3387
weights/bert_classifier_deployment_weights/vocab.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
3
+ size 231508
weights/major_classifier/config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee3f0b8c7edb437fc01437c125ebcd594838edf1f63d8f4eb33fb64b680e6e9f
3
+ size 1017
weights/major_classifier/weights.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a001e177107249724e77823b8b26b3b7c5a0548d56244b46fffe346e4e0e896
3
+ size 7024568