waidhoferj committed
Commit aadb779 · 0 Parent(s)

first commit

.gitattributes ADDED
@@ -0,0 +1,6 @@
1
+ *.json filter=lfs diff=lfs merge=lfs -text
2
+ *.csv filter=lfs diff=lfs merge=lfs -text
3
+ *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
6
+ *.txt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,135 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Model weights
132
+
133
+ figures
134
+ .DS_Store
135
+
README.md ADDED
@@ -0,0 +1,31 @@
1
+ ---
2
+ title: Major Matcher
3
+ emoji: 🎓
4
+ colorFrom: green
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ python_version: 3.10.8
8
+ sdk_version: 3.15.0
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
+ # Major Matcher
14
+
15
+ A tool for matching student interests to areas of study.
16
+
17
+ ## Getting Started
18
+
19
+ 1. Set up the Python environment:
20
+
21
+ ```
22
+ conda env create --file environment.yml
23
+ conda activate major-matcher
24
+ ```
25
+
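+ 2. Launch the Gradio demo locally (this should run the `demo().launch()` call at the bottom of `app.py`):
+
+ ```
+ python app.py
+ ```
+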
26
+ ## Project Layout
27
+
28
+ - `embeddings`: Sklearn-style transformers that encode natural language into latent embedding vectors.
29
+ - `classifiers`: Model architectures for classifying college majors.
30
+ - `test.py`: Evaluation and demo code for all models.
31
+ - `train.py`: Training loops for models.
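+ - `preprocessing`: Scripts that scrape the Cal Poly catalog and build the CSVs under `data/`.
+
+ As a minimal sketch of how these pieces fit together (assuming the CSVs read by `preprocessing/helper.py`, such as `course_sentences.csv`, are available on the working path), the embedders and classifiers follow the usual sklearn `fit`/`transform`/`predict` pattern:
+
+ ```
+ from embeddings.bert import BertSentenceEmbedder
+ from classifiers.mlp import MajorMlpClassifier
+ from preprocessing.helper import load_data
+
+ # Load (sentence, program-label) pairs and embed the sentences with BERT.
+ sentences, labels = load_data(num_majors=40)
+ embedder = BertSentenceEmbedder(device="cpu")
+ embeddings = embedder.transform(sentences)
+
+ # Train the MLP head on the embeddings, then suggest majors for a new interest.
+ classifier = MajorMlpClassifier(device="cpu", epochs=200)
+ classifier.fit(embeddings, labels)
+ print(classifier.predict(embedder.transform(["I really enjoy spending time with animals."])))
+ ```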
app.py ADDED
@@ -0,0 +1,55 @@
1
+ import gradio as gr
2
+ from classifiers.bert import BertClassifier
3
+ import os
4
+ import numpy as np
5
+ from functools import cache
6
+ from preprocessing.helper import get_recommendations
7
+
8
+ CONFIG_FILE = os.path.join("weights", "bert_classifier_deployment_weights")
9
+ N_SUGGESTIONS = 3
10
+
11
+
12
+ @cache
13
+ def get_model(config_path: str) -> BertClassifier:
14
+ bert_classifier = BertClassifier(device="cpu")  # CPU for the hosted Space; use "mps"/"cuda" locally if available
15
+ bert_classifier.load_weights(config_path)
16
+ return bert_classifier
17
+
18
+
19
+ def predict(interests: str) -> dict[str, float]:
20
+ bert_classifier = get_model(CONFIG_FILE)
21
+ probs = bert_classifier.predict_proba(interests)
22
+ labels = np.array(bert_classifier.labels)
23
+ results_mask = (-probs).argsort(-1)[:,:N_SUGGESTIONS]
24
+ suggested_majors = labels[results_mask][0].tolist()
25
+ confidences = probs[0][results_mask[0]]
26
+ confidences /= confidences.sum()
27
+ confidences = confidences.tolist()
28
+ return dict(zip(suggested_majors, confidences))
29
+
30
+
31
+ def demo():
32
+ title = "Major Matcher"
33
+ description = "Describe your interests and the model will suggest a compatible college major."
34
+ example_interests = [
35
+ "I really enjoy spending time with animals.",
36
+ "I like playing music and dancing.",
37
+ "A good book makes me happy."
38
+ ]
39
+
40
+ app = gr.Interface(
41
+ title=title,
42
+ description=description,
43
+ inputs=gr.TextArea(
44
+ label="Describe your interests",
45
+ placeholder="I really enjoy..."
46
+ ),
47
+ fn=predict,
48
+ outputs=gr.Label(label="Suggested Majors"),
49
+ examples=example_interests
50
+ )
51
+ return app
52
+
53
+
54
+ if __name__ == "__main__":
55
+ demo().launch()
classifiers/bert.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from transformers import pipeline
5
+ from sklearn.base import BaseEstimator, ClassifierMixin
6
+ import numpy as np
7
+ from typing import List, Tuple
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.utils.class_weight import compute_class_weight
10
+ from transformers import AutoTokenizer
11
+ from transformers import DataCollatorWithPadding
12
+ from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
13
+ from torch.utils.data import Dataset
14
+ from pathlib import Path
15
+ import json
16
+ from numpy.typing import NDArray
17
+
18
+ class BertClassifier(BaseEstimator, ClassifierMixin):
19
+ def __init__(self, seed=42, epochs=5, device="cpu"):
20
+ super().__init__()
21
+ self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
22
+ self.seed = seed
23
+ self.epochs = epochs
24
+ self.model = None
25
+ self.labels = None
26
+ self.device=device
27
+
28
+ def _get_classes(self, y: List[str]) -> Tuple[List[int], List[str]]:
29
+ labels = sorted(set(y))
30
+ ids = [i for i in range(len(labels))]
31
+ return ids, labels
32
+
33
+ def _compute_metrics(self,eval_pairs):
34
+ logits, labels = eval_pairs
35
+ n = 3
36
+ ordered_choices = (-logits).argsort(-1)[:,:n]
37
+ metrics = {}
38
+ metrics["top_n_accuracy"] = np.mean([label in choices for label, choices in zip(labels, ordered_choices)])
39
+ metrics["accuracy"] = np.mean(labels == ordered_choices[:,0])
40
+ return metrics
41
+
42
+
43
+
44
+ def load_weights(self, path:str):
45
+ self.model = AutoModelForSequenceClassification.from_pretrained(
46
+ path).to(self.device)
47
+ self.labels = list(self.model.config.label2id.keys())
48
+
49
+ def _tokenize(self, texts:List[str]) -> torch.Tensor:
50
+ return self.tokenizer(texts, padding=True,
51
+ truncation=True,
52
+ max_length=100,
53
+ return_tensors="pt").to(self.device)
54
+
55
+
56
+
57
+ def fit(self, X:List[str], y:List[str]):
58
+ ids, labels = self._get_classes(y)
59
+ self.labels = labels
60
+ id2label = dict(zip(ids,labels))
61
+ label2id = dict(zip(labels,ids))
62
+ X = self._tokenize(X)
63
+ dataset = [{"input_ids": text, "label": label2id[label]} for text, label in zip(X["input_ids"],y)]
64
+ train_ds, test_ds = train_test_split(dataset, shuffle=True, random_state=self.seed, train_size=0.85)
65
+ batch_size = 64
66
+
67
+ model = AutoModelForSequenceClassification.from_pretrained(
68
+ "distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id
69
+ ).to(self.device)
70
+ weights_path="weights/bert_classifier"
71
+ training_args = TrainingArguments(
72
+ output_dir=weights_path,
73
+ learning_rate=2e-5,
74
+ per_device_train_batch_size=batch_size,
75
+ per_device_eval_batch_size=batch_size,
76
+ num_train_epochs=self.epochs,
77
+ weight_decay=0.01,
78
+ evaluation_strategy="epoch",
79
+ save_strategy="epoch",
80
+ load_best_model_at_end=True,
81
+ push_to_hub=False,
82
+ use_mps_device=self.device=="mps"
83
+ )
84
+
85
+ class_weights = torch.Tensor()
86
+
87
+ trainer = WeightedTrainer(
88
+ class_ids=ids,
89
+ model=model,
90
+ args=training_args,
91
+ train_dataset=train_ds,
92
+ eval_dataset=test_ds,
93
+ tokenizer=self.tokenizer,
94
+ compute_metrics=self._compute_metrics
95
+ )
96
+
97
+ trainer.train()
98
+ model.eval()
99
+ self.model = model
100
+
101
+
102
+
103
+ def predict_proba(self, X:List[str]) -> NDArray:
104
+ if self.model is None:
105
+ raise Exception("Fit the model before inference.")
106
+ tokens = self._tokenize(X)
107
+ with torch.no_grad():
108
+ logits = self.model(**tokens).logits
109
+ return F.softmax(logits, -1).cpu().numpy()
110
+
111
+
112
+ def predict(self, X:List[str])-> List[str]:
113
+ preds = self.predict_proba(X)
114
+ return [self.labels[i] for i in preds.argmax(-1)]
115
+
116
+
117
+
118
+
119
+ class WeightedTrainer(Trainer):
120
+
121
+ def __init__(self,class_ids, train_dataset, *args, **kwargs):
122
+ super().__init__(train_dataset=train_dataset, *args,**kwargs)
123
+ y_train = [y["label"] for y in train_dataset]
124
+ class_weights = compute_class_weight("balanced", classes=class_ids, y=y_train).astype("float32")
125
+ class_weights = torch.from_numpy(class_weights).to(self.args.device.type)
126
+ self.criteria = nn.CrossEntropyLoss(weight=class_weights)
127
+
128
+ def compute_loss(self, model, inputs, return_outputs=False):
129
+ labels = inputs.get("labels")
130
+ # forward pass
131
+ outputs = model(**inputs)
132
+ logits = outputs.get("logits")
133
+ loss = self.criteria(logits.view(-1, self.model.config.num_labels), labels.view(-1))
134
+ return (loss, outputs) if return_outputs else loss
135
+
136
+
137
+
classifiers/mlp.py ADDED
@@ -0,0 +1,156 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from sklearn.base import BaseEstimator, ClassifierMixin
5
+ import numpy as np
6
+ from typing import List, Tuple
7
+ import pandas as pd
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.utils.class_weight import compute_class_weight
10
+ import json
11
+ import os
12
+
13
+ class MajorMlpClassifier(BaseEstimator, ClassifierMixin):
14
+ def __init__(self, device="cpu", seed=42, epochs=200, patience:int=None):
15
+ super().__init__()
16
+ self.device = device
17
+ self.seed = seed
18
+ self.model = None
19
+ self.epochs = epochs
20
+ self.patience = patience if patience is not None else epochs
21
+ self.class_labels = None
22
+
23
+
24
+ def _preprocess_features(self, X: np.ndarray) -> np.ndarray:
25
+ return torch.from_numpy(X).to(self.device)
26
+
27
+ def _preprocess_labels(self, y: List[str]) -> np.ndarray:
28
+ unique_labels = np.array(self._get_classes(y))
29
+ one_hot = np.array([
30
+ unique_labels == label
31
+ for label in y
32
+ ], dtype="float32")
33
+
34
+ return torch.from_numpy(one_hot).to(self.device)
35
+
36
+ def _get_classes(self, y: List[str]) -> List[str]:
37
+ return sorted(set(y))
38
+
39
+ def fit(self, X:np.ndarray, y:List[str]):
40
+ """
41
+ Args:
42
+ X: embeddings of shape (n_sentences, embedding_size)
43
+ y: program labels that match with each sentence
44
+ """
45
+ self.class_labels = np.array(self._get_classes(y))
46
+ class_weights = compute_class_weight("balanced", classes=self.class_labels, y=y).astype("float32")
47
+ class_weights = torch.from_numpy(class_weights).to(self.device)
48
+ X, y = self._preprocess_features(X), self._preprocess_labels(y)
49
+ x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=self.seed, shuffle=True)
50
+ should_stop = EarlyStopping(self.patience)
51
+ val_loss = np.inf
52
+ model = ProgramClassifierNetwork(x_train.shape[1], y_train.shape[1])
53
+ model = model.to(self.device)
54
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
55
+ criterion = nn.CrossEntropyLoss(weight=class_weights)
56
+ epoch = 0
57
+ while not should_stop.step(val_loss) and epoch < self.epochs:
58
+ preds = model(x_train)
59
+ loss = criterion(preds, y_train)
60
+ optimizer.zero_grad()
61
+ loss.backward()
62
+ optimizer.step()
63
+ with torch.no_grad():
64
+ val_preds = model(x_val)
65
+ val_loss = criterion(val_preds, y_val).item()
66
+ epoch += 1
67
+ model.eval()
68
+ self.model = model
69
+
70
+ def predict_proba(self, X:np.ndarray) -> np.ndarray:
71
+ X = self._preprocess_features(X)
72
+ if self.model is None:
73
+ raise Exception("Train model with fit() before predicting.")
74
+ with torch.no_grad():
75
+ logits = self.model(X)
76
+ return F.softmax(logits, dim=-1).cpu().numpy()
77
+
78
+ def predict(self, X:np.ndarray) -> List[str]:
79
+ """
80
+ Args:
81
+ X: embeddings of shape (n_sentences, embedding_size)
82
+ Returns:
83
+ predicted classes for each embedding
84
+ """
85
+ pred_i = self.predict_proba(X).argmax(-1)
86
+ return self.class_labels[pred_i]
87
+
88
+ def save_weights(self,path:str):
89
+ os.makedirs(path, exist_ok=True)
90
+ weights_path = os.path.join(path, "weights.pt")
91
+ config_path = os.path.join(path,"config.json")
92
+ torch.save(self.model.state_dict(), weights_path)
93
+ state = {
94
+ "device": self.device,
95
+ "seed": self.seed,
96
+ "epochs": self.epochs,
97
+ "patience": self.patience,
98
+ "class_labels": list(self.class_labels)
99
+ }
100
+ with open(config_path, "w") as f:
101
+ json.dump(state, f)
102
+
103
+
104
+ def load_weights(self, path:str):
105
+ weights_path = os.path.join(path, "weights.pt")
106
+ config_path = os.path.join(path,"config.json")
107
+ state_dict = torch.load(weights_path)
108
+ input_size = int(state_dict["input_size"].item())
109
+ n_classes = int(state_dict["n_classes"].item())
110
+ model = ProgramClassifierNetwork(input_size,n_classes).to(self.device)
111
+ model.load_state_dict(state_dict)
112
+ model.eval()
113
+ self.model = model
114
+ with open(config_path, "r") as f:
115
+ config = json.load(f)
116
+ config["class_labels"] = np.array(config["class_labels"]) if config["class_labels"] is not None else None
117
+ self.__dict__.update(config)
118
+
119
+
120
+
121
+
122
+
123
+ class ProgramClassifierNetwork(nn.Module):
124
+ def __init__(self, input_size:int, n_classes:int) -> None:
125
+ super().__init__()
126
+ self.input_size = nn.Parameter(torch.Tensor([input_size]), requires_grad=False)
127
+ self.n_classes = nn.Parameter(torch.Tensor([n_classes]), requires_grad=False)
128
+ self.classifier = nn.Sequential(
129
+ nn.BatchNorm1d(input_size),
130
+ nn.Linear(input_size, 512),
131
+ nn.ReLU(),
132
+ nn.Linear(512, 256),
133
+ nn.ReLU(),
134
+ nn.Linear(256, 128),
135
+ nn.ReLU(),
136
+ nn.Linear(128, n_classes),
137
+ )
138
+
139
+
140
+ def forward(self,x):
141
+ return self.classifier(x)
142
+
143
+ class EarlyStopping:
144
+ def __init__(self, patience=0):
145
+ self.patience = patience
146
+ self.last_measure = np.inf
147
+ self.consecutive_increase = 0
148
+
149
+ def step(self, val) -> bool:
150
+ if self.last_measure <= val:
151
+ self.consecutive_increase +=1
152
+ else:
153
+ self.consecutive_increase = 0
154
+ self.last_measure = val
155
+
156
+ return self.patience < self.consecutive_increase
data/course_sentences.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65fb25d9fdb21cf3bee8894bb5a364d1e82501994fa80c517ca8d6b449ef195c
3
+ size 366591
data/courses.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e3b6db7c601b4dd5e618305d19c69aab4443201d3d3f9eaecba71848188a627
3
+ size 2057639
data/program_courses.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac10e70cb89c2bea821a06b5789acaf7f0bdf6fd25effacdb5f62439740c7a05
3
+ size 778110
data/program_descriptions.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73be736b1119baa9a4f7e4beeb40a91f57044a63a514fce5ca21c6a128185c76
3
+ size 15395
data/stopwords/course_prefixes.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b12b360d88612aa019620e1ac37dbe370d56cc9f706dd57937a97b4167777dc1
3
+ size 445
data/stopwords/invalid_description_phrases.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3c1c6990d131626c39fe30ef03c2ef60f0800262441a1c11de6845f1d4bd36e
3
+ size 258
data/stopwords/other_words.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89c5d0e1b455e5f489f34e362e7b9b6ebdd3a78bcf5b88c43cbd6421093d82b
3
+ size 1417
embeddings/bert.py ADDED
@@ -0,0 +1,65 @@
1
+ import torch
2
+ from transformers import BertModel, BertTokenizer
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.neighbors import KNeighborsClassifier
7
+ class BertSentenceEmbedder(BaseEstimator, TransformerMixin):
8
+ def __init__(self, device="cpu",padding_length=50):
9
+ """
10
+ Args:
11
+ `device`: pytorch device for inference. Either 'cpu' or a specific type of GPU.
12
+ `padding_length`: The max sentence token length. Shorter sentences are padded to this length.
13
+ """
14
+ self._device = device
15
+ self._tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
16
+ model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
17
+ self._model = model.to(device)
18
+ self._model.eval()
19
+ self._padding_length = padding_length
20
+
21
+ def transform(self, X:list) -> np.ndarray:
22
+ """
23
+ Transforms sentences into embeddings
24
+
25
+ Args:
26
+ `X`: a dataset of sentences of shape (n_sentences,)
27
+ Returns:
28
+ Embeddings of the provided sentences of shape (n_sentences, embedding_dims)
29
+ """
30
+ tokens = self._tokenizer(
31
+ X,
32
+ return_token_type_ids=False,
33
+ return_attention_mask=False,
34
+ padding=True,
35
+ truncation=True,
36
+ max_length=self._padding_length,
37
+ return_tensors="pt"
38
+ )
39
+ tokens = tokens["input_ids"].to(self._device)
40
+ with torch.no_grad():
41
+ hidden_states = self._model(
42
+ input_ids=tokens,
43
+ output_hidden_states=True
44
+ )["hidden_states"]
45
+ embeddings = torch.cat(hidden_states[-4:], dim=-1)
46
+ embeddings = torch.mean(embeddings, dim=1)
47
+ return embeddings.cpu().numpy()
48
+
49
+
50
+
51
+ if __name__ == "__main__":
52
+ df = pd.read_csv("course_sentences.csv")
53
+ embedder = BertSentenceEmbedder("mps", padding_length=1000)
54
+ embeddings = embedder.transform(list(df["sentence"]))
55
+ labels = df["program"]
56
+ classifier = KNeighborsClassifier(n_neighbors=10)
57
+ classifier.fit(embeddings, labels)
58
+ num_suggestions = 10
59
+
60
+ prompt = "Covers methods currently available to address complexity, including systems thinking, model based systems engineering and life cycle governance."
61
+ embedding = embedder.transform([prompt])
62
+ probs = classifier.predict_proba(embedding)[0]
63
+ idx = np.argsort(-probs)[:num_suggestions]
64
+ label_map = np.array(sorted(set(labels)))
65
+ print(prompt, label_map[idx], probs[idx])
embeddings/word2vec.py ADDED
@@ -0,0 +1,99 @@
1
+
2
+ from typing import List
3
+ from sklearn.base import BaseEstimator, TransformerMixin
4
+ import numpy as np
5
+ from glob import iglob
6
+ import pandas as pd
7
+ from sklearn.neighbors import KNeighborsClassifier
8
+ from gensim.models import Word2Vec
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.corpus import stopwords
11
+ from functools import cache
12
+
13
+
14
+
15
+
16
+ class Word2VecEmbedder(BaseEstimator, TransformerMixin):
17
+ def __init__(self, vector_size=100):
18
+ self.model = None
19
+ self.stop_words = get_stopwords()
20
+ self.vector_size = vector_size
21
+
22
+ def _preprocess(self, text:str) -> List[str]:
23
+ words = word_tokenize(text)
24
+ only_keywords = [word for word in words
25
+ if word not in self.stop_words
26
+ and word.isalpha()]
27
+ return only_keywords
28
+
29
+ def fit(self, sentences:List[str]):
30
+ sentences = [self._preprocess(t) for t in sentences]
31
+ self.model = Word2Vec(sentences, vector_size=self.vector_size, window=5, min_count=1, workers=4)
32
+
33
+
34
+ def transform(self, X:List[str]) -> List[List[np.ndarray]]:
35
+ if self.model is None:
36
+ raise Exception("fit model before transforming")
37
+ sents = map(self._preprocess,X)
38
+ def get_embedding(word):
39
+ try:
40
+ return self.model.wv[word]
41
+ except:
42
+ return np.zeros((self.vector_size,))
43
+
44
+ return [[get_embedding(word) for word in sent] for sent in sents]
45
+
46
+ def latent_distance(self,text1:str, text2:str) -> float:
47
+ first_tokens, second_tokens = self.transform([text1])[0], self.transform([text2])[0]
48
+ sum_dist = 0.0
49
+ for t1 in first_tokens:
50
+ for t2 in second_tokens:
51
+ sum_dist += np.sum((t2-t1)**2)**0.5
52
+ return sum_dist / float(len(first_tokens) * len(second_tokens))
53
+
54
+ @cache
55
+ def get_stopwords():
56
+ words = set()
57
+ for stop_file in iglob("stopwords/*.txt"):
58
+ with open(stop_file, "r") as f:
59
+ words.update(l.lower() for l in f.readlines())
60
+ return set(stopwords.words('english')) | words
61
+
62
+
63
+
64
+ def test_latent_dist():
65
+ df = pd.read_csv("course_sentences.csv")
66
+ embedder = Word2VecEmbedder()
67
+ sentences = list(df["sentence"])
68
+ embedder.fit(sentences)
69
+
70
+ show_dist = lambda s1, s2: print(s1 + "\n", s2 + "\n", embedder.latent_distance(s1,s2))
71
+ show_dist(*(["This is the same sentence."] * 2))
72
+ show_dist("artificial intelligence is my passion", "I really enjoy computer science")
73
+ show_dist("artificial intelligence is my passion", "I really enjoy archeology")
74
+
75
+
76
+
77
+
78
+
79
+ def test_pipeline():
80
+ df = pd.read_csv("course_sentences.csv")
81
+ embedder = Word2VecEmbedder()
82
+ sentences = list(df["sentence"])
83
+ embedder.fit(sentences)
84
+ embeddings = embedder.transform(sentences)
85
+ labels = df["program"]
86
+ classifier = KNeighborsClassifier(n_neighbors=10)
87
+ classifier.fit(embeddings, labels)
88
+ num_suggestions = 10
89
+
90
+ prompt = "Covers methods currently available to address complexity, including systems thinking, model based systems engineering and life cycle governance."
91
+ embedding = embedder.transform([prompt])
92
+ probs = classifier.predict_proba(embedding)[0]
93
+ idx = np.argsort(-probs)[:num_suggestions]
94
+ label_map = np.array(sorted(set(labels)))
95
+ print(prompt, label_map[idx], probs[idx])
96
+
97
+
98
+ if __name__ == "__main__":
99
+ test_latent_dist()
environment.yml ADDED
@@ -0,0 +1,21 @@
1
+ name: major-matcher
2
+ channels:
3
+ - anaconda
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10
7
+ - pytorch
8
+ - torchvision
9
+ - matplotlib
10
+ - numpy
11
+ - pandas
12
+ - transformers
13
+ - ipykernel
14
+ - scikit-learn
15
+ - beautifulsoup4
16
+ - nltk
17
+ - requests
18
+ - seaborn
19
+ - pip:
20
+ - gradio
21
+ - wakepy
explore.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing/build_data_dict.py ADDED
@@ -0,0 +1,135 @@
1
+ import collections
2
+ import pandas as pd
3
+ import course_scraper
4
+ from nltk.corpus import stopwords
5
+ import string
6
+ import re
7
+
8
+ DESCRIPTION = "Description"
9
+ COURSE_PREFIX = "Course Prefix"
10
+
11
+ words_to_remove = ["lectures", "per", "two", "and/or", "``", "''", "laboratory", "course", "courses", "work",
12
+ "students", "units", "total", "selected", "may", "major", "'s", "quarter", "and/or", "report", "undergraduate", "format",
13
+ "laboratory", "limited", "topics", "fulfills", "including", "topic", "catalogs", "list", "earlier", "overview", "impact",
14
+ "required", "open", "study", "class", "grading", "credit/no", "individual", "kine", "new", "within", "offered",
15
+ "laboratories", "include", "use", "using", "used", "basic", "student", "current", "related", "practice",
16
+ "online", "examination", "formal", "quality", "one", "time", "must", "maximum", "hours", "effects"]
17
+ ge_areas = ["a", "b", "c", "d", "e", "f",
18
+ "area", "areas", "uscp", "upper-division"]
19
+ year = ["2017-19", "2019-20"]
20
+
21
+ stopwords_to_remove = ["ge", "credit", "class", "topics", "course", "following", "student", "units", "section", "study", "k", "unit", "week", "used",
22
+ "division", "catalogs", "graduate", "selected", "courses", "may", "majors", "format", "emphasis", "area", "hours", "emphasized",
23
+ "non", "based", "application", "applications", "classroom", "introduction", "students", "crosslisted", "focus", "methods", "completion",
24
+ "required", "implementation", "u", "better", "part", "fields", "completed", "taken", "well", "grade", "present", "basic", "etc"
25
+ "graduates", "variety", "context", "presented", "instruction", "quarter", "projects", "meet", "fulfills", "enroll", "enrollment",
26
+ "requirement", "studies", "surveys", "planning", "discussion", "assessment", "role", "field", "preparation", "principles", "evaluation",
27
+ "techniques", "selection", "practices", "concepts", "faculty", "theories", "issues", "paid", "usually", "quarters", "independent",
28
+ "fundamentals", "project", "senior"]
29
+
30
+
31
+ def generate_ge_prefixes():
32
+ letters = ["a", "b", "c", "d", "e", "f"]
33
+ numbers = list(range(1, 8))
34
+
35
+ pairs = []
36
+ for letter in letters:
37
+ for num in numbers:
38
+ pairs.append(letter + str(num))
39
+ return pairs
40
+
41
+
42
+ ge_prefixes = generate_ge_prefixes()
43
+ filter_set = set(stopwords.words('english'))
44
+ filter_set.update(string.punctuation, words_to_remove, stopwords_to_remove,
45
+ ge_areas, year, ge_prefixes)
46
+
47
+
48
+ def preprocess(text):
49
+ text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
50
+ output = re.sub(r'\d+', '', text_input)
51
+ return output.lower().strip()
52
+
53
+
54
+ def clean_text(text):
55
+ # add spaces and replace leading "and" or "&"
56
+ return re.sub('^(and|&)', '', text.replace('\xa0', " ")).strip()
57
+
58
+
59
+ def remove_stopwords(text):
60
+ filtered_words = [word.lower()
61
+ for word in text.split() if word.lower() not in filter_set]
62
+ return " ".join(filtered_words)
63
+
64
+
65
+ def build_word_course_dict():
66
+ df = pd.read_csv(course_scraper.FILE_NAME)
67
+ df[DESCRIPTION] = df[DESCRIPTION].map(preprocess)
68
+ df[DESCRIPTION] = df[DESCRIPTION].map(remove_stopwords)
69
+
70
+ word_course_dict = collections.defaultdict(list)
71
+
72
+ for index, row in df.iterrows():
73
+ description = row[DESCRIPTION]
74
+ prefix = row[COURSE_PREFIX]
75
+
76
+ prefixes = format_course_prefixes(prefix)
77
+
78
+ for word in description.split(' '):
79
+ word_course_dict[word] += prefixes
80
+
81
+ return word_course_dict
82
+
83
+
84
+ def format_course_prefixes(prefix_str: str):
85
+ # Returns a list of course prefixes
86
+ formatted_prefixes = []
87
+
88
+
89
+ # one course listed
90
+ if "/" not in prefix_str:
91
+ formatted_prefixes.append(prefix_str.replace(" ", "-"))
92
+ return formatted_prefixes
93
+
94
+ # multiple courses
95
+ split_prefixes = re.split('/| ', prefix_str)
96
+
97
+ course_number_count = len([e for e in split_prefixes if e.isdigit()])
98
+
99
+ if course_number_count == 1:
100
+ # crosslisted courses with different depts, same number (HIST/HNRS 335)
101
+ course_num = split_prefixes[-1]
102
+ for prefix in split_prefixes[:-1]:
103
+ formatted_prefixes.append(f'{prefix}-{course_num}')
104
+ return formatted_prefixes
105
+ else:
106
+ # crosslisted courses with different numbers (HNRS 304/ISLA 303)
107
+ for i in range(0,len(split_prefixes)-1,2):
108
+ prefix = split_prefixes[i]
109
+ course_num = split_prefixes[i+1]
110
+ formatted_prefixes.append(f'{prefix}-{course_num}')
111
+ return formatted_prefixes
112
+
113
+ def build_course_program_dict():
114
+ df = pd.read_csv("program_courses.csv")
115
+ df["Program"] = df["Program"].map(clean_text)
116
+
117
+ program_course_dict = collections.defaultdict(list)
118
+ for index, row in df.iterrows():
119
+ program = row["Program"]
120
+ course_prefix = row["Course Prefix"]
121
+
122
+ # handle mulitple prefixes EX. CPE/CSC 123
123
+ prefixes = format_course_prefixes(course_prefix)
124
+ # print(program, course_prefix, prefixes)
125
+ for prefix in prefixes:
126
+ program_course_dict[prefix] += [program]
127
+
128
+ return program_course_dict
129
+
130
+
131
+ if __name__ == "__main__":
132
+ d = build_word_course_dict()
133
+ program_course_dict = build_course_program_dict()
134
+ print(d)
135
+ # print(program_course_dict)
preprocessing/course_scraper.py ADDED
@@ -0,0 +1,213 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ from typing import List, Tuple
5
+
6
+ FILE_NAME: str = "courses.csv"
7
+ MATRIX_FILE_NAME: str = "catalog_matrix.csv"
8
+
9
+ BASE_COURSE_CATALOG_URL: str = "https://catalog.calpoly.edu/coursesaz/"
10
+ BASE_COLLEGES_DEPT_URL: str = "https://catalog.calpoly.edu/coursesaz/#courseprefixestext"
11
+
12
+
13
+ def getTextWithinParentheses(text: str):
14
+ return text[text.find('(')+1:text.find(')')]
15
+
16
+
17
+ def clean_link_text(text):
18
+ return text.replace(')', "").replace('(', "").strip()
19
+
20
+
21
+ def parse_college_html(college_html) -> List[Tuple[str, List[str]]]:
22
+ depts = []
23
+ college_dept_pairs = []
24
+ current_college = None
25
+ for child in college_html.children:
26
+ tag = child.name
27
+ text = child.text
28
+ # dept
29
+ if(tag == 'a'):
30
+ depts.append(clean_link_text(text))
31
+ # college
32
+ elif(tag == 'strong'):
33
+ if(current_college is not None):
34
+ college_dept_pairs.append([current_college, depts])
35
+ depts = []
36
+ current_college = text.strip()
37
+ college_dept_pairs.append((current_college, depts))
38
+ return college_dept_pairs
39
+
40
+
41
+ def parse_college_department_html(html) -> List[str]:
42
+ depts = []
43
+ for dept_html in html.children:
44
+ if(dept_html.name is not None):
45
+ dept = dept_html.text.split('(')[0].strip()
46
+ prefixes = [a.text.replace(')', "")
47
+ for a in dept_html.find_all("a")]
48
+ depts.append((dept, prefixes))
49
+ return depts
50
+
51
+
52
+ # iterates through all depts of a college
53
+ # {
54
+ # 'Biological Science': [BIO, BOT MCRO, MSCI],
55
+ # 'Chemistry and Biochemistry': [CHEM]
56
+ # ...
57
+ # }
58
+ def create_depts_dict(depts):
59
+ d = {}
60
+ for dept in depts:
61
+ d |= {
62
+ dept[0]: dept[1]
63
+ }
64
+ return d
65
+
66
+
67
+ def scrape_course_prefixes():
68
+ DEPT_PREFIXES_ID: str = "courseprefixestextcontainer"
69
+ course_page = requests.get(BASE_COLLEGES_DEPT_URL)
70
+ soup = BeautifulSoup(course_page.content, "html.parser")
71
+
72
+ colleges_html = soup.find(id=DEPT_PREFIXES_ID).div
73
+ college_dept_dict = {}
74
+ current_college = None
75
+ depts = []
76
+
77
+ for child in colleges_html.children:
78
+ tag = child.name
79
+ if (tag == "ul"): # nested departments
80
+ depts = parse_college_department_html(child)
81
+ elif (tag == 'p'): # college
82
+ college_dept_pairs = parse_college_html(child)
83
+ # more than one, so add all but the last to dictionary
84
+ if(len(college_dept_pairs) > 1):
85
+ # add current college and departments
86
+ college_dept_dict |= {
87
+ current_college[0]: {
88
+ current_college[0]: current_college[1]
89
+ }
90
+ }
91
+ college_dept_dict[current_college[0]
92
+ ] |= create_depts_dict(depts)
93
+ # add all other colleges except for the last one
94
+ for i in range(len(college_dept_pairs)-1):
95
+ # add colleges with no departments
96
+ college_dept_dict |= {
97
+ college_dept_pairs[i][0]: {
98
+ college_dept_pairs[i][0]: college_dept_pairs[i][1]
99
+ }
100
+ }
101
+ # set current college to last in pairs
102
+ current_college = college_dept_pairs[-1]
103
+ else:
104
+ if (current_college is not None):
105
+ # add current college and departments
106
+ college_dept_dict |= {
107
+ current_college[0]: {
108
+ current_college[0]: current_college[1]
109
+ }
110
+ }
111
+ # add college departments
112
+ college_dept_dict[current_college[0]
113
+ ] |= create_depts_dict(depts)
114
+ # set new current college
115
+ current_college = college_dept_pairs[0]
116
+
117
+ # add last current college
118
+ college_dept_dict |= {
119
+ current_college[0]: {
120
+ current_college[0]: current_college[1]
121
+ }
122
+ }
123
+ # add college departments
124
+ college_dept_dict[current_college[0]
125
+ ] |= create_depts_dict(depts)
126
+
127
+ return college_dept_dict
128
+
129
+
130
+ def extract_course_info(data, college, dept, prefix):
131
+ prefix = prefix.lower()
132
+ url = f'{BASE_COURSE_CATALOG_URL}/{prefix}'
133
+ page = requests.get(url)
134
+
135
+ # scrape data
136
+ soup = BeautifulSoup(page.content, "html.parser")
137
+ courses = soup.find_all("div", class_="courseblock")
138
+
139
+ if (college == dept):
140
+ dept = f'{dept} Dept'
141
+
142
+ for c in courses:
143
+ course_name: List[str] = c.find(
144
+ "p", class_="courseblocktitle").strong.contents[0].split(".")
145
+ course_num: str = course_name[0].replace(
146
+ "\xa0", "-").strip() # replace nonbreaking space
147
+ name: str = course_name[1].strip()
148
+ units: str = c.find("span", class_="courseblockhours").text.strip()
149
+ description: str = c.find(
150
+ "div", class_="courseblockdesc").p.text.strip()
151
+ data.append([college, dept, course_num,
152
+ name, units, description, college+dept, dept+course_num])
153
+
154
+
155
+ def scrape_courses(prefixes_dict):
156
+ data = []
157
+ for college in prefixes_dict.keys():
158
+ for dept in prefixes_dict[college]:
159
+ for prefix in prefixes_dict[college][dept]:
160
+ if(prefix):
161
+ extract_course_info(data, college, dept, prefix)
162
+ return data
163
+
164
+
165
+ def build_df(data):
166
+ column_names = ["College", "Dept", "Course Prefix",
167
+ "Course Name", "Units", "Description", "College+Dept", "Dept+CourseNum"]
168
+ df = pd.DataFrame(data, columns=column_names)
169
+ return df
170
+
171
+
172
+ def find_match(course_list, matrix):
173
+ for row in matrix.index:
174
+ for col in matrix.columns:
175
+ match = not(course_list[(course_list['College+Dept'] == row+col)
176
+ ].empty) or not(course_list[(course_list['Dept+CourseNum'] == row+col)].empty)
177
+ if(match):
178
+ matrix.loc[row, col] = 1
179
+ # print(row, col)
180
+ return
181
+
182
+
183
+ def build_adj_matrix(course_list: pd.DataFrame):
184
+ colleges = list(course_list["College"].unique())
185
+ depts = list(course_list["Dept"].unique())
186
+ courses = list(course_list["Course Prefix"].unique())
187
+
188
+ indices = [(1, college) for college in colleges] + \
189
+ [(2, dept) for dept in depts] + [(3, course) for course in courses]
190
+
191
+ multi_index = pd.MultiIndex.from_tuples(indices)
192
+ adj_matrix = pd.DataFrame(index=multi_index, columns=multi_index).fillna(0)
193
+
194
+ # grab necessary sections
195
+ colleges_to_depts = adj_matrix.loc[1, 2]
196
+ depts_to_courses = adj_matrix.loc[2, 3]
197
+
198
+ # mark matches
199
+ find_match(course_list, colleges_to_depts)
200
+ find_match(course_list, depts_to_courses)
201
+ return adj_matrix
202
+
203
+
204
+ if __name__ == "__main__":
205
+ prefixes_dict = scrape_course_prefixes()
206
+ courses = scrape_courses(prefixes_dict)
207
+ course_list = build_df(courses)
208
+ course_list.to_csv(FILE_NAME, index=False)
209
+
210
+ # course_list = pd.read_csv(FILE_NAME)
211
+
212
+ # adj_matrix = build_adj_matrix(course_list)
213
+ # adj_matrix.to_csv(MATRIX_FILE_NAME)
preprocessing/format_input_data.py ADDED
@@ -0,0 +1,48 @@
1
+
2
+ import pandas as pd
3
+ from nltk.tokenize import sent_tokenize
4
+ from build_data_dict import build_course_program_dict
5
+
6
+ OUTPUT_FILE = "course_sentences.csv"
7
+
8
+ invalid_phrases = [line.rstrip('\n') for line in open(
9
+ 'stopwords/invalid_description_phrases.txt')] # Load .txt file line by line
10
+
11
+ def is_valid_sentence(sentence):
12
+ if sentence == "":
13
+ return False
14
+ return all(phrase not in sentence.lower() for phrase in invalid_phrases)
15
+
16
+ if __name__ == "__main__":
17
+ courses_df = pd.read_csv("courses.csv")
18
+ course_program_dict = build_course_program_dict()
19
+
20
+ rows = []
21
+ for course, programs in course_program_dict.items():
22
+ # only capture unique courses
23
+ if (len(programs) > 1):
24
+ continue
25
+
26
+ course_row = courses_df.loc[courses_df['Course Prefix'] == course]
27
+
28
+ if(len(course_row["Description"].values) == 0):
29
+ continue
30
+
31
+ course_description = course_row["Description"].values[0]
32
+ sentences = sent_tokenize(course_description)
33
+ sentences = [sentence.strip() for sentence in sentences if is_valid_sentence(sentence)]
34
+
35
+
36
+ # if a course belongs to more than one program, use the department as the program
37
+ if len(programs) > 1:
38
+ dept = course_row["Dept"].values[0]
39
+ for sentence in sentences:
40
+ rows.append([sentence, course, dept])
41
+ continue
42
+ else:
43
+ for program in programs:
44
+ for sentence in sentences:
45
+ rows.append([sentence, course, program])
46
+
47
+ output_df = pd.DataFrame(rows, columns=["sentence", "course", "program"])
48
+ output_df.to_csv(OUTPUT_FILE, index=False)
preprocessing/helper.py ADDED
@@ -0,0 +1,57 @@
1
+ import re
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import Tuple, List
5
+ from sklearn.metrics import confusion_matrix
6
+ import seaborn as sns
7
+
8
+ PROGRAM = "Program"
9
+
10
+
11
+ def clean_text(text):
12
+ text_input = re.sub('[^a-zA-Z1-9]+', ' ', str(text))
13
+ output = re.sub(r'\d+', '', text_input)
14
+ return output.lower().strip()
15
+
16
+
17
+ def get_num_courses_per_program():
18
+ df = pd.read_csv('program_courses.csv')
19
+ return df.groupby([PROGRAM])[PROGRAM].count()
20
+
21
+
22
+ def load_data(num_majors=20, include_majors=[]) -> Tuple[List[str], np.ndarray]:
23
+ """
24
+ Loads and preprocesses `course_sentences` data.
25
+ """
26
+ courses = pd.read_csv("course_sentences.csv").drop(["course"], axis=1).dropna()
27
+ descriptions = pd.read_csv("program_descriptions.csv").rename(columns={"description": "sentence"}).dropna()
28
+ df = pd.concat([courses, descriptions], axis=0, ignore_index=True)
29
+ majors = list(df.groupby("program").count().sort_values(by=["sentence"], ascending=False).index)
30
+ majors = include_majors + majors
31
+ majors = majors[:num_majors]
32
+ df = df[df["program"].isin(majors)]
33
+ sentences = list(df["sentence"])
34
+ labels = np.array(df["program"])
35
+
36
+ return sentences, labels
37
+
38
+ def plot_confusion_matrix(y_true:List[str], y_pred:List[str], classes:List[str]):
39
+ """Plots a confusion matrix"""
40
+ cm = confusion_matrix(y_true, y_pred, labels=classes)
41
+ cm_df=pd.DataFrame(data=cm, index=classes, columns=classes)
42
+ sns.heatmap(cm_df, annot=True)
43
+
44
+
45
+
46
+ def get_recommendations(probs:np.ndarray, labels:List[str], n=5) -> List[List[str]]:
47
+ """
48
+ Args:
49
+ `probs`: predictions array of shape (n_inputs,n_classes)
50
+ `labels`: class labels of shape (n_classes,)
51
+ `n`: number of recommendations
52
+ Returns:
53
+ Top labels based on a probability distribution
54
+ """
55
+ np_labels = np.array(labels)
56
+ return np_labels[(-probs).argsort(-1)[:,:n]]
57
+
test.py ADDED
@@ -0,0 +1,84 @@
1
+ from classifiers.mlp import MajorMlpClassifier
2
+ from embeddings.bert import BertSentenceEmbedder
3
+ from sklearn.neighbors import KNeighborsClassifier
4
+ from sklearn.neural_network import MLPClassifier
5
+ from classifiers.bert import BertClassifier
6
+ import pandas as pd
7
+ import numpy as np
8
+ from typing import Tuple
9
+ from sklearn.model_selection import train_test_split, cross_val_score
10
+ from sklearn.metrics import classification_report
11
+ from preprocessing.helper import load_data, get_recommendations, plot_confusion_matrix
12
+ import matplotlib.pyplot as plt
13
+ import os
14
+
15
+ device = "mps"
16
+
17
+
18
+ def evaluate(load_weights=False):
19
+ """
20
+ Performs basic train/test split evaluation.
21
+ """
22
+ os.makedirs("figures", exist_ok=True)
23
+ sentences, labels = load_data(num_majors=40)
24
+ embedder = BertSentenceEmbedder(device, padding_length=1000)
25
+
26
+ seed = 2
27
+ x_train, x_test, y_train, y_test = train_test_split(
28
+ sentences, labels, random_state=seed, shuffle=True, train_size=0.8
29
+ )
30
+ train_embeddings = embedder.transform(x_train)
31
+ test_embeddings = embedder.transform(x_test)
32
+ knn = KNeighborsClassifier()
33
+ mlp = MajorMlpClassifier(device)
34
+ bert_classifier = BertClassifier(
35
+ device=device,
36
+ epochs=25,
37
+ )
38
+
39
+ if load_weights:
40
+ mlp.load_weights("weights/major_classifier")
41
+ bert_classifier.load_weights("weights/bert_classifier_deployment_weights")
42
+ else:
43
+ bert_classifier.fit(x_train, y_train)
44
+ mlp.fit(train_embeddings, y_train)
45
+ knn.fit(train_embeddings, y_train)
46
+ class_labels = np.array(bert_classifier.labels)
47
+
48
+ def report(name, classifier, x, y, n=3):
49
+ probs = classifier.predict_proba(x)
50
+ ordered_choices = class_labels[(-probs).argsort(-1)[:, :n]]
51
+ preds = ordered_choices[:, 0]
52
+ print(name)
53
+ print(
54
+ f"Top {n} accuracy",
55
+ np.mean([label in choices for label, choices in zip(y, ordered_choices)]),
56
+ )
57
+ print(classification_report(y, preds))
58
+ plot_confusion_matrix(y, preds, class_labels)
59
+ plt.savefig(f"figures/{name}_cm.png")
60
+ plt.clf()
61
+
62
+ report("bert_classifier", bert_classifier, x_test, y_test)
63
+ report("KNN", knn, test_embeddings, y_test)
64
+ report("major_mlp", mlp, test_embeddings, y_test)
65
+
66
+
67
+ def demo():
68
+ """
69
+ Interact with a model on the command line.
70
+ """
71
+ bert_classifier = BertClassifier(device="mps")
72
+ weights_path = os.path.join("weights", "bert_classifier_deployment_weights")
73
+ bert_classifier.load_weights(weights_path)
74
+ while True:
75
+ command = input("Describe your ideal major: ")
76
+ if command.lower() == "q" or command.lower() == "quit":
77
+ break
78
+ probs = bert_classifier.predict_proba(command)
79
+ labels = bert_classifier.labels
80
+ print(get_recommendations(probs, labels, n=3)[0])
81
+
82
+
83
+ if __name__ == "__main__":
84
+ evaluate()
train.py ADDED
@@ -0,0 +1,34 @@
1
+ from classifiers.bert import BertClassifier
2
+ from classifiers.mlp import MajorMlpClassifier
3
+ from embeddings.bert import BertSentenceEmbedder
4
+ import pickle
5
+ from preprocessing.helper import load_data
6
+
7
+
8
+ def train_bert_classifier(
9
+ device="cpu",
10
+ n_classes=40,
11
+ include_majors=[],
12
+ epochs=25
13
+ ):
14
+ sentences, labels = load_data(num_majors=n_classes, include_majors=include_majors)
15
+ bert_classifier = BertClassifier(device=device, epochs=epochs)
16
+ bert_classifier.fit(sentences, labels)
17
+
18
+
19
+ def train_major_classifier(
20
+ device="cpu",
21
+ n_classes=40,
22
+ include_majors=[],
23
+ epochs=200
24
+ ):
25
+ sentences, labels = load_data(num_majors=n_classes, include_majors=include_majors)
26
+ embedder = BertSentenceEmbedder(device, padding_length=1000)
27
+ embeddings = embedder.transform(sentences)
28
+ mlp = MajorMlpClassifier(device, epochs=epochs)
29
+ mlp.fit(embeddings,labels)
30
+ mlp.save_weights("weights/major_classifier")
31
+
32
+
33
+ if __name__ == "__main__":
34
+ train_major_classifier(device="mps", include_majors=["Computer Science", "Computer Engineering"])
weights/bert_classifier_deployment_weights/config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537432f763d5b9dcbe88df36b8b8e05196a0a2369cd24056fced3b578e1a0da3
3
+ size 3231
weights/bert_classifier_deployment_weights/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0191eb7aa75c118406fed2834d7ffed70c6c5a92afeee8f9962c7e193d11b75
3
+ size 535931845
weights/bert_classifier_deployment_weights/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0699112fa18348be8faf08746a35633e3aad9b3d2c4a1f3e77cf530d098bf53f
3
+ size 267970989
weights/bert_classifier_deployment_weights/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ded3012689b6a3d27eba5707e4bd84c34d9b7ae843c6db5691d7fb371280cc8
3
+ size 13553
weights/bert_classifier_deployment_weights/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1010d34b4b970964c787c0e304bc7e95971298d58fc528d14dc0f6e8fa7e18c8
3
+ size 627
weights/bert_classifier_deployment_weights/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
3
+ size 125
weights/bert_classifier_deployment_weights/tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e21aa12acbe7b5d48e4778418cf4976257693629e5e35fdb814273573bc31a4
3
+ size 711649
weights/bert_classifier_deployment_weights/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a6acd3261769a08abd6fa132a83c8b8a2f3ae30f15fe5c0154dbb2a2bc9e4b3
3
+ size 360
weights/bert_classifier_deployment_weights/trainer_state.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9775b1c63a50d33ae4a9bd72253706bdf5183b042c9f21790fb4b40939b57ea2
3
+ size 7812
weights/bert_classifier_deployment_weights/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01fe6269350128c348b6e9ab911302385ab3aa80616ece458d16e5e5f5cd202f
3
+ size 3387
weights/bert_classifier_deployment_weights/vocab.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
3
+ size 231508
weights/major_classifier/config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee3f0b8c7edb437fc01437c125ebcd594838edf1f63d8f4eb33fb64b680e6e9f
3
+ size 1017
weights/major_classifier/weights.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a001e177107249724e77823b8b26b3b7c5a0548d56244b46fffe346e4e0e896
3
+ size 7024568