# Modeling NCAA Tournament Basketball games

The thought process is to build a neural network that can predict a team's tournament <br>
performance on a per-game basis. Then we can use these predicted metrics to run a Monte Carlo-style <br>
simulation and select whichever team is most likely to win. <br>

In [1]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import os


# Check whether an accelerator (GPU) is available for faster training.
def get_device() -> str:
    """Return the name of the preferred torch device.

    Preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        One of "cuda", "mps" or "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    # torch.backends.mps is missing on PyTorch builds that predate the MPS
    # backend; look it up defensively so those installs fall back to the CPU
    # instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"

# The MPS backend misbehaves on the author's M1 MacBook Air, so training is
# pinned to the CPU for now. To re-enable auto-detection use:
#     DEVICE = get_device()
DEVICE = "cpu"

# Single shared data directory for the whole project (one level above the notebook).
DATA_DIR = os.path.join(os.pardir, "data")

In [2]:
# Read the aggregated detailed-games table from the project data directory.
games_csv_path = os.path.join(DATA_DIR, "MDetailedAggregatedGames.csv")
all_games_df = pd.read_csv(games_csv_path)

# Print a summary (row count, column count, dtypes, memory) of what was loaded.
all_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655 entries, 0 to 654
Columns: 1068 entries, Unnamed: 0 to Seed
dtypes: float64(672), int64(388), object(8)
memory usage: 5.3+ MB


In [3]:
# Preview the first rows of the loaded games table.
all_games_df.head()

Unnamed: 0.1,Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,...,tourney_DR_max,tourney_DR_mean,tourney_DR_median,tourney_DR_std,tourney_DR_sum,ConfAbbrev,TeamName,FirstD1Season,LastD1Season,Seed
0,0,2003,40,1266,63,1458,54,H,0,24,...,21.666667,21.666667,21.666667,21.666667,21.666667,big_ten,Wisconsin,1985,2024,Y05
1,5,2003,97,1266,68,1448,61,H,0,21,...,26.0,26.0,26.0,26.0,26.0,acc,Wake Forest,1985,2024,W02
2,9,2003,115,1266,78,1257,73,A,0,26,...,24.0,24.0,24.0,24.0,24.0,cusa,Louisville,1985,2024,W04
3,12,2003,138,1266,101,1281,92,N,1,35,...,26.0,26.0,26.0,26.0,26.0,big_twelve,Missouri,1985,2024,Y06
4,19,2003,143,1266,77,1338,74,N,0,28,...,21.333333,21.333333,21.333333,21.333333,21.333333,big_east,Pittsburgh,1985,2024,Y02


# Feature Selection

In [4]:
# Regression targets: the four tourney_Score_* summary columns.
target_columns = [
    "tourney_Score_mean",
    "tourney_Score_std",
    "tourney_Score_max",
    "tourney_Score_min",
]
target_df = all_games_df[target_columns]

# Features: every "reg"-prefixed column whose name contains none of these tags.
excluded_tags = ("_W", "_L", "sum")
feature_columns = [
    name
    for name in all_games_df.columns
    if name.startswith("reg") and not any(tag in name for tag in excluded_tags)
]
features_df = all_games_df[feature_columns]
# Alternative kept from the original cell: restrict to numeric dtypes.
# features_df = features_df.select_dtypes(include="number")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_df.astype(float),
    target_df.astype(float),
    random_state=8,
    train_size=0.8,
)

In [5]:
# Summarise the training feature frame: columns, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524 entries, 5 to 451
Data columns (total 71 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   reg_Games         524 non-null    float64
 1   reg_Score_min     524 non-null    float64
 2   reg_Score_max     524 non-null    float64
 3   reg_Score_mean    524 non-null    float64
 4   reg_Score_median  524 non-null    float64
 5   reg_Score_std     524 non-null    float64
 6   reg_FGM_min       524 non-null    float64
 7   reg_FGM_max       524 non-null    float64
 8   reg_FGM_mean      524 non-null    float64
 9   reg_FGM_median    524 non-null    float64
 10  reg_FGM_std       524 non-null    float64
 11  reg_FGA_min       524 non-null    float64
 12  reg_FGA_max       524 non-null    float64
 13  reg_FGA_mean      524 non-null    float64
 14  reg_FGA_median    524 non-null    float64
 15  reg_FGA_std       524 non-null    float64
 16  reg_FTM_min       524 non-null    float64
 1

In [6]:
# Summarise the training target frame: columns, non-null counts and dtypes.
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524 entries, 5 to 451
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_Score_mean  524 non-null    float64
 1   tourney_Score_std   524 non-null    float64
 2   tourney_Score_max   524 non-null    float64
 3   tourney_Score_min   524 non-null    float64
dtypes: float64(4)
memory usage: 20.5 KB


In [7]:
# Convert all datasets into float32 tensors placed on the configured
# device (cuda, mps or cpu).
def _frame_to_tensor(frame: pd.DataFrame) -> torch.Tensor:
    """Return the values of ``frame`` as a float32 tensor on ``DEVICE``.

    Uses ``torch.tensor`` with explicit ``dtype``/``device`` instead of the
    legacy ``torch.Tensor(...)`` constructor followed by ``.float().to(...)``,
    so dtype and placement are decided in one step.

    Args:
        frame: DataFrame whose values are all numeric.

    Returns:
        A new tensor holding a copy of the frame's values.
    """
    return torch.tensor(frame.to_numpy(), dtype=torch.float32, device=DEVICE)


X_trainT = _frame_to_tensor(X_train)
X_testT = _frame_to_tensor(X_test)
y_trainT = _frame_to_tensor(y_train)
y_testT = _frame_to_tensor(y_test)

# Building Neural Network

In [8]:
# Number of feature columns in the training frame.
num_features = X_train.shape[1]

class MadnessNN(nn.Module):
    """Fully connected regression network with four outputs.

    Layer widths are ``in_features -> 64 -> 32 -> 16 -> 8 -> 4`` and a ReLU
    follows every linear layer, including the output layer.

    Args:
        in_features: Width of the input layer. When omitted (``None``) the
            notebook-level ``num_features`` global is used, which keeps
            ``MadnessNN()`` working exactly as before.
    """

    def __init__(self, in_features=None) -> None:
        super().__init__()
        if in_features is None:
            # Backward-compatible default: read the notebook global at
            # construction time, as the original implementation did.
            in_features = num_features
        # Attribute names and creation order are kept unchanged so seeded
        # initialisation and state_dict keys stay the same.
        self.input_layer = nn.Linear(in_features, 64)
        self.activation_func = nn.ReLU()
        self.layer1 = nn.Linear(64, 32)
        self.layer2 = nn.Linear(32, 16)
        self.layer3 = nn.Linear(16, 8)
        self.output_layer = nn.Linear(8, 4)

    def forward(self, x):
        """Run ``x`` through every linear layer, applying ReLU after each.

        Args:
            x: Tensor whose last dimension equals the input layer width.

        Returns:
            Tensor with a last dimension of 4; all values are >= 0 because
            of the final ReLU.
        """
        # NOTE(review): the ReLU after the output layer clamps predictions at
        # zero and gives zero gradient for any output whose pre-activation is
        # negative. Kept to preserve existing behaviour; consider dropping it
        # for a plain regression head.
        layers = (
            self.input_layer,
            self.layer1,
            self.layer2,
            self.layer3,
            self.output_layer,
        )
        for layer in layers:
            x = self.activation_func(layer(x))
        return x


# Training Loop

In [21]:
# Fix the RNG so weight initialisation is repeatable between runs.
torch.manual_seed(1)

# Keep the model on the same device as the training tensors; without this the
# forward pass fails with a device mismatch as soon as DEVICE is not "cpu".
model5000 = MadnessNN().to(DEVICE)
optimizer = optim.Adam(lr=0.001, params=model5000.parameters())
loss_fn = nn.MSELoss()
epochs = 5000

# Each epoch is one full-batch gradient step over the whole training set.
for epoch in range(1, epochs + 1):
    # Clear gradients left over from the previous step before accumulating new ones.
    optimizer.zero_grad()
    pred = model5000(X_trainT)
    loss = loss_fn(pred, y_trainT)
    loss.backward()
    optimizer.step()

    if epoch % 500 == 0:
        # .item() extracts the Python float so a plain number is logged.
        print(f"[{epoch} / {epochs}] Loss = {loss.item()}")


[500 / 5000] Loss = 40.454681396484375
[1000 / 5000] Loss = 39.701454162597656
[1500 / 5000] Loss = 39.055484771728516
[2000 / 5000] Loss = 38.53948974609375
[2500 / 5000] Loss = 38.149085998535156
[3000 / 5000] Loss = 37.87413024902344
[3500 / 5000] Loss = 37.6934928894043
[4000 / 5000] Loss = 37.573673248291016
[4500 / 5000] Loss = 37.48927307128906
[5000 / 5000] Loss = 37.43183135986328


In [22]:
# save
# torch.save does not create missing directories, so make sure the target exists.
os.makedirs("models", exist_ok=True)
# NOTE(review): this pickles the whole module object, so loading it later needs
# the MadnessNN class to be importable; consider saving model5000.state_dict()
# instead if the file has to be portable.
torch.save(model5000, os.path.join("models", "model5000.pth"))

In [23]:
# evaluate
# Put the module into evaluation mode before scoring the held-out set.
model5000.eval()

# Gradients are not needed for scoring, so disable autograd tracking.
with torch.no_grad():
    pred = model5000(X_testT)
    loss = loss_fn(pred, y_testT)

print(f"MSE on testing data: {loss}")


MSE on testing data: 47.071144104003906
