Nigl

File size: 20,291 Bytes

8176fea

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "998997dd",
   "metadata": {},
   "source": [
    "# Modeling NCAA Tournament Basketball games\n",
    "\n",
    "The thought process is to build a neural network that can predict a team's tournament <br>\n",
    "performance on a per-game basis. Then we can use these predicted metrics to run a Monte Carlo <br>\n",
    "style simulation and select whichever team is most likely to win. <br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f0ec30d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "\n",
    "# pick the fastest backend torch reports as usable on this machine\n",
    "def get_device() -> str:\n",
    "    \"\"\"Return the preferred torch device name: cuda first, then mps, otherwise cpu.\"\"\"\n",
    "    if torch.cuda.is_available():\n",
    "        return \"cuda\"\n",
    "    # MPS is only consulted when CUDA is unavailable\n",
    "    return \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
    "\n",
    "# MPS was not working correctly on the author's M1 MacBook Air, so the device\n",
    "# is pinned to the CPU for now; the auto-detecting call is kept below for reference.\n",
    "# DEVICE = get_device()\n",
    "DEVICE = \"cpu\"\n",
    "\n",
    "# universal data directory for this project (one level above the notebook's working directory)\n",
    "DATA_DIR = os.path.join(\"..\", \"data\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b820f210",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 655 entries, 0 to 654\n",
      "Columns: 1068 entries, Unnamed: 0 to Seed\n",
      "dtypes: float64(672), int64(388), object(8)\n",
      "memory usage: 5.3+ MB\n"
     ]
    }
   ],
   "source": [
    "# Load the aggregated games table from DATA_DIR, then print a summary\n",
    "# (row range, column count, dtype counts, memory usage).\n",
    "all_games_df = pd.read_csv(os.path.join(DATA_DIR, \"MDetailedAggregatedGames.csv\"))\n",
    "all_games_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "02ebc500",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Season</th>\n",
       "      <th>DayNum</th>\n",
       "      <th>WTeamID</th>\n",
       "      <th>WScore</th>\n",
       "      <th>LTeamID</th>\n",
       "      <th>LScore</th>\n",
       "      <th>WLoc</th>\n",
       "      <th>NumOT</th>\n",
       "      <th>WFGM</th>\n",
       "      <th>...</th>\n",
       "      <th>tourney_DR_max</th>\n",
       "      <th>tourney_DR_mean</th>\n",
       "      <th>tourney_DR_median</th>\n",
       "      <th>tourney_DR_std</th>\n",
       "      <th>tourney_DR_sum</th>\n",
       "      <th>ConfAbbrev</th>\n",
       "      <th>TeamName</th>\n",
       "      <th>FirstD1Season</th>\n",
       "      <th>LastD1Season</th>\n",
       "      <th>Seed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2003</td>\n",
       "      <td>40</td>\n",
       "      <td>1266</td>\n",
       "      <td>63</td>\n",
       "      <td>1458</td>\n",
       "      <td>54</td>\n",
       "      <td>H</td>\n",
       "      <td>0</td>\n",
       "      <td>24</td>\n",
       "      <td>...</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>21.666667</td>\n",
       "      <td>big_ten</td>\n",
       "      <td>Wisconsin</td>\n",
       "      <td>1985</td>\n",
       "      <td>2024</td>\n",
       "      <td>Y05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5</td>\n",
       "      <td>2003</td>\n",
       "      <td>97</td>\n",
       "      <td>1266</td>\n",
       "      <td>68</td>\n",
       "      <td>1448</td>\n",
       "      <td>61</td>\n",
       "      <td>H</td>\n",
       "      <td>0</td>\n",
       "      <td>21</td>\n",
       "      <td>...</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>acc</td>\n",
       "      <td>Wake Forest</td>\n",
       "      <td>1985</td>\n",
       "      <td>2024</td>\n",
       "      <td>W02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>2003</td>\n",
       "      <td>115</td>\n",
       "      <td>1266</td>\n",
       "      <td>78</td>\n",
       "      <td>1257</td>\n",
       "      <td>73</td>\n",
       "      <td>A</td>\n",
       "      <td>0</td>\n",
       "      <td>26</td>\n",
       "      <td>...</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>cusa</td>\n",
       "      <td>Louisville</td>\n",
       "      <td>1985</td>\n",
       "      <td>2024</td>\n",
       "      <td>W04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>2003</td>\n",
       "      <td>138</td>\n",
       "      <td>1266</td>\n",
       "      <td>101</td>\n",
       "      <td>1281</td>\n",
       "      <td>92</td>\n",
       "      <td>N</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>...</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>big_twelve</td>\n",
       "      <td>Missouri</td>\n",
       "      <td>1985</td>\n",
       "      <td>2024</td>\n",
       "      <td>Y06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>19</td>\n",
       "      <td>2003</td>\n",
       "      <td>143</td>\n",
       "      <td>1266</td>\n",
       "      <td>77</td>\n",
       "      <td>1338</td>\n",
       "      <td>74</td>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "      <td>28</td>\n",
       "      <td>...</td>\n",
       "      <td>21.333333</td>\n",
       "      <td>21.333333</td>\n",
       "      <td>21.333333</td>\n",
       "      <td>21.333333</td>\n",
       "      <td>21.333333</td>\n",
       "      <td>big_east</td>\n",
       "      <td>Pittsburgh</td>\n",
       "      <td>1985</td>\n",
       "      <td>2024</td>\n",
       "      <td>Y02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1068 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  \\\n",
       "0           0    2003      40     1266      63     1458      54    H      0   \n",
       "1           5    2003      97     1266      68     1448      61    H      0   \n",
       "2           9    2003     115     1266      78     1257      73    A      0   \n",
       "3          12    2003     138     1266     101     1281      92    N      1   \n",
       "4          19    2003     143     1266      77     1338      74    N      0   \n",
       "\n",
       "   WFGM  ...  tourney_DR_max  tourney_DR_mean  tourney_DR_median  \\\n",
       "0    24  ...       21.666667        21.666667          21.666667   \n",
       "1    21  ...       26.000000        26.000000          26.000000   \n",
       "2    26  ...       24.000000        24.000000          24.000000   \n",
       "3    35  ...       26.000000        26.000000          26.000000   \n",
       "4    28  ...       21.333333        21.333333          21.333333   \n",
       "\n",
       "   tourney_DR_std  tourney_DR_sum  ConfAbbrev     TeamName  FirstD1Season  \\\n",
       "0       21.666667       21.666667     big_ten    Wisconsin           1985   \n",
       "1       26.000000       26.000000         acc  Wake Forest           1985   \n",
       "2       24.000000       24.000000        cusa   Louisville           1985   \n",
       "3       26.000000       26.000000  big_twelve     Missouri           1985   \n",
       "4       21.333333       21.333333    big_east   Pittsburgh           1985   \n",
       "\n",
       "   LastD1Season  Seed  \n",
       "0          2024   Y05  \n",
       "1          2024   W02  \n",
       "2          2024   W04  \n",
       "3          2024   Y06  \n",
       "4          2024   Y02  \n",
       "\n",
       "[5 rows x 1068 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the loaded table (rich display of the last expression)\n",
    "all_games_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58e4fee8",
   "metadata": {},
   "source": [
    "# Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1251726e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Targets: the four tournament scoring statistics the network has to predict.\n",
    "target_df = all_games_df[\n",
    "    [\"tourney_Score_mean\", \"tourney_Score_std\", \"tourney_Score_max\", \"tourney_Score_min\"]\n",
    "]\n",
    "\n",
    "# Features: every column whose name starts with \"reg\" (presumably regular-season\n",
    "# aggregates), skipping any name that contains \"_W\", \"_L\" or \"sum\".\n",
    "features_df = all_games_df[\n",
    "    [\n",
    "        name\n",
    "        for name in all_games_df.columns\n",
    "        if name.startswith(\"reg\")\n",
    "        and not any(marker in name for marker in (\"_W\", \"_L\", \"sum\"))\n",
    "    ]\n",
    "]\n",
    "\n",
    "# Keep 80% of the rows for training; the fixed random_state makes the split repeatable.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    features_df.astype(float),\n",
    "    target_df.astype(float),\n",
    "    train_size=0.8,\n",
    "    random_state=8,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "28478189",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 524 entries, 5 to 451\n",
      "Data columns (total 71 columns):\n",
      " #   Column            Non-Null Count  Dtype  \n",
      "---  ------            --------------  -----  \n",
      " 0   reg_Games         524 non-null    float64\n",
      " 1   reg_Score_min     524 non-null    float64\n",
      " 2   reg_Score_max     524 non-null    float64\n",
      " 3   reg_Score_mean    524 non-null    float64\n",
      " 4   reg_Score_median  524 non-null    float64\n",
      " 5   reg_Score_std     524 non-null    float64\n",
      " 6   reg_FGM_min       524 non-null    float64\n",
      " 7   reg_FGM_max       524 non-null    float64\n",
      " 8   reg_FGM_mean      524 non-null    float64\n",
      " 9   reg_FGM_median    524 non-null    float64\n",
      " 10  reg_FGM_std       524 non-null    float64\n",
      " 11  reg_FGA_min       524 non-null    float64\n",
      " 12  reg_FGA_max       524 non-null    float64\n",
      " 13  reg_FGA_mean      524 non-null    float64\n",
      " 14  reg_FGA_median    524 non-null    float64\n",
      " 15  reg_FGA_std       524 non-null    float64\n",
      " 16  reg_FTM_min       524 non-null    float64\n",
      " 17  reg_FTM_max       524 non-null    float64\n",
      " 18  reg_FTM_mean      524 non-null    float64\n",
      " 19  reg_FTM_median    524 non-null    float64\n",
      " 20  reg_FTM_std       524 non-null    float64\n",
      " 21  reg_FTA_min       524 non-null    float64\n",
      " 22  reg_FTA_max       524 non-null    float64\n",
      " 23  reg_FTA_mean      524 non-null    float64\n",
      " 24  reg_FTA_median    524 non-null    float64\n",
      " 25  reg_FTA_std       524 non-null    float64\n",
      " 26  reg_Ast_min       524 non-null    float64\n",
      " 27  reg_Ast_max       524 non-null    float64\n",
      " 28  reg_Ast_mean      524 non-null    float64\n",
      " 29  reg_Ast_median    524 non-null    float64\n",
      " 30  reg_Ast_std       524 non-null    float64\n",
      " 31  reg_Blk_min       524 non-null    float64\n",
      " 32  reg_Blk_max       524 non-null    float64\n",
      " 33  reg_Blk_mean      524 non-null    float64\n",
      " 34  reg_Blk_median    524 non-null    float64\n",
      " 35  reg_Blk_std       524 non-null    float64\n",
      " 36  reg_PF_min        524 non-null    float64\n",
      " 37  reg_PF_max        524 non-null    float64\n",
      " 38  reg_PF_mean       524 non-null    float64\n",
      " 39  reg_PF_median     524 non-null    float64\n",
      " 40  reg_PF_std        524 non-null    float64\n",
      " 41  reg_Stl_min       524 non-null    float64\n",
      " 42  reg_Stl_max       524 non-null    float64\n",
      " 43  reg_Stl_mean      524 non-null    float64\n",
      " 44  reg_Stl_median    524 non-null    float64\n",
      " 45  reg_Stl_std       524 non-null    float64\n",
      " 46  reg_TO_min        524 non-null    float64\n",
      " 47  reg_TO_max        524 non-null    float64\n",
      " 48  reg_TO_mean       524 non-null    float64\n",
      " 49  reg_TO_median     524 non-null    float64\n",
      " 50  reg_TO_std        524 non-null    float64\n",
      " 51  reg_FGM3_min      524 non-null    float64\n",
      " 52  reg_FGM3_max      524 non-null    float64\n",
      " 53  reg_FGM3_mean     524 non-null    float64\n",
      " 54  reg_FGM3_median   524 non-null    float64\n",
      " 55  reg_FGM3_std      524 non-null    float64\n",
      " 56  reg_FGA3_min      524 non-null    float64\n",
      " 57  reg_FGA3_max      524 non-null    float64\n",
      " 58  reg_FGA3_mean     524 non-null    float64\n",
      " 59  reg_FGA3_median   524 non-null    float64\n",
      " 60  reg_FGA3_std      524 non-null    float64\n",
      " 61  reg_OR_min        524 non-null    float64\n",
      " 62  reg_OR_max        524 non-null    float64\n",
      " 63  reg_OR_mean       524 non-null    float64\n",
      " 64  reg_OR_median     524 non-null    float64\n",
      " 65  reg_OR_std        524 non-null    float64\n",
      " 66  reg_DR_min        524 non-null    float64\n",
      " 67  reg_DR_max        524 non-null    float64\n",
      " 68  reg_DR_mean       524 non-null    float64\n",
      " 69  reg_DR_median     524 non-null    float64\n",
      " 70  reg_DR_std        524 non-null    float64\n",
      "dtypes: float64(71)\n",
      "memory usage: 294.8 KB\n"
     ]
    }
   ],
   "source": [
    "# Sanity-check the training features: column names, non-null counts and dtypes\n",
    "X_train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "04f4a0a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 524 entries, 5 to 451\n",
      "Data columns (total 4 columns):\n",
      " #   Column              Non-Null Count  Dtype  \n",
      "---  ------              --------------  -----  \n",
      " 0   tourney_Score_mean  524 non-null    float64\n",
      " 1   tourney_Score_std   524 non-null    float64\n",
      " 2   tourney_Score_max   524 non-null    float64\n",
      " 3   tourney_Score_min   524 non-null    float64\n",
      "dtypes: float64(4)\n",
      "memory usage: 20.5 KB\n"
     ]
    }
   ],
   "source": [
    "# Sanity-check the training targets: column names, non-null counts and dtypes\n",
    "y_train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "40094cd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _frame_to_tensor(frame: pd.DataFrame) -> torch.Tensor:\n",
    "    \"\"\"Copy a numeric DataFrame into a float32 tensor and move it to DEVICE.\n",
    "\n",
    "    Uses torch.tensor with an explicit dtype instead of the legacy\n",
    "    torch.Tensor(...) constructor; the resulting values are the same.\n",
    "    \"\"\"\n",
    "    return torch.tensor(frame.to_numpy(), dtype=torch.float32).to(DEVICE)\n",
    "\n",
    "\n",
    "# convert all datasets into tensors and register them\n",
    "# with the device (cuda, mps or cpu)\n",
    "X_trainT = _frame_to_tensor(X_train)\n",
    "X_testT = _frame_to_tensor(X_test)\n",
    "y_trainT = _frame_to_tensor(y_train)\n",
    "y_testT = _frame_to_tensor(y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20bceb9a",
   "metadata": {},
   "source": [
    "# Building Neural Network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7b0573ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of feature columns in X_train; used as the width of the network's input layer\n",
    "num_features = len(X_train.columns)\n",
    "\n",
    "class MadnessNN(nn.Module):\n",
    "    \"\"\"Fully connected regression network for the tournament scoring targets.\n",
    "\n",
    "    The input is narrowed through hidden layers of 64, 32, 16 and 8 units, each\n",
    "    followed by ReLU, and ends in a plain linear head. The head is deliberately\n",
    "    left without an activation: a ReLU there clamps predictions at zero and an\n",
    "    output unit whose pre-activation goes negative stops receiving gradient.\n",
    "\n",
    "    Args:\n",
    "        in_features: Number of input columns. When omitted, the notebook-level\n",
    "            ``num_features`` is used, so ``MadnessNN()`` keeps working.\n",
    "        out_features: Number of values predicted per row (default 4).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, in_features=None, out_features: int = 4) -> None:\n",
    "        super().__init__()\n",
    "        if in_features is None:\n",
    "            # fall back to the notebook global so existing no-argument calls still work\n",
    "            in_features = num_features\n",
    "        # layers are created in the original order so seeded initialisation is unchanged\n",
    "        self.input_layer = nn.Linear(in_features, 64)\n",
    "        self.activation_func = nn.ReLU()\n",
    "        self.layer1 = nn.Linear(64, 32)\n",
    "        self.layer2 = nn.Linear(32, 16)\n",
    "        self.layer3 = nn.Linear(16, 8)\n",
    "        self.output_layer = nn.Linear(8, out_features)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Run the hidden stack with ReLU and return the raw output of the linear head.\"\"\"\n",
    "        for hidden in (self.input_layer, self.layer1, self.layer2, self.layer3):\n",
    "            x = self.activation_func(hidden(x))\n",
    "        # no activation on the regression head\n",
    "        return self.output_layer(x)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "061e2b52",
   "metadata": {},
   "source": [
    "# Training Loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "db035b9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[500 / 5000] Loss = 40.454681396484375\n",
      "[1000 / 5000] Loss = 39.701454162597656\n",
      "[1500 / 5000] Loss = 39.055484771728516\n",
      "[2000 / 5000] Loss = 38.53948974609375\n",
      "[2500 / 5000] Loss = 38.149085998535156\n",
      "[3000 / 5000] Loss = 37.87413024902344\n",
      "[3500 / 5000] Loss = 37.6934928894043\n",
      "[4000 / 5000] Loss = 37.573673248291016\n",
      "[4500 / 5000] Loss = 37.48927307128906\n",
      "[5000 / 5000] Loss = 37.43183135986328\n"
     ]
    }
   ],
   "source": [
    "# fixed seed so weight initialisation (and therefore the run) is repeatable\n",
    "torch.manual_seed(1)\n",
    "\n",
    "# the model must live on the same device as the training tensors,\n",
    "# otherwise switching DEVICE away from \"cpu\" fails on the first forward pass\n",
    "model5000 = MadnessNN().to(DEVICE)\n",
    "optimizer = optim.Adam(model5000.parameters(), lr=0.001)\n",
    "loss_fn = nn.MSELoss()\n",
    "epochs = 5000\n",
    "\n",
    "model5000.train()\n",
    "for epoch in range(1, epochs + 1):\n",
    "    # full-batch update: each epoch is one gradient step on the whole training set\n",
    "    optimizer.zero_grad()\n",
    "    pred = model5000(X_trainT)\n",
    "    loss = loss_fn(pred, y_trainT)\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    if epoch % 500 == 0:\n",
    "        # .item() yields the plain Python float; the printed text is unchanged\n",
    "        print(f\"[{epoch} / {epochs}] Loss = {loss.item()}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b62fd19c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save\n",
    "# torch.save does not create missing folders, so make sure the target exists first\n",
    "os.makedirs(\"models\", exist_ok=True)\n",
    "# NOTE(review): this pickles the whole module, so loading it later needs the MadnessNN\n",
    "# class to be defined; saving model5000.state_dict() would be the more portable format.\n",
    "torch.save(model5000, os.path.join(\"models\", \"model5000.pth\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "17694dc7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MSE on testing data: 47.071144104003906\n"
     ]
    }
   ],
   "source": [
    "# evaluate\n",
    "model5000.eval()  # put the module in evaluation mode before scoring\n",
    "\n",
    "# gradients are not needed for scoring, so autograd tracking is switched off\n",
    "with torch.no_grad():\n",
    "    pred = model5000(X_testT)\n",
    "    loss = loss_fn(pred, y_testT)\n",
    "\n",
    "print(f\"MSE on testing data: {loss}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}