Retrained the models on only tournament games, with the ChalkSeedDiff

Browse files

added as a feature. This really helped the women's neural network, but
the men's one is maybe a little bit worse.

Files changed (10) hide show

data/AllSuperDetailedGames.csv +2 -2
data/AllTeamsAgg.csv +1 -1
models/Mnn10k.pth +2 -2
models/Wnn10k.pth +2 -2
src/__pycache__/visual_eval.cpython-311.pyc +0 -0
src/baseline.ipynb +3 -3
src/nn.ipynb +0 -0
src/pre_processing.ipynb +258 -201
src/visual_eval.py +33 -0
src/visualizations.py +0 -27

data/AllSuperDetailedGames.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02656ec3af193d2e1823e9b1d1914a7293d77b957739ec6407b67b5453df7878
-size 978121784

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ec95fd20de671096891e8426969303b879e0720e52e6da3dc55a0369ba98787
+size 1046244854

data/AllTeamsAgg.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1b9dae17132c76ed88c7ea1972c0149cdb954d3bad7b0fc07b90c8bda66fdce
 size 31040659

 version https://git-lfs.github.com/spec/v1
+oid sha256:bdbaeb6c905ad5f6480ffe9acb3ebb75ffe8905954826574410c0d8c94a12826
 size 31040659

models/Mnn10k.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7863257c283f71b41da57decaaad44c8175b0428e5b339252f290e9de5f58298
-size 18914

 version https://git-lfs.github.com/spec/v1
+oid sha256:500d7ddd0596b59cf7fbfab9abc7d8f50278f8269f4399e9cb5222fe843418a7
+size 19170

models/Wnn10k.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae171639c1af16b2b6a199cbd915641cd77f7f2414d81472b46cab3520c7040a
-size 18914

 version https://git-lfs.github.com/spec/v1
+oid sha256:1de0daa46afa742af4a7d31a026b54917abfca4c972546208d50b827e2fe4119
+size 19170

src/__pycache__/visual_eval.cpython-311.pyc ADDED Viewed

Binary file (2.7 kB). View file

src/baseline.ipynb CHANGED Viewed

@@ -68,7 +68,7 @@
     "# games_df[\"BaselinePrediction\"] = games_df.apply(\n",
     "#     lambda row: predict_baseline(row),\n",
     "#     axis=1,\n",
-    "# )\n"
    ]
   },
   {
@@ -461,7 +461,7 @@
     "wmns_actual_T = torch.tensor(\n",
     "    wmns_subset[\"Win\"].values,\n",
     "    dtype=torch.float32,\n",
-    ")\n"
    ]
   },
   {
@@ -513,7 +513,7 @@
     "plt.ylabel(\"True Positive Rate\")\n",
     "\n",
     "plt.tight_layout()\n",
-    "plt.show()\n"
    ]
   },
   {

     "# games_df[\"BaselinePrediction\"] = games_df.apply(\n",
     "#     lambda row: predict_baseline(row),\n",
     "#     axis=1,\n",
+    "# )"
    ]
   },
   {
     "wmns_actual_T = torch.tensor(\n",
     "    wmns_subset[\"Win\"].values,\n",
     "    dtype=torch.float32,\n",
+    ")"
    ]
   },
   {
     "plt.ylabel(\"True Positive Rate\")\n",
     "\n",
     "plt.tight_layout()\n",
+    "plt.show()"
    ]
   },
   {

src/nn.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

src/pre_processing.ipynb CHANGED Viewed

@@ -10,7 +10,7 @@
     "import numpy as np\n",
     "import os\n",
     "\n",
-    "DATA_DIR = os.path.join(\"..\", \"data\") "
    ]
   },
   {
@@ -212,10 +212,16 @@
     }
    ],
    "source": [
-    "detailed_tourney_games_df = pd.concat([\n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneyDetailedResults.csv\")).assign(League=\"M\"),\n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneyDetailedResults.csv\")).assign(League=\"W\"),\n",
-    "])\n",
     "\n",
     "detailed_tourney_games_df.sample(5, random_state=1)"
    ]
@@ -419,10 +425,16 @@
     }
    ],
    "source": [
-    "detailed_reg_games_df = pd.concat([\n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"MRegularSeasonDetailedResults.csv\")).assign(League=\"M\"), \n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"WRegularSeasonDetailedResults.csv\")).assign(League=\"W\"),\n",
-    "])\n",
     "\n",
     "detailed_reg_games_df.sample(5, random_state=1)"
    ]
@@ -445,7 +457,7 @@
     "\n",
     "detailed_metrics = {\n",
     "    \"Score\",\n",
-    "    # \"Loc\", \n",
     "    \"FGM\",\n",
     "    \"FGA\",\n",
     "    \"FGM3\",\n",
@@ -460,8 +472,12 @@
     "    \"PF\",\n",
     "}\n",
     "\n",
-    "w_renamed_cols = {f\"W{col}\": f\"Team{col}\" for col in detailed_metrics} | {f\"L{col}\": f\"Opp{col}\" for col in detailed_metrics}\n",
-    "l_renamed_cols = {f\"L{col}\": f\"Team{col}\" for col in detailed_metrics} | {f\"W{col}\": f\"Opp{col}\" for col in detailed_metrics}"
    ]
   },
   {
@@ -520,22 +536,26 @@
     }
    ],
    "source": [
-    "\n",
-    "detailed_reg_games_df = pd.concat([\n",
-    "    (\n",
-    "        # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"LTeamID\"]]\n",
-    "        detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n",
-    "        .assign(GameResult=\"W\")\n",
-    "        .rename(columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"})\n",
-    "    ),\n",
-    "    (\n",
-    "        # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"WTeamID\"]]\n",
-    "        detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n",
-    "        .assign(GameResult=\"L\")\n",
-    "        .rename(columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"})\n",
-    "    )\n",
-    "\n",
-    "]).reset_index(drop=True)\n",
     "\n",
     "detailed_reg_games_df.info()"
    ]
@@ -597,20 +617,30 @@
    ],
    "source": [
     "# do the same thing for the tournament games\n",
-    "detailed_tourney_games_df = pd.concat([\n",
-    "    (\n",
-    "        # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"LTeamID\"]]\n",
-    "        detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns]]\n",
-    "        .assign(GameResult=\"W\")\n",
-    "        .rename(columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"})\n",
-    "    ),\n",
-    "    (\n",
-    "        # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"WTeamID\"]]\n",
-    "        detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns]]\n",
-    "        .assign(GameResult=\"L\")\n",
-    "        .rename(columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"})\n",
-    "    )\n",
-    "]).reset_index(drop=True)\n",
     "\n",
     "detailed_tourney_games_df.info()"
    ]
@@ -621,7 +651,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "for col in detailed_metrics:\n",
     "    detailed_reg_games_df[f\"{col}Diff\"] = detailed_reg_games_df.apply(\n",
     "        lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n",
@@ -631,7 +660,7 @@
     "    detailed_tourney_games_df[f\"{col}Diff\"] = detailed_tourney_games_df.apply(\n",
     "        lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n",
     "        axis=1,\n",
-    "    )\n"
    ]
   },
   {
@@ -671,16 +700,16 @@
        "      <th>TeamFGM</th>\n",
        "      <th>TeamFGA</th>\n",
        "      <th>...</th>\n",
-       "      <th>PFDiff</th>\n",
-       "      <th>TODiff</th>\n",
-       "      <th>ORDiff</th>\n",
-       "      <th>FGMDiff</th>\n",
-       "      <th>BlkDiff</th>\n",
        "      <th>FTADiff</th>\n",
-       "      <th>StlDiff</th>\n",
-       "      <th>FGM3Diff</th>\n",
        "      <th>ScoreDiff</th>\n",
        "      <th>FGADiff</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
@@ -697,16 +726,16 @@
        "      <td>21</td>\n",
        "      <td>55</td>\n",
        "      <td>...</td>\n",
-       "      <td>9</td>\n",
-       "      <td>7</td>\n",
        "      <td>-11</td>\n",
-       "      <td>-7</td>\n",
        "      <td>1</td>\n",
        "      <td>-11</td>\n",
        "      <td>-7</td>\n",
-       "      <td>-3</td>\n",
-       "      <td>-28</td>\n",
-       "      <td>-12</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>100732</th>\n",
@@ -721,16 +750,16 @@
        "      <td>23</td>\n",
        "      <td>60</td>\n",
        "      <td>...</td>\n",
-       "      <td>-9</td>\n",
-       "      <td>-6</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>2</td>\n",
        "      <td>17</td>\n",
-       "      <td>4</td>\n",
-       "      <td>-2</td>\n",
        "      <td>12</td>\n",
        "      <td>-4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>83150</th>\n",
@@ -745,16 +774,16 @@
        "      <td>27</td>\n",
        "      <td>58</td>\n",
        "      <td>...</td>\n",
        "      <td>-5</td>\n",
        "      <td>1</td>\n",
        "      <td>4</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>2</td>\n",
-       "      <td>10</td>\n",
        "      <td>-5</td>\n",
        "      <td>1</td>\n",
        "      <td>13</td>\n",
-       "      <td>-6</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>345009</th>\n",
@@ -769,16 +798,16 @@
        "      <td>19</td>\n",
        "      <td>55</td>\n",
        "      <td>...</td>\n",
-       "      <td>7</td>\n",
-       "      <td>-5</td>\n",
-       "      <td>2</td>\n",
-       "      <td>1</td>\n",
-       "      <td>-3</td>\n",
        "      <td>-11</td>\n",
        "      <td>-3</td>\n",
        "      <td>-1</td>\n",
        "      <td>-7</td>\n",
-       "      <td>13</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>318707</th>\n",
@@ -793,16 +822,16 @@
        "      <td>20</td>\n",
        "      <td>51</td>\n",
        "      <td>...</td>\n",
        "      <td>2</td>\n",
-       "      <td>4</td>\n",
-       "      <td>2</td>\n",
-       "      <td>-3</td>\n",
        "      <td>1</td>\n",
-       "      <td>-11</td>\n",
        "      <td>-7</td>\n",
        "      <td>-3</td>\n",
-       "      <td>-18</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -817,19 +846,19 @@
        "345009    2019       4    3435         58       3292        65    H      0   \n",
        "318707    2013     128    3322         45       3270        63    N      0   \n",
        "\n",
-       "        TeamFGM  TeamFGA  ...  PFDiff  TODiff  ORDiff  FGMDiff  BlkDiff  \\\n",
-       "337067       21       55  ...       9       7     -11       -7        1   \n",
-       "100732       23       60  ...      -9      -6      -1       -1        2   \n",
-       "83150        27       58  ...      -5       1       4       -1        2   \n",
-       "345009       19       55  ...       7      -5       2        1       -3   \n",
-       "318707       20       51  ...       2       4       2       -3        1   \n",
        "\n",
-       "        FTADiff  StlDiff  FGM3Diff  ScoreDiff  FGADiff  \n",
-       "337067      -11       -7        -3        -28      -12  \n",
-       "100732       17        4        -2         12       -4  \n",
-       "83150        10       -5         1         13       -6  \n",
-       "345009      -11       -3        -1         -7       13  \n",
-       "318707      -11       -7        -3        -18        3  \n",
        "\n",
        "[5 rows x 49 columns]"
       ]
@@ -878,10 +907,12 @@
    "source": [
     "# combine the two detailed game dataframes into one for future use\n",
     "\n",
-    "all_detailed_games_df = pd.concat([\n",
-    "    detailed_reg_games_df.assign(GameType=\"reg\"),\n",
-    "    detailed_tourney_games_df.assign(GameType=\"tourney\"),\n",
-    "])"
    ]
   },
   {
@@ -1306,7 +1337,13 @@
    "source": [
     "team_reg_agg = (\n",
     "    detailed_reg_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n",
-    "    .agg({col: agg_funcs for col in detailed_reg_games_df.select_dtypes(\"number\").columns if col not in exclude_agg_cols})\n",
     "    .reset_index()\n",
     ")\n",
     "\n",
@@ -1668,15 +1705,23 @@
     }
    ],
    "source": [
-    "# aggregate the same metrics for the tournament dataset \n",
     "\n",
     "team_tourney_agg = (\n",
     "    detailed_tourney_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n",
-    "    .agg({col: agg_funcs for col in detailed_tourney_games_df.select_dtypes(\"number\").columns if col not in exclude_agg_cols})\n",
     "    .reset_index()\n",
     ")\n",
     "\n",
-    "team_tourney_agg.columns = [\" \".join(col).strip() for col in team_tourney_agg.columns.values]\n",
     "\n",
     "team_tourney_agg.sample(10, random_state=1)"
    ]
@@ -1727,83 +1772,83 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1985</td>\n",
-       "      <td>W01</td>\n",
-       "      <td>1207</td>\n",
-       "      <td>M</td>\n",
-       "      <td>big_east</td>\n",
-       "      <td>Georgetown</td>\n",
-       "      <td>1985</td>\n",
-       "      <td>2024</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1986</td>\n",
-       "      <td>X04</td>\n",
-       "      <td>1207</td>\n",
-       "      <td>M</td>\n",
-       "      <td>big_east</td>\n",
-       "      <td>Georgetown</td>\n",
-       "      <td>1985</td>\n",
-       "      <td>2024</td>\n",
-       "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1987</td>\n",
-       "      <td>X01</td>\n",
-       "      <td>1207</td>\n",
        "      <td>M</td>\n",
-       "      <td>big_east</td>\n",
-       "      <td>Georgetown</td>\n",
-       "      <td>1985</td>\n",
-       "      <td>2024</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1988</td>\n",
-       "      <td>W08</td>\n",
-       "      <td>1207</td>\n",
        "      <td>M</td>\n",
-       "      <td>big_east</td>\n",
-       "      <td>Georgetown</td>\n",
-       "      <td>1985</td>\n",
-       "      <td>2024</td>\n",
-       "      <td>8</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1989</td>\n",
-       "      <td>W01</td>\n",
-       "      <td>1207</td>\n",
-       "      <td>M</td>\n",
        "      <td>big_east</td>\n",
-       "      <td>Georgetown</td>\n",
-       "      <td>1985</td>\n",
-       "      <td>2024</td>\n",
-       "      <td>1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   Season Seed  TeamID League ConfAbbrev    TeamName  FirstD1Season  \\\n",
-       "0    1985  W01    1207      M   big_east  Georgetown           1985   \n",
-       "1    1986  X04    1207      M   big_east  Georgetown           1985   \n",
-       "2    1987  X01    1207      M   big_east  Georgetown           1985   \n",
-       "3    1988  W08    1207      M   big_east  Georgetown           1985   \n",
-       "4    1989  W01    1207      M   big_east  Georgetown           1985   \n",
        "\n",
-       "   LastD1Season  ChalkSeed  \n",
-       "0          2024          1  \n",
-       "1          2024          4  \n",
-       "2          2024          1  \n",
-       "3          2024          8  \n",
-       "4          2024          1  "
       ]
      },
      "execution_count": 15,
@@ -1812,26 +1857,34 @@
     }
    ],
    "source": [
-    "conference_df = pd.concat([\n",
-    "    # pd.read_csv(os.path.join(DATA_DIR, \"MTeamConferences.csv\")).assign(League=\"M\"),\n",
-    "    # pd.read_csv(os.path.join(DATA_DIR, \"WTeamConferences.csv\")).assign(League=\"W\"),\n",
-    "\n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneySeeds.csv\")).assign(League=\"M\"),\n",
-    "    pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneySeeds.csv\")).assign(League=\"W\"),\n",
-    "])\n",
     "\n",
-    "team_conf_seeds_df = (\n",
-    "    conference_df.merge(\n",
-    "        right=(pd.concat([\n",
-    "            # pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneySeeds.csv\")).assign(League=\"M\"),\n",
-    "            # pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneySeeds.csv\")).assign(League=\"W\"),\n",
-    "            pd.read_csv(os.path.join(DATA_DIR, \"MTeamConferences.csv\")).assign(League=\"M\"),\n",
-    "            pd.read_csv(os.path.join(DATA_DIR, \"WTeamConferences.csv\")).assign(League=\"W\"),\n",
-    "        ])),\n",
-    "        on=[\"League\", \"Season\", \"TeamID\"],\n",
-    "        how=\"left\",\n",
-    "    )\n",
-    "    .merge(right=pd.read_csv(os.path.join(DATA_DIR, \"MTeams.csv\")), on=\"TeamID\")\n",
     ")\n",
     "\n",
     "team_conf_seeds_df[\"ChalkSeed\"] = team_conf_seeds_df.apply(\n",
@@ -1839,7 +1892,7 @@
     "    axis=1,\n",
     ")\n",
     "\n",
-    "team_conf_seeds_df.head()"
    ]
   },
   {
@@ -2221,10 +2274,10 @@
    "source": [
     "# merge the tournament aggregated metrics with the regular season aggregated metrics\n",
     "team_agg_df = pd.merge(\n",
-    "    left=team_reg_agg, \n",
-    "    right=team_tourney_agg, \n",
     "    how=\"left\",\n",
-    "    on=[\"TeamID\", \"Season\", \"League\"], \n",
     "    suffixes=(\" reg\", \" tourney\"),\n",
     "    validate=\"1:1\",\n",
     ")\n",
@@ -2260,10 +2313,10 @@
      "output_type": "stream",
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
-      "Int64Index: 12857 entries, 0 to 12856\n",
       "Columns: 459 entries, TeamID to ChalkSeed\n",
-      "dtypes: float64(363), int64(92), object(4)\n",
-      "memory usage: 45.1+ MB\n"
      ]
     }
    ],
@@ -2283,7 +2336,7 @@
       "<class 'pandas.core.frame.DataFrame'>\n",
       "Int64Index: 377608 entries, 0 to 377607\n",
       "Columns: 508 entries, Season to ChalkSeed\n",
-      "dtypes: float64(363), int64(138), object(7)\n",
       "memory usage: 1.4+ GB\n"
      ]
     }
@@ -2344,7 +2397,7 @@
       "<class 'pandas.core.frame.DataFrame'>\n",
       "Int64Index: 377608 entries, 0 to 377607\n",
       "Columns: 509 entries, Season to OppChalkSeed\n",
-      "dtypes: float64(364), int64(138), object(7)\n",
       "memory usage: 1.4+ GB\n"
      ]
     }
@@ -2352,7 +2405,9 @@
    "source": [
     "opp_chalk_seed_map = team_conf_seeds_df.groupby(\"TeamID\")[\"ChalkSeed\"].last()\n",
     "\n",
-    "super_detailed_games_df[\"OppChalkSeed\"] = super_detailed_games_df[\"OppTeamID\"].map(opp_chalk_seed_map)\n",
     "\n",
     "super_detailed_games_df.info()"
    ]
@@ -2365,18 +2420,18 @@
     {
      "data": {
       "text/plain": [
-       "0          8.0\n",
-       "1         11.0\n",
-       "2          2.0\n",
-       "3         12.0\n",
-       "4         10.0\n",
-       "          ... \n",
-       "377603     NaN\n",
-       "377604     NaN\n",
-       "377605     NaN\n",
-       "377606     NaN\n",
-       "377607     NaN\n",
-       "Name: OppChalkSeed, Length: 377608, dtype: float64"
       ]
      },
      "execution_count": 22,
@@ -2385,7 +2440,9 @@
     }
    ],
    "source": [
-    "super_detailed_games_df[\"OppChalkSeed\"]"
    ]
   },
   {

     "import numpy as np\n",
     "import os\n",
     "\n",
+    "DATA_DIR = os.path.join(\"..\", \"data\")"
    ]
   },
   {
     }
    ],
    "source": [
+    "detailed_tourney_games_df = pd.concat(\n",
+    "    [\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneyDetailedResults.csv\")).assign(\n",
+    "            League=\"M\"\n",
+    "        ),\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneyDetailedResults.csv\")).assign(\n",
+    "            League=\"W\"\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
     "\n",
     "detailed_tourney_games_df.sample(5, random_state=1)"
    ]
     }
    ],
    "source": [
+    "detailed_reg_games_df = pd.concat(\n",
+    "    [\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"MRegularSeasonDetailedResults.csv\")).assign(\n",
+    "            League=\"M\"\n",
+    "        ),\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"WRegularSeasonDetailedResults.csv\")).assign(\n",
+    "            League=\"W\"\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
     "\n",
     "detailed_reg_games_df.sample(5, random_state=1)"
    ]
     "\n",
     "detailed_metrics = {\n",
     "    \"Score\",\n",
+    "    # \"Loc\",\n",
     "    \"FGM\",\n",
     "    \"FGA\",\n",
     "    \"FGM3\",\n",
     "    \"PF\",\n",
     "}\n",
     "\n",
+    "w_renamed_cols = {f\"W{col}\": f\"Team{col}\" for col in detailed_metrics} | {\n",
+    "    f\"L{col}\": f\"Opp{col}\" for col in detailed_metrics\n",
+    "}\n",
+    "l_renamed_cols = {f\"L{col}\": f\"Team{col}\" for col in detailed_metrics} | {\n",
+    "    f\"W{col}\": f\"Opp{col}\" for col in detailed_metrics\n",
+    "}"
    ]
   },
   {
     }
    ],
    "source": [
+    "detailed_reg_games_df = pd.concat(\n",
+    "    [\n",
+    "        (\n",
+    "            # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"LTeamID\"]]\n",
+    "            detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n",
+    "            .assign(GameResult=\"W\")\n",
+    "            .rename(\n",
+    "                columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"}\n",
+    "            )\n",
+    "        ),\n",
+    "        (\n",
+    "            # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"WTeamID\"]]\n",
+    "            detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n",
+    "            .assign(GameResult=\"L\")\n",
+    "            .rename(\n",
+    "                columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"}\n",
+    "            )\n",
+    "        ),\n",
+    "    ]\n",
+    ").reset_index(drop=True)\n",
     "\n",
     "detailed_reg_games_df.info()"
    ]
    ],
    "source": [
     "# do the same thing for the tournament games\n",
+    "detailed_tourney_games_df = pd.concat(\n",
+    "    [\n",
+    "        (\n",
+    "            # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"LTeamID\"]]\n",
+    "            detailed_tourney_games_df[\n",
+    "                [col for col in detailed_tourney_games_df.columns]\n",
+    "            ]\n",
+    "            .assign(GameResult=\"W\")\n",
+    "            .rename(\n",
+    "                columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"}\n",
+    "            )\n",
+    "        ),\n",
+    "        (\n",
+    "            # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"WTeamID\"]]\n",
+    "            detailed_tourney_games_df[\n",
+    "                [col for col in detailed_tourney_games_df.columns]\n",
+    "            ]\n",
+    "            .assign(GameResult=\"L\")\n",
+    "            .rename(\n",
+    "                columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"}\n",
+    "            )\n",
+    "        ),\n",
+    "    ]\n",
+    ").reset_index(drop=True)\n",
     "\n",
     "detailed_tourney_games_df.info()"
    ]
    "metadata": {},
    "outputs": [],
    "source": [
     "for col in detailed_metrics:\n",
     "    detailed_reg_games_df[f\"{col}Diff\"] = detailed_reg_games_df.apply(\n",
     "        lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n",
     "    detailed_tourney_games_df[f\"{col}Diff\"] = detailed_tourney_games_df.apply(\n",
     "        lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n",
     "        axis=1,\n",
+    "    )"
    ]
   },
   {
        "      <th>TeamFGM</th>\n",
        "      <th>TeamFGA</th>\n",
        "      <th>...</th>\n",
        "      <th>FTADiff</th>\n",
+       "      <th>PFDiff</th>\n",
        "      <th>ScoreDiff</th>\n",
        "      <th>FGADiff</th>\n",
+       "      <th>BlkDiff</th>\n",
+       "      <th>FGM3Diff</th>\n",
+       "      <th>ORDiff</th>\n",
+       "      <th>StlDiff</th>\n",
+       "      <th>AstDiff</th>\n",
+       "      <th>DRDiff</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "      <td>21</td>\n",
        "      <td>55</td>\n",
        "      <td>...</td>\n",
        "      <td>-11</td>\n",
+       "      <td>9</td>\n",
+       "      <td>-28</td>\n",
+       "      <td>-12</td>\n",
        "      <td>1</td>\n",
+       "      <td>-3</td>\n",
        "      <td>-11</td>\n",
        "      <td>-7</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>100732</th>\n",
        "      <td>23</td>\n",
        "      <td>60</td>\n",
        "      <td>...</td>\n",
        "      <td>17</td>\n",
+       "      <td>-9</td>\n",
        "      <td>12</td>\n",
        "      <td>-4</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-2</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>11</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>83150</th>\n",
        "      <td>27</td>\n",
        "      <td>58</td>\n",
        "      <td>...</td>\n",
+       "      <td>10</td>\n",
        "      <td>-5</td>\n",
+       "      <td>13</td>\n",
+       "      <td>-6</td>\n",
+       "      <td>2</td>\n",
        "      <td>1</td>\n",
        "      <td>4</td>\n",
        "      <td>-5</td>\n",
        "      <td>1</td>\n",
        "      <td>13</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>345009</th>\n",
        "      <td>19</td>\n",
        "      <td>55</td>\n",
        "      <td>...</td>\n",
        "      <td>-11</td>\n",
+       "      <td>7</td>\n",
+       "      <td>-7</td>\n",
+       "      <td>13</td>\n",
        "      <td>-3</td>\n",
        "      <td>-1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>-3</td>\n",
+       "      <td>4</td>\n",
        "      <td>-7</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>318707</th>\n",
        "      <td>20</td>\n",
        "      <td>51</td>\n",
        "      <td>...</td>\n",
+       "      <td>-11</td>\n",
        "      <td>2</td>\n",
+       "      <td>-18</td>\n",
+       "      <td>3</td>\n",
        "      <td>1</td>\n",
+       "      <td>-3</td>\n",
+       "      <td>2</td>\n",
        "      <td>-7</td>\n",
+       "      <td>2</td>\n",
        "      <td>-3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "345009    2019       4    3435         58       3292        65    H      0   \n",
        "318707    2013     128    3322         45       3270        63    N      0   \n",
        "\n",
+       "        TeamFGM  TeamFGA  ...  FTADiff  PFDiff  ScoreDiff  FGADiff  BlkDiff  \\\n",
+       "337067       21       55  ...      -11       9        -28      -12        1   \n",
+       "100732       23       60  ...       17      -9         12       -4        2   \n",
+       "83150        27       58  ...       10      -5         13       -6        2   \n",
+       "345009       19       55  ...      -11       7         -7       13       -3   \n",
+       "318707       20       51  ...      -11       2        -18        3        1   \n",
        "\n",
+       "        FGM3Diff  ORDiff  StlDiff  AstDiff  DRDiff  \n",
+       "337067        -3     -11       -7       -1      -4  \n",
+       "100732        -2      -1        4       11       1  \n",
+       "83150          1       4       -5        1      13  \n",
+       "345009        -1       2       -3        4      -7  \n",
+       "318707        -3       2       -7        2      -3  \n",
        "\n",
        "[5 rows x 49 columns]"
       ]
    "source": [
     "# combine the two detailed game dataframes into one for future use\n",
     "\n",
+    "all_detailed_games_df = pd.concat(\n",
+    "    [\n",
+    "        detailed_reg_games_df.assign(GameType=\"reg\"),\n",
+    "        detailed_tourney_games_df.assign(GameType=\"tourney\"),\n",
+    "    ]\n",
+    ")"
    ]
   },
   {
    "source": [
     "team_reg_agg = (\n",
     "    detailed_reg_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n",
+    "    .agg(\n",
+    "        {\n",
+    "            col: agg_funcs\n",
+    "            for col in detailed_reg_games_df.select_dtypes(\"number\").columns\n",
+    "            if col not in exclude_agg_cols\n",
+    "        }\n",
+    "    )\n",
     "    .reset_index()\n",
     ")\n",
     "\n",
     }
    ],
    "source": [
+    "# aggregate the same metrics for the tournament dataset\n",
     "\n",
     "team_tourney_agg = (\n",
     "    detailed_tourney_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n",
+    "    .agg(\n",
+    "        {\n",
+    "            col: agg_funcs\n",
+    "            for col in detailed_tourney_games_df.select_dtypes(\"number\").columns\n",
+    "            if col not in exclude_agg_cols\n",
+    "        }\n",
+    "    )\n",
     "    .reset_index()\n",
     ")\n",
     "\n",
+    "team_tourney_agg.columns = [\n",
+    "    \" \".join(col).strip() for col in team_tourney_agg.columns.values\n",
+    "]\n",
     "\n",
     "team_tourney_agg.sample(10, random_state=1)"
    ]
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
+       "      <th>3591</th>\n",
+       "      <td>2004</td>\n",
+       "      <td>X02</td>\n",
+       "      <td>3243</td>\n",
+       "      <td>W</td>\n",
+       "      <td>big_twelve</td>\n",
+       "      <td>Kansas St</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>3528</th>\n",
+       "      <td>2013</td>\n",
+       "      <td>Y01</td>\n",
+       "      <td>3124</td>\n",
+       "      <td>W</td>\n",
+       "      <td>big_twelve</td>\n",
+       "      <td>Baylor</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>1891</th>\n",
+       "      <td>2003</td>\n",
+       "      <td>W02</td>\n",
+       "      <td>1448</td>\n",
        "      <td>M</td>\n",
+       "      <td>acc</td>\n",
+       "      <td>Wake Forest</td>\n",
+       "      <td>1985.0</td>\n",
+       "      <td>2024.0</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>778</th>\n",
+       "      <td>2019</td>\n",
+       "      <td>Y01</td>\n",
+       "      <td>1314</td>\n",
        "      <td>M</td>\n",
+       "      <td>acc</td>\n",
+       "      <td>North Carolina</td>\n",
+       "      <td>1985.0</td>\n",
+       "      <td>2024.0</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>2932</th>\n",
+       "      <td>2019</td>\n",
+       "      <td>X05</td>\n",
+       "      <td>3266</td>\n",
+       "      <td>W</td>\n",
        "      <td>big_east</td>\n",
+       "      <td>Marquette</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>5</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
+       "      Season Seed  TeamID League  ConfAbbrev        TeamName  FirstD1Season  \\\n",
+       "3591    2004  X02    3243      W  big_twelve       Kansas St            NaN   \n",
+       "3528    2013  Y01    3124      W  big_twelve          Baylor            NaN   \n",
+       "1891    2003  W02    1448      M         acc     Wake Forest         1985.0   \n",
+       "778     2019  Y01    1314      M         acc  North Carolina         1985.0   \n",
+       "2932    2019  X05    3266      W    big_east       Marquette            NaN   \n",
        "\n",
+       "      LastD1Season  ChalkSeed  \n",
+       "3591           NaN          2  \n",
+       "3528           NaN          1  \n",
+       "1891        2024.0          2  \n",
+       "778         2024.0          1  \n",
+       "2932           NaN          5  "
       ]
      },
      "execution_count": 15,
     }
    ],
    "source": [
+    "conference_df = pd.concat(\n",
+    "    [\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneySeeds.csv\")).assign(League=\"M\"),\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneySeeds.csv\")).assign(League=\"W\"),\n",
+    "    ]\n",
+    ")\n",
     "\n",
+    "team_conf_seeds_df = conference_df.merge(\n",
+    "    right=(\n",
+    "        pd.concat(\n",
+    "            [\n",
+    "                pd.read_csv(os.path.join(DATA_DIR, \"MTeamConferences.csv\")).assign(\n",
+    "                    League=\"M\"\n",
+    "                ),\n",
+    "                pd.read_csv(os.path.join(DATA_DIR, \"WTeamConferences.csv\")).assign(\n",
+    "                    League=\"W\"\n",
+    "                ),\n",
+    "            ]\n",
+    "        )\n",
+    "    ),\n",
+    "    on=[\"League\", \"Season\", \"TeamID\"],\n",
+    "    how=\"left\",\n",
+    ").merge(right=(\n",
+    "    pd.concat([\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"MTeams.csv\")),\n",
+    "        pd.read_csv(os.path.join(DATA_DIR, \"WTeams.csv\")),\n",
+    "    ])),\n",
+    "    on=\"TeamID\",\n",
     ")\n",
     "\n",
     "team_conf_seeds_df[\"ChalkSeed\"] = team_conf_seeds_df.apply(\n",
     "    axis=1,\n",
     ")\n",
     "\n",
+    "team_conf_seeds_df.sample(5, random_state=1)"
    ]
   },
   {
    "source": [
     "# merge the tournament aggregated metrics with the regular season aggregated metrics\n",
     "team_agg_df = pd.merge(\n",
+    "    left=team_reg_agg,\n",
+    "    right=team_tourney_agg,\n",
     "    how=\"left\",\n",
+    "    on=[\"TeamID\", \"Season\", \"League\"],\n",
     "    suffixes=(\" reg\", \" tourney\"),\n",
     "    validate=\"1:1\",\n",
     ")\n",
      "output_type": "stream",
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 13305 entries, 0 to 13304\n",
       "Columns: 459 entries, TeamID to ChalkSeed\n",
+      "dtypes: float64(453), int64(2), object(4)\n",
+      "memory usage: 46.7+ MB\n"
      ]
     }
    ],
       "<class 'pandas.core.frame.DataFrame'>\n",
       "Int64Index: 377608 entries, 0 to 377607\n",
       "Columns: 508 entries, Season to ChalkSeed\n",
+      "dtypes: float64(453), int64(48), object(7)\n",
       "memory usage: 1.4+ GB\n"
      ]
     }
       "<class 'pandas.core.frame.DataFrame'>\n",
       "Int64Index: 377608 entries, 0 to 377607\n",
       "Columns: 509 entries, Season to OppChalkSeed\n",
+      "dtypes: float64(454), int64(48), object(7)\n",
       "memory usage: 1.4+ GB\n"
      ]
     }
    "source": [
     "opp_chalk_seed_map = team_conf_seeds_df.groupby(\"TeamID\")[\"ChalkSeed\"].last()\n",
     "\n",
+    "super_detailed_games_df[\"OppChalkSeed\"] = super_detailed_games_df[\"OppTeamID\"].map(\n",
+    "    opp_chalk_seed_map\n",
+    ")\n",
     "\n",
     "super_detailed_games_df.info()"
    ]
     {
      "data": {
       "text/plain": [
+       "0         2.0\n",
+       "1        -4.0\n",
+       "2         1.0\n",
+       "3         NaN\n",
+       "4        -9.0\n",
+       "         ... \n",
+       "377603    1.0\n",
+       "377604    2.0\n",
+       "377605   -1.0\n",
+       "377606   -2.0\n",
+       "377607   -1.0\n",
+       "Name: ChalkSeedDiff, Length: 377608, dtype: float64"
       ]
      },
      "execution_count": 22,
     }
    ],
    "source": [
+    "super_detailed_games_df[\"ChalkSeedDiff\"] = (\n",
+    "    super_detailed_games_df[\"ChalkSeed\"] - super_detailed_games_df[\"OppChalkSeed\"]\n",
+    ")"
    ]
   },
   {

src/visual_eval.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from sklearn.metrics import roc_curve, precision_recall_curve
+import matplotlib.pyplot as plt
+import numpy as np
+def eval_binary_classification(pred: np.array, true: np.array):
+    plt.figure(figsize=(12, 6))
+    eval_roc_curve(pred, true)
+    eval_pr_curve(pred, true)
+    plt.tight_layout()
+    plt.show()
+def eval_pr_curve(pred: np.array, true: np.array):
+    precision, recall, _ = precision_recall_curve(true, pred)
+    plt.subplot(1, 2, 1)
+    plt.plot(recall, precision, label="Precision-Recall Curve", color="red")
+    plt.ylim(0)
+    plt.xlabel("Recall")
+    plt.ylabel("Precision")
+    plt.title("Precision-Recall Curve")
+    plt.legend(loc="lower right")
+def eval_roc_curve(pred: np.array, true: np.array) -> None:
+    false_pos_rate, true_pos_rate, _ = roc_curve(true, pred)
+    plt.subplot(1, 2, 2)
+    plt.plot(false_pos_rate, true_pos_rate, label="ROC Curve")
+    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Guessing Model")
+    plt.title("ROC Curve vs. Random")
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.legend(loc="lower right")

src/visualizations.py DELETED Viewed

@@ -1,27 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from sklearn.metrics import (
-    roc_curve,
-    roc_auc_score,
-    precision_recall_curve,
-)
-def roc_plot(y_true: np.array, y_pred: np.array):
-    fpr, tpr, _ = roc_curve(y_true, y_pred)
-    # Plot ROC Curve
-    plt.plot(fpr, tpr, label="ROC Curve")
-    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Model")
-    plt.title("ROC Curve vs. Random Model")
-    plt.xlabel("False Positive Rate")
-    plt.ylabel("True Positive Rate")
-    plt.legend("lower right")
-    plt.tight_layout()
-    plt.show()
-def precision_recall_plot(y_true: np.array, y_pred: np.array, baseline_pred: np.array):
-    ...