copying the src from the github repository

Browse files

Files changed (4) hide show

src/m_pp.ipynb +634 -0
src/mens_monte_carlo.ipynb +45 -0
src/mens_nn.ipynb +613 -0
src/mens_pre_processing.ipynb +0 -0

src/m_pp.ipynb ADDED Viewed

	@@ -0,0 +1,634 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "\n",
+    "DATA_DIR = os.path.join(\"..\", \"data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 1315 entries, 0 to 1314\n",
+      "Data columns (total 38 columns):\n",
+      " #   Column    Non-Null Count  Dtype \n",
+      "---  ------    --------------  ----- \n",
+      " 0   Season    1315 non-null   int64 \n",
+      " 1   DayNum    1315 non-null   int64 \n",
+      " 2   WTeamID   1315 non-null   int64 \n",
+      " 3   WScore    1315 non-null   int64 \n",
+      " 4   LTeamID   1315 non-null   int64 \n",
+      " 5   LScore    1315 non-null   int64 \n",
+      " 6   WLoc      1315 non-null   int64 \n",
+      " 7   NumOT     1315 non-null   int64 \n",
+      " 8   WFGM      1315 non-null   int64 \n",
+      " 9   WFGA      1315 non-null   int64 \n",
+      " 10  WFGM3     1315 non-null   int64 \n",
+      " 11  WFGA3     1315 non-null   int64 \n",
+      " 12  WFTM      1315 non-null   int64 \n",
+      " 13  WFTA      1315 non-null   int64 \n",
+      " 14  WOR       1315 non-null   int64 \n",
+      " 15  WDR       1315 non-null   int64 \n",
+      " 16  WAst      1315 non-null   int64 \n",
+      " 17  WTO       1315 non-null   int64 \n",
+      " 18  WStl      1315 non-null   int64 \n",
+      " 19  WBlk      1315 non-null   int64 \n",
+      " 20  WPF       1315 non-null   int64 \n",
+      " 21  LFGM      1315 non-null   int64 \n",
+      " 22  LFGA      1315 non-null   int64 \n",
+      " 23  LFGM3     1315 non-null   int64 \n",
+      " 24  LFGA3     1315 non-null   int64 \n",
+      " 25  LFTM      1315 non-null   int64 \n",
+      " 26  LFTA      1315 non-null   int64 \n",
+      " 27  LOR       1315 non-null   int64 \n",
+      " 28  LDR       1315 non-null   int64 \n",
+      " 29  LAst      1315 non-null   int64 \n",
+      " 30  LTO       1315 non-null   int64 \n",
+      " 31  LStl      1315 non-null   int64 \n",
+      " 32  LBlk      1315 non-null   int64 \n",
+      " 33  LPF       1315 non-null   int64 \n",
+      " 34  GameType  1315 non-null   object\n",
+      " 35  WPA       1315 non-null   int64 \n",
+      " 36  LPA       1315 non-null   int64 \n",
+      " 37  LLoc      1315 non-null   int64 \n",
+      "dtypes: int64(37), object(1)\n",
+      "memory usage: 390.5+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "tourney_games_df = pd.read_csv(\n",
+    "    os.path.join(DATA_DIR, \"MNCAATourneyDetailedResults.csv\")\n",
+    ")\n",
+    "\n",
+    "tourney_games_df[\"GameType\"] = \"tourney\"\n",
+    "\n",
+    "# points allowed column: each side allowed what its opponent scored\n",
+    "tourney_games_df[\"WPA\"] = tourney_games_df[\"LScore\"]\n",
+    "tourney_games_df[\"LPA\"] = tourney_games_df[\"WScore\"]\n",
+    "\n",
+    "# location flags: 0 = away, 1 = home or neutral site.\n",
+    "# Both flags are derived from the raw \"H\"/\"A\"/\"N\" codes before WLoc is\n",
+    "# overwritten; deriving WLoc from the already-encoded LLoc made every WLoc 1.\n",
+    "# The loser was the away side exactly when the winner was at home.\n",
+    "tourney_wloc_raw = tourney_games_df[\"WLoc\"]\n",
+    "tourney_games_df[\"LLoc\"] = (tourney_wloc_raw != \"H\").astype(\"int64\")\n",
+    "tourney_games_df[\"WLoc\"] = (tourney_wloc_raw != \"A\").astype(\"int64\")\n",
+    "\n",
+    "tourney_games_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 111817 entries, 0 to 111816\n",
+      "Data columns (total 38 columns):\n",
+      " #   Column    Non-Null Count   Dtype \n",
+      "---  ------    --------------   ----- \n",
+      " 0   Season    111817 non-null  int64 \n",
+      " 1   DayNum    111817 non-null  int64 \n",
+      " 2   WTeamID   111817 non-null  int64 \n",
+      " 3   WScore    111817 non-null  int64 \n",
+      " 4   LTeamID   111817 non-null  int64 \n",
+      " 5   LScore    111817 non-null  int64 \n",
+      " 6   WLoc      111817 non-null  int64 \n",
+      " 7   NumOT     111817 non-null  int64 \n",
+      " 8   WFGM      111817 non-null  int64 \n",
+      " 9   WFGA      111817 non-null  int64 \n",
+      " 10  WFGM3     111817 non-null  int64 \n",
+      " 11  WFGA3     111817 non-null  int64 \n",
+      " 12  WFTM      111817 non-null  int64 \n",
+      " 13  WFTA      111817 non-null  int64 \n",
+      " 14  WOR       111817 non-null  int64 \n",
+      " 15  WDR       111817 non-null  int64 \n",
+      " 16  WAst      111817 non-null  int64 \n",
+      " 17  WTO       111817 non-null  int64 \n",
+      " 18  WStl      111817 non-null  int64 \n",
+      " 19  WBlk      111817 non-null  int64 \n",
+      " 20  WPF       111817 non-null  int64 \n",
+      " 21  LFGM      111817 non-null  int64 \n",
+      " 22  LFGA      111817 non-null  int64 \n",
+      " 23  LFGM3     111817 non-null  int64 \n",
+      " 24  LFGA3     111817 non-null  int64 \n",
+      " 25  LFTM      111817 non-null  int64 \n",
+      " 26  LFTA      111817 non-null  int64 \n",
+      " 27  LOR       111817 non-null  int64 \n",
+      " 28  LDR       111817 non-null  int64 \n",
+      " 29  LAst      111817 non-null  int64 \n",
+      " 30  LTO       111817 non-null  int64 \n",
+      " 31  LStl      111817 non-null  int64 \n",
+      " 32  LBlk      111817 non-null  int64 \n",
+      " 33  LPF       111817 non-null  int64 \n",
+      " 34  GameType  111817 non-null  object\n",
+      " 35  WPA       111817 non-null  int64 \n",
+      " 36  LPA       111817 non-null  int64 \n",
+      " 37  LLoc      111817 non-null  int64 \n",
+      "dtypes: int64(37), object(1)\n",
+      "memory usage: 32.4+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "reg_games_df = pd.read_csv(\n",
+    "    os.path.join(DATA_DIR, \"MRegularSeasonDetailedResults.csv\")\n",
+    ")\n",
+    "\n",
+    "reg_games_df[\"GameType\"] = \"reg\"\n",
+    "\n",
+    "# points allowed column: each side allowed what its opponent scored\n",
+    "reg_games_df[\"WPA\"] = reg_games_df[\"LScore\"]\n",
+    "reg_games_df[\"LPA\"] = reg_games_df[\"WScore\"]\n",
+    "\n",
+    "# location columns: 0 = away, 1 = home or neutral site.\n",
+    "# Both flags are derived from the raw \"H\"/\"A\"/\"N\" codes before WLoc is\n",
+    "# overwritten; deriving WLoc from the already-encoded LLoc made every WLoc 1.\n",
+    "# The loser was the away side exactly when the winner was at home.\n",
+    "reg_wloc_raw = reg_games_df[\"WLoc\"]\n",
+    "reg_games_df[\"LLoc\"] = (reg_wloc_raw != \"H\").astype(\"int64\")\n",
+    "reg_games_df[\"WLoc\"] = (reg_wloc_raw != \"A\").astype(\"int64\")\n",
+    "\n",
+    "reg_games_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def flatten_multi_idx(df: pd.DataFrame) -> None:\n",
+    "    df.columns = [\"_\".join(filter(None, col)) for col in df.columns.to_flat_index()]\n",
+    "\n",
+    "\n",
+    "def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"Summarize per-game stats into one row per team and season.\n",
+    "\n",
+    "    Games a team won (``W*`` columns) and games it lost (``L*`` columns) are\n",
+    "    aggregated separately with sum/mean/median/std/min/max, then joined side\n",
+    "    by side on the team id and season.\n",
+    "\n",
+    "    Args:\n",
+    "        df: one row per game, with ``WTeamID``, ``LTeamID``, ``Season`` and\n",
+    "            stat columns prefixed with ``W`` or ``L``.\n",
+    "\n",
+    "    Returns:\n",
+    "        A new frame with ``<stat>_<agg>`` columns plus ``totalW``, ``totalL``,\n",
+    "        ``total_games`` and ``TeamID``. The join is an inner join, so a\n",
+    "        team-season with no wins or no losses is not included.\n",
+    "    \"\"\"\n",
+    "    other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
+    "    # String aliases select the built-in groupby reductions and avoid the pandas\n",
+    "    # FutureWarning emitted for NumPy callables such as np.sum.\n",
+    "    agg_funcs = [\"sum\", \"mean\", \"median\", \"std\", \"min\", \"max\"]\n",
+    "    dfs = {}\n",
+    "    for subset in (\"W\", \"L\"):\n",
+    "        keys = [f\"{subset}TeamID\", \"Season\"]\n",
+    "        # Match on the prefix: a substring test also puts WLoc (it contains an\n",
+    "        # \"L\") into the loser subset and yields duplicated _x/_y columns.\n",
+    "        stat_cols = [\n",
+    "            col for col in df.columns\n",
+    "            if col.startswith(subset) and col not in other_cols\n",
+    "        ]\n",
+    "        agg_df = df.groupby(keys)[stat_cols].agg(agg_funcs).reset_index()\n",
+    "        flatten_multi_idx(agg_df)\n",
+    "\n",
+    "        # One game count per group, joined on the keys. transform(\"count\") returns\n",
+    "        # a per-game Series that is misaligned with the aggregated rows.\n",
+    "        game_counts = df.groupby(keys).size().reset_index(name=f\"total{subset}\")\n",
+    "        dfs[subset] = agg_df.merge(game_counts, on=keys)\n",
+    "\n",
+    "    merged = pd.merge(\n",
+    "        left=dfs[\"W\"],\n",
+    "        right=dfs[\"L\"],\n",
+    "        left_on=[\"WTeamID\", \"Season\"],\n",
+    "        right_on=[\"LTeamID\", \"Season\"],\n",
+    "    )\n",
+    "\n",
+    "    merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
+    "    merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
+    "    # TODO: combine stats from games won and games lost into overall columns,\n",
+    "    # e.g. TotalPA = WPA_sum + LPA_sum.\n",
+    "    return merged.drop(columns=[\"WTeamID\", \"LTeamID\"])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reg_agg_df = summarize_teams(reg_games_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Season</th>\n",
+       "      <th>WScore_sum</th>\n",
+       "      <th>WScore_mean</th>\n",
+       "      <th>WScore_median</th>\n",
+       "      <th>WScore_std</th>\n",
+       "      <th>WScore_min</th>\n",
+       "      <th>WScore_max</th>\n",
+       "      <th>WLoc_sum_x</th>\n",
+       "      <th>WLoc_mean_x</th>\n",
+       "      <th>WLoc_median_x</th>\n",
+       "      <th>...</th>\n",
+       "      <th>LPA_max</th>\n",
+       "      <th>LLoc_sum</th>\n",
+       "      <th>LLoc_mean</th>\n",
+       "      <th>LLoc_median</th>\n",
+       "      <th>LLoc_std</th>\n",
+       "      <th>LLoc_min</th>\n",
+       "      <th>LLoc_max</th>\n",
+       "      <th>totalL</th>\n",
+       "      <th>total_games</th>\n",
+       "      <th>TeamID</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2014</td>\n",
+       "      <td>160</td>\n",
+       "      <td>80.000000</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>9.899495</td>\n",
+       "      <td>73</td>\n",
+       "      <td>87</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>103</td>\n",
+       "      <td>14</td>\n",
+       "      <td>0.736842</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.452414</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>23</td>\n",
+       "      <td>1101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2015</td>\n",
+       "      <td>542</td>\n",
+       "      <td>77.428571</td>\n",
+       "      <td>72.0</td>\n",
+       "      <td>11.012979</td>\n",
+       "      <td>65</td>\n",
+       "      <td>95</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>102</td>\n",
+       "      <td>15</td>\n",
+       "      <td>0.714286</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.462910</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>28</td>\n",
+       "      <td>1101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2016</td>\n",
+       "      <td>704</td>\n",
+       "      <td>78.222222</td>\n",
+       "      <td>79.0</td>\n",
+       "      <td>9.257129</td>\n",
+       "      <td>62</td>\n",
+       "      <td>91</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>108</td>\n",
+       "      <td>13</td>\n",
+       "      <td>0.722222</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.460889</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>15</td>\n",
+       "      <td>38</td>\n",
+       "      <td>1101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2017</td>\n",
+       "      <td>669</td>\n",
+       "      <td>74.333333</td>\n",
+       "      <td>71.0</td>\n",
+       "      <td>7.648529</td>\n",
+       "      <td>65</td>\n",
+       "      <td>85</td>\n",
+       "      <td>9</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>89</td>\n",
+       "      <td>11</td>\n",
+       "      <td>0.687500</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.478714</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>10</td>\n",
+       "      <td>27</td>\n",
+       "      <td>1101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>915</td>\n",
+       "      <td>76.250000</td>\n",
+       "      <td>77.0</td>\n",
+       "      <td>7.484833</td>\n",
+       "      <td>62</td>\n",
+       "      <td>88</td>\n",
+       "      <td>12</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>88</td>\n",
+       "      <td>9</td>\n",
+       "      <td>0.600000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.507093</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>8</td>\n",
+       "      <td>30</td>\n",
+       "      <td>1101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7600</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>920</td>\n",
+       "      <td>70.769231</td>\n",
+       "      <td>73.0</td>\n",
+       "      <td>9.047595</td>\n",
+       "      <td>51</td>\n",
+       "      <td>82</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>102</td>\n",
+       "      <td>13</td>\n",
+       "      <td>0.764706</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.437237</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14</td>\n",
+       "      <td>29</td>\n",
+       "      <td>1476</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7601</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>128</td>\n",
+       "      <td>64.000000</td>\n",
+       "      <td>64.0</td>\n",
+       "      <td>9.899495</td>\n",
+       "      <td>57</td>\n",
+       "      <td>71</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>107</td>\n",
+       "      <td>17</td>\n",
+       "      <td>0.739130</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.448978</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "      <td>25</td>\n",
+       "      <td>1476</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7602</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>864</td>\n",
+       "      <td>72.000000</td>\n",
+       "      <td>74.0</td>\n",
+       "      <td>10.206950</td>\n",
+       "      <td>53</td>\n",
+       "      <td>84</td>\n",
+       "      <td>12</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>97</td>\n",
+       "      <td>15</td>\n",
+       "      <td>0.750000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.444262</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>20</td>\n",
+       "      <td>34</td>\n",
+       "      <td>1477</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7603</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>483</td>\n",
+       "      <td>80.500000</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>17.683325</td>\n",
+       "      <td>57</td>\n",
+       "      <td>101</td>\n",
+       "      <td>6</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>90</td>\n",
+       "      <td>10</td>\n",
+       "      <td>0.625000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.500000</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>9</td>\n",
+       "      <td>33</td>\n",
+       "      <td>1477</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7604</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>578</td>\n",
+       "      <td>82.571429</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>7.345228</td>\n",
+       "      <td>74</td>\n",
+       "      <td>94</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>96</td>\n",
+       "      <td>12</td>\n",
+       "      <td>0.857143</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.363137</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>12</td>\n",
+       "      <td>26</td>\n",
+       "      <td>1478</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>7605 rows × 203 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Season  WScore_sum  WScore_mean  WScore_median  WScore_std  WScore_min  \\\n",
+       "0       2014         160    80.000000           80.0    9.899495          73   \n",
+       "1       2015         542    77.428571           72.0   11.012979          65   \n",
+       "2       2016         704    78.222222           79.0    9.257129          62   \n",
+       "3       2017         669    74.333333           71.0    7.648529          65   \n",
+       "4       2018         915    76.250000           77.0    7.484833          62   \n",
+       "...      ...         ...          ...            ...         ...         ...   \n",
+       "7600    2023         920    70.769231           73.0    9.047595          51   \n",
+       "7601    2024         128    64.000000           64.0    9.899495          57   \n",
+       "7602    2023         864    72.000000           74.0   10.206950          53   \n",
+       "7603    2024         483    80.500000           80.0   17.683325          57   \n",
+       "7604    2024         578    82.571429           80.0    7.345228          74   \n",
+       "\n",
+       "      WScore_max  WLoc_sum_x  WLoc_mean_x  WLoc_median_x  ...  LPA_max  \\\n",
+       "0             87           2          1.0            1.0  ...      103   \n",
+       "1             95           7          1.0            1.0  ...      102   \n",
+       "2             91           9          1.0            1.0  ...      108   \n",
+       "3             85           9          1.0            1.0  ...       89   \n",
+       "4             88          12          1.0            1.0  ...       88   \n",
+       "...          ...         ...          ...            ...  ...      ...   \n",
+       "7600          82          13          1.0            1.0  ...      102   \n",
+       "7601          71           2          1.0            1.0  ...      107   \n",
+       "7602          84          12          1.0            1.0  ...       97   \n",
+       "7603         101           6          1.0            1.0  ...       90   \n",
+       "7604          94           7          1.0            1.0  ...       96   \n",
+       "\n",
+       "      LLoc_sum  LLoc_mean  LLoc_median  LLoc_std  LLoc_min  LLoc_max  totalL  \\\n",
+       "0           14   0.736842          1.0  0.452414         0         1       6   \n",
+       "1           15   0.714286          1.0  0.462910         0         1       5   \n",
+       "2           13   0.722222          1.0  0.460889         0         1      15   \n",
+       "3           11   0.687500          1.0  0.478714         0         1      10   \n",
+       "4            9   0.600000          1.0  0.507093         0         1       8   \n",
+       "...        ...        ...          ...       ...       ...       ...     ...   \n",
+       "7600        13   0.764706          1.0  0.437237         0         1      14   \n",
+       "7601        17   0.739130          1.0  0.448978         0         1       5   \n",
+       "7602        15   0.750000          1.0  0.444262         0         1      20   \n",
+       "7603        10   0.625000          1.0  0.500000         0         1       9   \n",
+       "7604        12   0.857143          1.0  0.363137         0         1      12   \n",
+       "\n",
+       "      total_games  TeamID  \n",
+       "0              23    1101  \n",
+       "1              28    1101  \n",
+       "2              38    1101  \n",
+       "3              27    1101  \n",
+       "4              30    1101  \n",
+       "...           ...     ...  \n",
+       "7600           29    1476  \n",
+       "7601           25    1476  \n",
+       "7602           34    1477  \n",
+       "7603           33    1477  \n",
+       "7604           26    1478  \n",
+       "\n",
+       "[7605 rows x 203 columns]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# combine the winning and losing stats so that we have overall game stats\n",
+    "reg_agg_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

src/mens_monte_carlo.ipynb ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "\n",
+    "DATA_DIR = os.path.join(\"..\", \"data\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

src/mens_nn.ipynb ADDED Viewed

	@@ -0,0 +1,613 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "998997dd",
+   "metadata": {},
+   "source": [
+    "# Modeling NCAA Tournament Basketball games\n",
+    "\n",
+    "The thought process is to build a neural network that can predict a team's tournament <br>\n",
+    "performance on a per-game basis. Then we can use these predicted metrics to run a Monte Carlo <br>\n",
+    "style simulation and select whichever team is most likely to win. <br>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f0ec30d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.optim as optim\n",
+    "\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "# check to make sure if there are any gpu's available for faster training\n",
+    "def get_device() -> str:\n",
+    "    if torch.cuda.is_available():\n",
+    "        return \"cuda\"\n",
+    "    if torch.backends.mps.is_available():\n",
+    "        return \"mps\" \n",
+    "    return \"cpu\"\n",
+    "\n",
+    "# mps not working correctly on my m1 macbook air so just doing cpu for now\n",
+    "# DEVICE = get_device()\n",
+    "DEVICE = \"cpu\"\n",
+    "\n",
+    "# universal data directory for this project\n",
+    "DATA_DIR = os.path.join(\"..\", \"data\") "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b820f210",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 655 entries, 0 to 654\n",
+      "Columns: 1068 entries, Unnamed: 0 to Seed\n",
+      "dtypes: float64(672), int64(388), object(8)\n",
+      "memory usage: 5.3+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_games_df = pd.read_csv(os.path.join(DATA_DIR, \"MDetailedAggregatedGames.csv\"))\n",
+    "all_games_df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "02ebc500",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>Season</th>\n",
+       "      <th>DayNum</th>\n",
+       "      <th>WTeamID</th>\n",
+       "      <th>WScore</th>\n",
+       "      <th>LTeamID</th>\n",
+       "      <th>LScore</th>\n",
+       "      <th>WLoc</th>\n",
+       "      <th>NumOT</th>\n",
+       "      <th>WFGM</th>\n",
+       "      <th>...</th>\n",
+       "      <th>tourney_DR_max</th>\n",
+       "      <th>tourney_DR_mean</th>\n",
+       "      <th>tourney_DR_median</th>\n",
+       "      <th>tourney_DR_std</th>\n",
+       "      <th>tourney_DR_sum</th>\n",
+       "      <th>ConfAbbrev</th>\n",
+       "      <th>TeamName</th>\n",
+       "      <th>FirstD1Season</th>\n",
+       "      <th>LastD1Season</th>\n",
+       "      <th>Seed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>40</td>\n",
+       "      <td>1266</td>\n",
+       "      <td>63</td>\n",
+       "      <td>1458</td>\n",
+       "      <td>54</td>\n",
+       "      <td>H</td>\n",
+       "      <td>0</td>\n",
+       "      <td>24</td>\n",
+       "      <td>...</td>\n",
+       "      <td>21.666667</td>\n",
+       "      <td>21.666667</td>\n",
+       "      <td>21.666667</td>\n",
+       "      <td>21.666667</td>\n",
+       "      <td>21.666667</td>\n",
+       "      <td>big_ten</td>\n",
+       "      <td>Wisconsin</td>\n",
+       "      <td>1985</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>Y05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>5</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>97</td>\n",
+       "      <td>1266</td>\n",
+       "      <td>68</td>\n",
+       "      <td>1448</td>\n",
+       "      <td>61</td>\n",
+       "      <td>H</td>\n",
+       "      <td>0</td>\n",
+       "      <td>21</td>\n",
+       "      <td>...</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>acc</td>\n",
+       "      <td>Wake Forest</td>\n",
+       "      <td>1985</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>W02</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>9</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>115</td>\n",
+       "      <td>1266</td>\n",
+       "      <td>78</td>\n",
+       "      <td>1257</td>\n",
+       "      <td>73</td>\n",
+       "      <td>A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>26</td>\n",
+       "      <td>...</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>cusa</td>\n",
+       "      <td>Louisville</td>\n",
+       "      <td>1985</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>W04</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>12</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>138</td>\n",
+       "      <td>1266</td>\n",
+       "      <td>101</td>\n",
+       "      <td>1281</td>\n",
+       "      <td>92</td>\n",
+       "      <td>N</td>\n",
+       "      <td>1</td>\n",
+       "      <td>35</td>\n",
+       "      <td>...</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>big_twelve</td>\n",
+       "      <td>Missouri</td>\n",
+       "      <td>1985</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>Y06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>19</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>143</td>\n",
+       "      <td>1266</td>\n",
+       "      <td>77</td>\n",
+       "      <td>1338</td>\n",
+       "      <td>74</td>\n",
+       "      <td>N</td>\n",
+       "      <td>0</td>\n",
+       "      <td>28</td>\n",
+       "      <td>...</td>\n",
+       "      <td>21.333333</td>\n",
+       "      <td>21.333333</td>\n",
+       "      <td>21.333333</td>\n",
+       "      <td>21.333333</td>\n",
+       "      <td>21.333333</td>\n",
+       "      <td>big_east</td>\n",
+       "      <td>Pittsburgh</td>\n",
+       "      <td>1985</td>\n",
+       "      <td>2024</td>\n",
+       "      <td>Y02</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 1068 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Unnamed: 0  Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  \\\n",
+       "0           0    2003      40     1266      63     1458      54    H      0   \n",
+       "1           5    2003      97     1266      68     1448      61    H      0   \n",
+       "2           9    2003     115     1266      78     1257      73    A      0   \n",
+       "3          12    2003     138     1266     101     1281      92    N      1   \n",
+       "4          19    2003     143     1266      77     1338      74    N      0   \n",
+       "\n",
+       "   WFGM  ...  tourney_DR_max  tourney_DR_mean  tourney_DR_median  \\\n",
+       "0    24  ...       21.666667        21.666667          21.666667   \n",
+       "1    21  ...       26.000000        26.000000          26.000000   \n",
+       "2    26  ...       24.000000        24.000000          24.000000   \n",
+       "3    35  ...       26.000000        26.000000          26.000000   \n",
+       "4    28  ...       21.333333        21.333333          21.333333   \n",
+       "\n",
+       "   tourney_DR_std  tourney_DR_sum  ConfAbbrev     TeamName  FirstD1Season  \\\n",
+       "0       21.666667       21.666667     big_ten    Wisconsin           1985   \n",
+       "1       26.000000       26.000000         acc  Wake Forest           1985   \n",
+       "2       24.000000       24.000000        cusa   Louisville           1985   \n",
+       "3       26.000000       26.000000  big_twelve     Missouri           1985   \n",
+       "4       21.333333       21.333333    big_east   Pittsburgh           1985   \n",
+       "\n",
+       "   LastD1Season  Seed  \n",
+       "0          2024   Y05  \n",
+       "1          2024   W02  \n",
+       "2          2024   W04  \n",
+       "3          2024   Y06  \n",
+       "4          2024   Y02  \n",
+       "\n",
+       "[5 rows x 1068 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_games_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58e4fee8",
+   "metadata": {},
+   "source": [
+    "# Feature Selection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1251726e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Targets: the four tourney_Score_* summary columns the network predicts.\n",
+    "target_cols = [\n",
+    "    \"tourney_Score_mean\",\n",
+    "    \"tourney_Score_std\",\n",
+    "    \"tourney_Score_max\",\n",
+    "    \"tourney_Score_min\",\n",
+    "]\n",
+    "target_df = all_games_df[target_cols]\n",
+    "\n",
+    "# Inputs: every column whose name starts with \"reg\", except those whose\n",
+    "# name contains \"_W\", \"_L\" or \"sum\".\n",
+    "excluded_tags = (\"_W\", \"_L\", \"sum\")\n",
+    "feature_cols = [\n",
+    "    name\n",
+    "    for name in all_games_df.columns\n",
+    "    if name.startswith(\"reg\") and not any(tag in name for tag in excluded_tags)\n",
+    "]\n",
+    "features_df = all_games_df[feature_cols]\n",
+    "\n",
+    "# Cast both frames to float before splitting; this requires every selected\n",
+    "# column to be numeric.\n",
+    "X_all = features_df.astype(float)\n",
+    "y_all = target_df.astype(float)\n",
+    "\n",
+    "# 80/20 train/test split; the fixed random_state keeps it reproducible.\n",
+    "# NOTE(review): the split is by row. If several rows carry the same\n",
+    "# aggregated team statistics, train and test may overlap - confirm.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X_all,\n",
+    "    y_all,\n",
+    "    train_size=0.8,\n",
+    "    random_state=8,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "28478189",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 524 entries, 5 to 451\n",
+      "Data columns (total 71 columns):\n",
+      " #   Column            Non-Null Count  Dtype  \n",
+      "---  ------            --------------  -----  \n",
+      " 0   reg_Games         524 non-null    float64\n",
+      " 1   reg_Score_min     524 non-null    float64\n",
+      " 2   reg_Score_max     524 non-null    float64\n",
+      " 3   reg_Score_mean    524 non-null    float64\n",
+      " 4   reg_Score_median  524 non-null    float64\n",
+      " 5   reg_Score_std     524 non-null    float64\n",
+      " 6   reg_FGM_min       524 non-null    float64\n",
+      " 7   reg_FGM_max       524 non-null    float64\n",
+      " 8   reg_FGM_mean      524 non-null    float64\n",
+      " 9   reg_FGM_median    524 non-null    float64\n",
+      " 10  reg_FGM_std       524 non-null    float64\n",
+      " 11  reg_FGA_min       524 non-null    float64\n",
+      " 12  reg_FGA_max       524 non-null    float64\n",
+      " 13  reg_FGA_mean      524 non-null    float64\n",
+      " 14  reg_FGA_median    524 non-null    float64\n",
+      " 15  reg_FGA_std       524 non-null    float64\n",
+      " 16  reg_FTM_min       524 non-null    float64\n",
+      " 17  reg_FTM_max       524 non-null    float64\n",
+      " 18  reg_FTM_mean      524 non-null    float64\n",
+      " 19  reg_FTM_median    524 non-null    float64\n",
+      " 20  reg_FTM_std       524 non-null    float64\n",
+      " 21  reg_FTA_min       524 non-null    float64\n",
+      " 22  reg_FTA_max       524 non-null    float64\n",
+      " 23  reg_FTA_mean      524 non-null    float64\n",
+      " 24  reg_FTA_median    524 non-null    float64\n",
+      " 25  reg_FTA_std       524 non-null    float64\n",
+      " 26  reg_Ast_min       524 non-null    float64\n",
+      " 27  reg_Ast_max       524 non-null    float64\n",
+      " 28  reg_Ast_mean      524 non-null    float64\n",
+      " 29  reg_Ast_median    524 non-null    float64\n",
+      " 30  reg_Ast_std       524 non-null    float64\n",
+      " 31  reg_Blk_min       524 non-null    float64\n",
+      " 32  reg_Blk_max       524 non-null    float64\n",
+      " 33  reg_Blk_mean      524 non-null    float64\n",
+      " 34  reg_Blk_median    524 non-null    float64\n",
+      " 35  reg_Blk_std       524 non-null    float64\n",
+      " 36  reg_PF_min        524 non-null    float64\n",
+      " 37  reg_PF_max        524 non-null    float64\n",
+      " 38  reg_PF_mean       524 non-null    float64\n",
+      " 39  reg_PF_median     524 non-null    float64\n",
+      " 40  reg_PF_std        524 non-null    float64\n",
+      " 41  reg_Stl_min       524 non-null    float64\n",
+      " 42  reg_Stl_max       524 non-null    float64\n",
+      " 43  reg_Stl_mean      524 non-null    float64\n",
+      " 44  reg_Stl_median    524 non-null    float64\n",
+      " 45  reg_Stl_std       524 non-null    float64\n",
+      " 46  reg_TO_min        524 non-null    float64\n",
+      " 47  reg_TO_max        524 non-null    float64\n",
+      " 48  reg_TO_mean       524 non-null    float64\n",
+      " 49  reg_TO_median     524 non-null    float64\n",
+      " 50  reg_TO_std        524 non-null    float64\n",
+      " 51  reg_FGM3_min      524 non-null    float64\n",
+      " 52  reg_FGM3_max      524 non-null    float64\n",
+      " 53  reg_FGM3_mean     524 non-null    float64\n",
+      " 54  reg_FGM3_median   524 non-null    float64\n",
+      " 55  reg_FGM3_std      524 non-null    float64\n",
+      " 56  reg_FGA3_min      524 non-null    float64\n",
+      " 57  reg_FGA3_max      524 non-null    float64\n",
+      " 58  reg_FGA3_mean     524 non-null    float64\n",
+      " 59  reg_FGA3_median   524 non-null    float64\n",
+      " 60  reg_FGA3_std      524 non-null    float64\n",
+      " 61  reg_OR_min        524 non-null    float64\n",
+      " 62  reg_OR_max        524 non-null    float64\n",
+      " 63  reg_OR_mean       524 non-null    float64\n",
+      " 64  reg_OR_median     524 non-null    float64\n",
+      " 65  reg_OR_std        524 non-null    float64\n",
+      " 66  reg_DR_min        524 non-null    float64\n",
+      " 67  reg_DR_max        524 non-null    float64\n",
+      " 68  reg_DR_mean       524 non-null    float64\n",
+      " 69  reg_DR_median     524 non-null    float64\n",
+      " 70  reg_DR_std        524 non-null    float64\n",
+      "dtypes: float64(71)\n",
+      "memory usage: 294.8 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "X_train.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "04f4a0a6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Int64Index: 524 entries, 5 to 451\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column              Non-Null Count  Dtype  \n",
+      "---  ------              --------------  -----  \n",
+      " 0   tourney_Score_mean  524 non-null    float64\n",
+      " 1   tourney_Score_std   524 non-null    float64\n",
+      " 2   tourney_Score_max   524 non-null    float64\n",
+      " 3   tourney_Score_min   524 non-null    float64\n",
+      "dtypes: float64(4)\n",
+      "memory usage: 20.5 KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "y_train.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "40094cd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def to_device_tensor(frame):\n",
+    "    \"\"\"Return the values of ``frame`` as a float32 tensor placed on ``DEVICE``.\n",
+    "\n",
+    "    Args:\n",
+    "        frame: pandas DataFrame holding numeric values only.\n",
+    "\n",
+    "    Returns:\n",
+    "        A ``torch.float32`` tensor with the same shape as ``frame.values``,\n",
+    "        registered with the notebook-level ``DEVICE`` (cuda, mps or cpu).\n",
+    "    \"\"\"\n",
+    "    # torch.tensor with an explicit dtype replaces the legacy\n",
+    "    # torch.Tensor(...) constructor and the redundant .float() cast.\n",
+    "    # The tensor is built on the CPU first and then moved to DEVICE.\n",
+    "    return torch.tensor(frame.values, dtype=torch.float32).to(DEVICE)\n",
+    "\n",
+    "\n",
+    "# convert all datasets into tensors and register them\n",
+    "# with the device (cuda, mps or cpu)\n",
+    "X_trainT = to_device_tensor(X_train)\n",
+    "X_testT = to_device_tensor(X_test)\n",
+    "y_trainT = to_device_tensor(y_train)\n",
+    "y_testT = to_device_tensor(y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20bceb9a",
+   "metadata": {},
+   "source": [
+    "# Building Neural Network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "7b0573ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_features = len(X_train.columns)\n",
+    "\n",
+    "\n",
+    "class MadnessNN(nn.Module):\n",
+    "    \"\"\"Fully connected regressor: in_features -> 64 -> 32 -> 16 -> 8 -> out_features.\n",
+    "\n",
+    "    A ReLU follows every linear layer, including the output layer, so every\n",
+    "    prediction is clamped to be non-negative.\n",
+    "\n",
+    "    Args:\n",
+    "        in_features: Width of the input layer. When ``None`` (the default)\n",
+    "            the notebook-level ``num_features`` is used, so ``MadnessNN()``\n",
+    "            builds the same network as before.\n",
+    "        out_features: Number of values predicted per row. Defaults to 4.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, in_features=None, out_features: int = 4) -> None:\n",
+    "        super().__init__()\n",
+    "        if in_features is None:\n",
+    "            in_features = num_features\n",
+    "        # Layer names and creation order are unchanged so that seeded\n",
+    "        # initialisation and previously saved models stay compatible.\n",
+    "        self.input_layer = nn.Linear(in_features, 64)\n",
+    "        self.activation_func = nn.ReLU()\n",
+    "        self.layer1 = nn.Linear(64, 32)\n",
+    "        self.layer2 = nn.Linear(32, 16)\n",
+    "        self.layer3 = nn.Linear(16, 8)\n",
+    "        self.output_layer = nn.Linear(8, out_features)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Run ``x`` through the five linear layers, applying ReLU after each one.\"\"\"\n",
+    "        hidden_layers = (self.input_layer, self.layer1, self.layer2, self.layer3)\n",
+    "        for layer in hidden_layers:\n",
+    "            x = self.activation_func(layer(x))\n",
+    "        x = self.output_layer(x)\n",
+    "        # NOTE(review): a ReLU on the output layer is unusual for regression;\n",
+    "        # an output whose pre-activation is negative receives zero gradient.\n",
+    "        # Kept to preserve existing behaviour - confirm it is intentional.\n",
+    "        x = self.activation_func(x)\n",
+    "        return x\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "061e2b52",
+   "metadata": {},
+   "source": [
+    "# Training Loop"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "db035b9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[500 / 5000] Loss = 40.454681396484375\n",
+      "[1000 / 5000] Loss = 39.701454162597656\n",
+      "[1500 / 5000] Loss = 39.055484771728516\n",
+      "[2000 / 5000] Loss = 38.53948974609375\n",
+      "[2500 / 5000] Loss = 38.149085998535156\n",
+      "[3000 / 5000] Loss = 37.87413024902344\n",
+      "[3500 / 5000] Loss = 37.6934928894043\n",
+      "[4000 / 5000] Loss = 37.573673248291016\n",
+      "[4500 / 5000] Loss = 37.48927307128906\n",
+      "[5000 / 5000] Loss = 37.43183135986328\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fixed seed so that weight initialisation is reproducible.\n",
+    "torch.manual_seed(1)\n",
+    "\n",
+    "# The training tensors were registered with DEVICE, so the model must be\n",
+    "# moved there too; otherwise the forward pass raises a device-mismatch\n",
+    "# error whenever DEVICE is not the CPU.\n",
+    "model5000 = MadnessNN().to(DEVICE)\n",
+    "optimizer = optim.Adam(lr=0.001, params=model5000.parameters())\n",
+    "loss_fn = nn.MSELoss()\n",
+    "epochs = 5000\n",
+    "\n",
+    "# Full-batch training: each epoch is one optimizer step over the whole\n",
+    "# training set.\n",
+    "for epoch in range(1, epochs + 1):\n",
+    "    pred = model5000(X_trainT)\n",
+    "    loss = loss_fn(pred, y_trainT)\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "    # Clear the gradients so they do not accumulate into the next step.\n",
+    "    optimizer.zero_grad()\n",
+    "\n",
+    "    # Report the training loss every 500 epochs.\n",
+    "    if epoch % 500 == 0:\n",
+    "        print(f\"[{epoch} / {epochs}] Loss = {loss}\") \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "b62fd19c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save\n",
+    "# torch.save does not create missing parent directories, so make sure\n",
+    "# the output folder exists before writing (no-op when it already does).\n",
+    "os.makedirs(\"models\", exist_ok=True)\n",
+    "# The whole module object is pickled, so loading this file requires the\n",
+    "# MadnessNN class to be defined in the loading session.\n",
+    "torch.save(model5000, os.path.join(\"models\", \"model5000.pth\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "17694dc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MSE on testing data: 47.071144104003906\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Score the trained network on the held-out test tensors.\n",
+    "model5000.eval()\n",
+    "\n",
+    "# Gradients are not needed for scoring, so skip autograd bookkeeping.\n",
+    "with torch.no_grad():\n",
+    "    pred = model5000(X_testT)\n",
+    "    loss = loss_fn(pred, y_testT)\n",
+    "\n",
+    "print(f\"MSE on testing data: {loss}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

src/mens_pre_processing.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff