aggregating and summarizing team stats in mens_pre_processing so that we

have a more informed dataset when it comes to evaluating team tournament
performance based on regular season performance

Files changed (3) hide show

src/{m_pp.ipynb → .ipynb_checkpoints/m_pp-checkpoint.ipynb} +495 -36
src/mens_monte_carlo.ipynb +0 -45
src/mens_pre_processing.ipynb +0 -0

src/{m_pp.ipynb → .ipynb_checkpoints/m_pp-checkpoint.ipynb} RENAMED Viewed

@@ -163,54 +163,513 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
-    "def flatten_multi_idx(df: pd.DataFrame) -> None:\n",
-    "    df.columns = [\"_\".join(filter(None, col)) for col in df.columns.to_flat_index()]\n",
     "\n",
     "\n",
-    "def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
-    "    other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
-    "    agg_funcs = [np.sum, np.mean, np.median, np.std, np.min, np.max]\n",
-    "    dfs = {}\n",
-    "    subsets = [\"W\", \"L\"]\n",
-    "    for subset in subsets:\n",
-    "        sub = df[[col for col in df.columns if subset in col or col in other_cols]]\n",
-    "        agg_df = sub \\\n",
-    "            .groupby([f\"{subset}TeamID\", \"Season\"]) \\\n",
-    "            .agg({col: agg_funcs for col in sub.columns if col not in other_cols}) \\\n",
-    "            .reset_index()\n",
     "        \n",
-    "        flatten_multi_idx(agg_df)\n",
-    "        agg_df[f\"total{subset}\"] = df \\\n",
-    "            .groupby([f\"{subset}TeamID\", \"Season\"])[f\"{subset}TeamID\"] \\\n",
-    "            .transform(\"count\")\n",
-    "        dfs[subset] = agg_df\n",
     "\n",
-    "    merged = pd.merge(\n",
-    "        left=dfs[\"W\"],\n",
-    "        right=dfs[\"L\"],\n",
-    "        left_on=[\"WTeamID\", \"Season\"],\n",
-    "        right_on=[\"LTeamID\", \"Season\"],\n",
-    "    )\n",
     "\n",
-    "    merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
-    "    merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
-    "    merged.drop([\"WTeamID\", \"LTeamID\"], axis=1, inplace=True)\n",
-    "    return merged\n",
     "\n",
-    "    # overall_stats_df = merged[[\"TeamID\", \"Season\", \"total_games\", \"WPA_sum\", \"LPA_sum\", \"total_games\"]]\n",
-    "    # # Combine stats from games won and games lost\n",
-    "    # overall_stats_df[\"TotalPA\"] = overall_stats_df[\"WPA_sum\"] + overall_stats_df[\"LPA_sum\"]\n",
-    "    return merged\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -219,7 +678,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -592,7 +1051,7 @@
        "[7605 rows x 203 columns]"
       ]
      },
-     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }

   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Here we define the metrics (practically all of them) that we want to use as features for building\n",
+    "# models. I want features that do not depend on whether a stat came from a win or a loss, or at least\n",
+    "# extra features that combine the stats from wins and losses. Because of that, I define them manually here.\n",
     "\n",
+    "outcomes = [\"W\", \"L\"]\n",
     "\n",
+    "metrics = [\n",
+    "    \"FGM\", # field goals made\n",
+    "    \"FGA\", # field goals attempted\n",
+    "    \"FGM3\", # three pointers made\n",
+    "    \"FGA3\", # three pointers attempted\n",
+    "    \"FTM\", # free throws made\n",
+    "    \"FTA\", # free throws attempted\n",
+    "    \"OR\", # Offensive rebounds\n",
+    "    \"DR\", # Defensive rebounds\n",
+    "    \"Ast\", # assists\n",
+    "    \"TO\", # turnovers\n",
+    "    \"Stl\", # steals\n",
+    "    \"Blk\", # blocks\n",
+    "    \"PF\", # personal fouls\n",
+    "]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# When doing groupbys and aggregations on our data, it is important to keep it readable. Whenever the\n",
+    "# columns of a dataframe become a MultiIndex, call this function to flatten them out.\n",
+    "def flatten_multi_idx(df: pd.DataFrame) -> None:\n",
+    "    df.columns = [\"_\".join(filter(None, col)) for col in df.columns.to_flat_index()]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Here we summarize each team's statistics by creating new columns, one for each metric we are interested in,\n",
+    "# that hold the combined result of each team's winning stats and losing stats.\n",
     "\n",
+    "def summarize_teams(szn_df: pd.DataFrame) -> pd.DataFrame:\n",
+    "    ovr_df = szn_df.copy()\n",
+    "    \n",
+    "    agg_funcs = [np.mean, np.sum, np.std, np.median, np.min, np.max]\n",
+    "    agg_dict = {f\"{outcome}{metric}\": agg_funcs for metric in metrics for outcome in outcomes}\n",
+    "    w_team_sum_df = ovr_df.groupby([\"WTeamID\", \"Season\"]).agg(agg_dict).reset_index()\n",
+    "    l_team_sum_df = ovr_df.groupby([\"LTeamID\", \"Season\"]).agg(agg_dict).reset_index()\n",
+    "    \n",
+    "    flatten_multi_idx(l_team_sum_df)\n",
+    "    flatten_multi_idx(w_team_sum_df)\n",
+    "    \n",
+    "    w_team_sum_df.drop([col for col in w_team_sum_df.columns if \"L\" in col], axis=1, inplace=True)\n",
+    "    l_team_sum_df.drop([col for col in l_team_sum_df.columns if \"W\" in col], axis=1, inplace=True)\n",
+    "    \n",
+    "    w_team_sum_df[\"TeamID\"] = w_team_sum_df[\"WTeamID\"]\n",
+    "    l_team_sum_df[\"TeamID\"] = l_team_sum_df[\"LTeamID\"]\n",
+    "    \n",
+    "    w_team_sum_df.drop([\"WTeamID\"], axis=1, inplace=True)\n",
+    "    l_team_sum_df.drop([\"LTeamID\"], axis=1, inplace=True)\n",
+    "    \n",
+    "    ovr_team_df = pd.merge(\n",
+    "        left=w_team_sum_df,\n",
+    "        right=l_team_sum_df,\n",
+    "        on=[\"TeamID\", \"Season\"],\n",
+    "    )\n",
+    "    \n",
+    "    # calculate the total of all metrics\n",
+    "    for metric in metrics:\n",
+    "        ovr_team_df[f\"tot_{metric}\"] = ovr_team_df.apply(\n",
+    "            lambda team: team[f\"W{metric}_sum\"] + team[f\"L{metric}_sum\"],\n",
+    "            axis=1,\n",
+    "        )\n",
+    "    \n",
+    "    return ovr_team_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Season</th>\n",
+       "      <th>WFGM_mean</th>\n",
+       "      <th>WFGM_sum</th>\n",
+       "      <th>WFGM_std</th>\n",
+       "      <th>WFGM_median</th>\n",
+       "      <th>WFGM_min</th>\n",
+       "      <th>WFGM_max</th>\n",
+       "      <th>WFGA_mean</th>\n",
+       "      <th>WFGA_sum</th>\n",
+       "      <th>WFGA_std</th>\n",
+       "      <th>...</th>\n",
+       "      <th>tot_FGA3</th>\n",
+       "      <th>tot_FTM</th>\n",
+       "      <th>tot_FTA</th>\n",
+       "      <th>tot_OR</th>\n",
+       "      <th>tot_DR</th>\n",
+       "      <th>tot_Ast</th>\n",
+       "      <th>tot_TO</th>\n",
+       "      <th>tot_Stl</th>\n",
+       "      <th>tot_Blk</th>\n",
+       "      <th>tot_PF</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2014</td>\n",
+       "      <td>26.000000</td>\n",
+       "      <td>52</td>\n",
+       "      <td>1.414214</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>25</td>\n",
+       "      <td>27</td>\n",
+       "      <td>48.500000</td>\n",
+       "      <td>97</td>\n",
+       "      <td>6.363961</td>\n",
+       "      <td>...</td>\n",
+       "      <td>375.0</td>\n",
+       "      <td>332.0</td>\n",
+       "      <td>445.0</td>\n",
+       "      <td>168.0</td>\n",
+       "      <td>427.0</td>\n",
+       "      <td>210.0</td>\n",
+       "      <td>315.0</td>\n",
+       "      <td>121.0</td>\n",
+       "      <td>31.0</td>\n",
+       "      <td>453.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2015</td>\n",
+       "      <td>27.000000</td>\n",
+       "      <td>189</td>\n",
+       "      <td>5.291503</td>\n",
+       "      <td>24.0</td>\n",
+       "      <td>22</td>\n",
+       "      <td>34</td>\n",
+       "      <td>53.000000</td>\n",
+       "      <td>371</td>\n",
+       "      <td>5.773503</td>\n",
+       "      <td>...</td>\n",
+       "      <td>537.0</td>\n",
+       "      <td>305.0</td>\n",
+       "      <td>419.0</td>\n",
+       "      <td>231.0</td>\n",
+       "      <td>550.0</td>\n",
+       "      <td>332.0</td>\n",
+       "      <td>359.0</td>\n",
+       "      <td>166.0</td>\n",
+       "      <td>33.0</td>\n",
+       "      <td>577.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2016</td>\n",
+       "      <td>25.666667</td>\n",
+       "      <td>231</td>\n",
+       "      <td>2.872281</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>21</td>\n",
+       "      <td>28</td>\n",
+       "      <td>54.000000</td>\n",
+       "      <td>486</td>\n",
+       "      <td>4.555217</td>\n",
+       "      <td>...</td>\n",
+       "      <td>509.0</td>\n",
+       "      <td>415.0</td>\n",
+       "      <td>587.0</td>\n",
+       "      <td>221.0</td>\n",
+       "      <td>608.0</td>\n",
+       "      <td>348.0</td>\n",
+       "      <td>362.0</td>\n",
+       "      <td>182.0</td>\n",
+       "      <td>66.0</td>\n",
+       "      <td>604.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2017</td>\n",
+       "      <td>24.000000</td>\n",
+       "      <td>216</td>\n",
+       "      <td>3.162278</td>\n",
+       "      <td>25.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>28</td>\n",
+       "      <td>49.555556</td>\n",
+       "      <td>446</td>\n",
+       "      <td>5.981453</td>\n",
+       "      <td>...</td>\n",
+       "      <td>477.0</td>\n",
+       "      <td>298.0</td>\n",
+       "      <td>464.0</td>\n",
+       "      <td>189.0</td>\n",
+       "      <td>572.0</td>\n",
+       "      <td>340.0</td>\n",
+       "      <td>362.0</td>\n",
+       "      <td>175.0</td>\n",
+       "      <td>69.0</td>\n",
+       "      <td>554.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>27.416667</td>\n",
+       "      <td>329</td>\n",
+       "      <td>3.964807</td>\n",
+       "      <td>27.0</td>\n",
+       "      <td>22</td>\n",
+       "      <td>34</td>\n",
+       "      <td>57.250000</td>\n",
+       "      <td>687</td>\n",
+       "      <td>4.731423</td>\n",
+       "      <td>...</td>\n",
+       "      <td>539.0</td>\n",
+       "      <td>355.0</td>\n",
+       "      <td>504.0</td>\n",
+       "      <td>244.0</td>\n",
+       "      <td>627.0</td>\n",
+       "      <td>375.0</td>\n",
+       "      <td>389.0</td>\n",
+       "      <td>193.0</td>\n",
+       "      <td>98.0</td>\n",
+       "      <td>568.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7600</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>24.153846</td>\n",
+       "      <td>314</td>\n",
+       "      <td>5.063697</td>\n",
+       "      <td>25.0</td>\n",
+       "      <td>16</td>\n",
+       "      <td>31</td>\n",
+       "      <td>51.461538</td>\n",
+       "      <td>669</td>\n",
+       "      <td>6.118488</td>\n",
+       "      <td>...</td>\n",
+       "      <td>649.0</td>\n",
+       "      <td>384.0</td>\n",
+       "      <td>506.0</td>\n",
+       "      <td>149.0</td>\n",
+       "      <td>676.0</td>\n",
+       "      <td>357.0</td>\n",
+       "      <td>384.0</td>\n",
+       "      <td>209.0</td>\n",
+       "      <td>85.0</td>\n",
+       "      <td>454.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7601</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>23.000000</td>\n",
+       "      <td>46</td>\n",
+       "      <td>2.828427</td>\n",
+       "      <td>23.0</td>\n",
+       "      <td>21</td>\n",
+       "      <td>25</td>\n",
+       "      <td>45.500000</td>\n",
+       "      <td>91</td>\n",
+       "      <td>4.949747</td>\n",
+       "      <td>...</td>\n",
+       "      <td>684.0</td>\n",
+       "      <td>233.0</td>\n",
+       "      <td>330.0</td>\n",
+       "      <td>168.0</td>\n",
+       "      <td>565.0</td>\n",
+       "      <td>287.0</td>\n",
+       "      <td>336.0</td>\n",
+       "      <td>171.0</td>\n",
+       "      <td>57.0</td>\n",
+       "      <td>395.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7602</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>25.583333</td>\n",
+       "      <td>307</td>\n",
+       "      <td>3.800917</td>\n",
+       "      <td>26.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>31</td>\n",
+       "      <td>57.000000</td>\n",
+       "      <td>684</td>\n",
+       "      <td>6.208499</td>\n",
+       "      <td>...</td>\n",
+       "      <td>827.0</td>\n",
+       "      <td>359.0</td>\n",
+       "      <td>513.0</td>\n",
+       "      <td>240.0</td>\n",
+       "      <td>675.0</td>\n",
+       "      <td>443.0</td>\n",
+       "      <td>398.0</td>\n",
+       "      <td>178.0</td>\n",
+       "      <td>92.0</td>\n",
+       "      <td>600.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7603</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>27.166667</td>\n",
+       "      <td>163</td>\n",
+       "      <td>4.875107</td>\n",
+       "      <td>28.5</td>\n",
+       "      <td>21</td>\n",
+       "      <td>32</td>\n",
+       "      <td>60.166667</td>\n",
+       "      <td>361</td>\n",
+       "      <td>6.823977</td>\n",
+       "      <td>...</td>\n",
+       "      <td>626.0</td>\n",
+       "      <td>250.0</td>\n",
+       "      <td>363.0</td>\n",
+       "      <td>164.0</td>\n",
+       "      <td>448.0</td>\n",
+       "      <td>289.0</td>\n",
+       "      <td>253.0</td>\n",
+       "      <td>163.0</td>\n",
+       "      <td>105.0</td>\n",
+       "      <td>403.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7604</th>\n",
+       "      <td>2024</td>\n",
+       "      <td>28.285714</td>\n",
+       "      <td>198</td>\n",
+       "      <td>5.154748</td>\n",
+       "      <td>31.0</td>\n",
+       "      <td>19</td>\n",
+       "      <td>34</td>\n",
+       "      <td>57.142857</td>\n",
+       "      <td>400</td>\n",
+       "      <td>3.976119</td>\n",
+       "      <td>...</td>\n",
+       "      <td>576.0</td>\n",
+       "      <td>226.0</td>\n",
+       "      <td>292.0</td>\n",
+       "      <td>155.0</td>\n",
+       "      <td>459.0</td>\n",
+       "      <td>318.0</td>\n",
+       "      <td>231.0</td>\n",
+       "      <td>155.0</td>\n",
+       "      <td>61.0</td>\n",
+       "      <td>332.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>7605 rows × 171 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      Season  WFGM_mean  WFGM_sum  WFGM_std  WFGM_median  WFGM_min  WFGM_max  \\\n",
+       "0       2014  26.000000        52  1.414214         26.0        25        27   \n",
+       "1       2015  27.000000       189  5.291503         24.0        22        34   \n",
+       "2       2016  25.666667       231  2.872281         27.0        21        28   \n",
+       "3       2017  24.000000       216  3.162278         25.0        19        28   \n",
+       "4       2018  27.416667       329  3.964807         27.0        22        34   \n",
+       "...      ...        ...       ...       ...          ...       ...       ...   \n",
+       "7600    2023  24.153846       314  5.063697         25.0        16        31   \n",
+       "7601    2024  23.000000        46  2.828427         23.0        21        25   \n",
+       "7602    2023  25.583333       307  3.800917         26.0        19        31   \n",
+       "7603    2024  27.166667       163  4.875107         28.5        21        32   \n",
+       "7604    2024  28.285714       198  5.154748         31.0        19        34   \n",
+       "\n",
+       "      WFGA_mean  WFGA_sum  WFGA_std  ...  tot_FGA3  tot_FTM  tot_FTA  tot_OR  \\\n",
+       "0     48.500000        97  6.363961  ...     375.0    332.0    445.0   168.0   \n",
+       "1     53.000000       371  5.773503  ...     537.0    305.0    419.0   231.0   \n",
+       "2     54.000000       486  4.555217  ...     509.0    415.0    587.0   221.0   \n",
+       "3     49.555556       446  5.981453  ...     477.0    298.0    464.0   189.0   \n",
+       "4     57.250000       687  4.731423  ...     539.0    355.0    504.0   244.0   \n",
+       "...         ...       ...       ...  ...       ...      ...      ...     ...   \n",
+       "7600  51.461538       669  6.118488  ...     649.0    384.0    506.0   149.0   \n",
+       "7601  45.500000        91  4.949747  ...     684.0    233.0    330.0   168.0   \n",
+       "7602  57.000000       684  6.208499  ...     827.0    359.0    513.0   240.0   \n",
+       "7603  60.166667       361  6.823977  ...     626.0    250.0    363.0   164.0   \n",
+       "7604  57.142857       400  3.976119  ...     576.0    226.0    292.0   155.0   \n",
+       "\n",
+       "      tot_DR  tot_Ast  tot_TO  tot_Stl  tot_Blk  tot_PF  \n",
+       "0      427.0    210.0   315.0    121.0     31.0   453.0  \n",
+       "1      550.0    332.0   359.0    166.0     33.0   577.0  \n",
+       "2      608.0    348.0   362.0    182.0     66.0   604.0  \n",
+       "3      572.0    340.0   362.0    175.0     69.0   554.0  \n",
+       "4      627.0    375.0   389.0    193.0     98.0   568.0  \n",
+       "...      ...      ...     ...      ...      ...     ...  \n",
+       "7600   676.0    357.0   384.0    209.0     85.0   454.0  \n",
+       "7601   565.0    287.0   336.0    171.0     57.0   395.0  \n",
+       "7602   675.0    443.0   398.0    178.0     92.0   600.0  \n",
+       "7603   448.0    289.0   253.0    163.0    105.0   403.0  \n",
+       "7604   459.0    318.0   231.0    155.0     61.0   332.0  \n",
+       "\n",
+       "[7605 rows x 171 columns]"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "summarize_teams(reg_games_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
+    "#     other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
+    "#     agg_funcs = [np.sum, np.mean, np.median, np.std, np.min, np.max]\n",
+    "#     dfs = {}\n",
+    "#     subsets = [\"W\", \"L\"]\n",
+    "#     for subset in subsets:\n",
+    "#         sub = df[[col for col in df.columns if subset in col or col in other_cols]]\n",
+    "#         agg_df = sub \\\n",
+    "#             .groupby([f\"{subset}TeamID\", \"Season\"]) \\\n",
+    "#             .agg({col: agg_funcs for col in sub.columns if col not in other_cols}) \\\n",
+    "#             .reset_index()\n",
     "        \n",
+    "#         flatten_multi_idx(agg_df)\n",
+    "#         agg_df[f\"total{subset}\"] = df \\\n",
+    "#             .groupby([f\"{subset}TeamID\", \"Season\"])[f\"{subset}TeamID\"] \\\n",
+    "#             .transform(\"count\")\n",
+    "#         dfs[subset] = agg_df\n",
     "\n",
+    "#     merged = pd.merge(\n",
+    "#         left=dfs[\"W\"],\n",
+    "#         right=dfs[\"L\"],\n",
+    "#         left_on=[\"WTeamID\", \"Season\"],\n",
+    "#         right_on=[\"LTeamID\", \"Season\"],\n",
+    "#     )\n",
     "\n",
+    "#     merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
+    "#     merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
+    "#     merged.drop([\"WTeamID\", \"LTeamID\"], axis=1, inplace=True)\n",
+    "#     return merged\n",
     "\n",
+    "#     # overall_stats_df = merged[[\"TeamID\", \"Season\", \"total_games\", \"WPA_sum\", \"LPA_sum\", \"total_games\"]]\n",
+    "#     # # Combine stats from games won and games lost\n",
+    "#     # overall_stats_df[\"TotalPA\"] = overall_stats_df[\"WPA_sum\"] + overall_stats_df[\"LPA_sum\"]\n",
+    "#     return merged"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
        "[7605 rows x 203 columns]"
       ]
      },
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }

src/mens_monte_carlo.ipynb DELETED Viewed

@@ -1,45 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import os\n",
-    "\n",
-    "DATA_DIR = os.path.join(\"..\", \"data\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

src/mens_pre_processing.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff