Jensen-holm committed on
Commit
dd0de3f
·
1 Parent(s): 95171ee

aggregating and summarizing team stats in mens_pre_processing so that we

Browse files

have a more informed dataset when it comes to evaluating team tournament
performance based on regular season performance

src/{m_pp.ipynb → .ipynb_checkpoints/m_pp-checkpoint.ipynb} RENAMED
@@ -163,54 +163,513 @@
163
  },
164
  {
165
  "cell_type": "code",
166
- "execution_count": 10,
167
  "metadata": {},
168
  "outputs": [],
169
  "source": [
 
 
 
170
  "\n",
171
- "def flatten_multi_idx(df: pd.DataFrame) -> None:\n",
172
- " df.columns = [\"_\".join(filter(None, col)) for col in df.columns.to_flat_index()]\n",
173
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  "\n",
175
- "def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
176
- " other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
177
- " agg_funcs = [np.sum, np.mean, np.median, np.std, np.min, np.max]\n",
178
- " dfs = {}\n",
179
- " subsets = [\"W\", \"L\"]\n",
180
- " for subset in subsets:\n",
181
- " sub = df[[col for col in df.columns if subset in col or col in other_cols]]\n",
182
- " agg_df = sub \\\n",
183
- " .groupby([f\"{subset}TeamID\", \"Season\"]) \\\n",
184
- " .agg({col: agg_funcs for col in sub.columns if col not in other_cols}) \\\n",
185
- " .reset_index()\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  " \n",
187
- " flatten_multi_idx(agg_df)\n",
188
- " agg_df[f\"total{subset}\"] = df \\\n",
189
- " .groupby([f\"{subset}TeamID\", \"Season\"])[f\"{subset}TeamID\"] \\\n",
190
- " .transform(\"count\")\n",
191
- " dfs[subset] = agg_df\n",
192
  "\n",
193
- " merged = pd.merge(\n",
194
- " left=dfs[\"W\"],\n",
195
- " right=dfs[\"L\"],\n",
196
- " left_on=[\"WTeamID\", \"Season\"],\n",
197
- " right_on=[\"LTeamID\", \"Season\"],\n",
198
- " )\n",
199
  "\n",
200
- " merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
201
- " merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
202
- " merged.drop([\"WTeamID\", \"LTeamID\"], axis=1, inplace=True)\n",
203
- " return merged\n",
204
  "\n",
205
- " # overall_stats_df = merged[[\"TeamID\", \"Season\", \"total_games\", \"WPA_sum\", \"LPA_sum\", \"total_games\"]]\n",
206
- " # # Combine stats from games won and games lost\n",
207
- " # overall_stats_df[\"TotalPA\"] = overall_stats_df[\"WPA_sum\"] + overall_stats_df[\"LPA_sum\"]\n",
208
- " return merged\n"
209
  ]
210
  },
211
  {
212
  "cell_type": "code",
213
- "execution_count": 11,
 
 
 
 
 
 
 
214
  "metadata": {},
215
  "outputs": [],
216
  "source": [
@@ -219,7 +678,7 @@
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": 12,
223
  "metadata": {},
224
  "outputs": [
225
  {
@@ -592,7 +1051,7 @@
592
  "[7605 rows x 203 columns]"
593
  ]
594
  },
595
- "execution_count": 12,
596
  "metadata": {},
597
  "output_type": "execute_result"
598
  }
 
163
  },
164
  {
165
  "cell_type": "code",
166
+ "execution_count": 4,
167
  "metadata": {},
168
  "outputs": [],
169
  "source": [
170
+ "# here we are defining the metrics that we want to look at (practically all of them) as features\n",
171
+ "# for building models. I want to do so with metrics regardless of winning and losing metrics, or at least\n",
172
+ "# make extra features with combined stats from wins and losses. Because of that, here I am defining them manually\n",
173
  "\n",
174
+ "outcomes = [\"W\", \"L\"]\n",
 
175
  "\n",
176
+ "metrics = [\n",
177
+ " \"FGM\", # field goals made\n",
178
+ " \"FGA\", # field goals attempted\n",
179
+ " \"FGM3\", # three pointers made\n",
180
+ " \"FGA3\", # three pointers attempted\n",
181
+ " \"FTM\", # free throws made\n",
182
+ " \"FTA\", # free throws attempted\n",
183
+ " \"OR\", # Offensive rebounds\n",
184
+ " \"DR\", # Defensive rebounds\n",
185
+ " \"Ast\", # assists\n",
186
+ " \"TO\", # turnovers\n",
187
+ " \"Stl\", # steals\n",
188
+ " \"Blk\", # blocks\n",
189
+ " \"PF\", # personal fouls\n",
190
+ "]\n"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 5,
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# when doing groupbys and aggregations on our data, it is important to keep it readable. At times when\n",
200
+ "# a dataframe's columns are turned into a MultiIndex, call this function to flatten them out.\n",
201
+ "def flatten_multi_idx(df: pd.DataFrame) -> None:\n",
202
+ " df.columns = [\"_\".join(filter(None, col)) for col in df.columns.to_flat_index()]\n"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 39,
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": [
211
+ "# here we will summarize each team's statistics by creating new columns for each metric we are interested in\n",
212
+ "# that are the combined result of each team's winning stats and losing stats\n",
213
  "\n",
214
+ "def summarize_teams(szn_df: pd.DataFrame) -> pd.DataFrame:\n",
215
+ " ovr_df = szn_df.copy()\n",
216
+ " \n",
217
+ " agg_funcs = [np.mean, np.sum, np.std, np.median, np.min, np.max]\n",
218
+ " agg_dict = {f\"{outcome}{metric}\": agg_funcs for metric in metrics for outcome in outcomes}\n",
219
+ " w_team_sum_df = ovr_df.groupby([\"WTeamID\", \"Season\"]).agg(agg_dict).reset_index()\n",
220
+ " l_team_sum_df = ovr_df.groupby([\"LTeamID\", \"Season\"]).agg(agg_dict).reset_index()\n",
221
+ " \n",
222
+ " flatten_multi_idx(l_team_sum_df)\n",
223
+ " flatten_multi_idx(w_team_sum_df)\n",
224
+ " \n",
225
+ " w_team_sum_df.drop([col for col in w_team_sum_df.columns if \"L\" in col], axis=1, inplace=True)\n",
226
+ " l_team_sum_df.drop([col for col in l_team_sum_df.columns if \"W\" in col], axis=1, inplace=True)\n",
227
+ " \n",
228
+ " w_team_sum_df[\"TeamID\"] = w_team_sum_df[\"WTeamID\"]\n",
229
+ " l_team_sum_df[\"TeamID\"] = l_team_sum_df[\"LTeamID\"]\n",
230
+ " \n",
231
+ " w_team_sum_df.drop([\"WTeamID\"], axis=1, inplace=True)\n",
232
+ " l_team_sum_df.drop([\"LTeamID\"], axis=1, inplace=True)\n",
233
+ " \n",
234
+ " ovr_team_df = pd.merge(\n",
235
+ " left=w_team_sum_df,\n",
236
+ " right=l_team_sum_df,\n",
237
+ " on=[\"TeamID\", \"Season\"],\n",
238
+ " )\n",
239
+ " \n",
240
+ " # calculate the total of all metrics\n",
241
+ " for metric in metrics:\n",
242
+ " ovr_team_df[f\"tot_{metric}\"] = ovr_team_df.apply(\n",
243
+ " lambda team: team[f\"W{metric}_sum\"] + team[f\"L{metric}_sum\"],\n",
244
+ " axis=1,\n",
245
+ " )\n",
246
+ " \n",
247
+ " return ovr_team_df\n"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 40,
253
+ "metadata": {},
254
+ "outputs": [
255
+ {
256
+ "data": {
257
+ "text/html": [
258
+ "<div>\n",
259
+ "<style scoped>\n",
260
+ " .dataframe tbody tr th:only-of-type {\n",
261
+ " vertical-align: middle;\n",
262
+ " }\n",
263
+ "\n",
264
+ " .dataframe tbody tr th {\n",
265
+ " vertical-align: top;\n",
266
+ " }\n",
267
+ "\n",
268
+ " .dataframe thead th {\n",
269
+ " text-align: right;\n",
270
+ " }\n",
271
+ "</style>\n",
272
+ "<table border=\"1\" class=\"dataframe\">\n",
273
+ " <thead>\n",
274
+ " <tr style=\"text-align: right;\">\n",
275
+ " <th></th>\n",
276
+ " <th>Season</th>\n",
277
+ " <th>WFGM_mean</th>\n",
278
+ " <th>WFGM_sum</th>\n",
279
+ " <th>WFGM_std</th>\n",
280
+ " <th>WFGM_median</th>\n",
281
+ " <th>WFGM_min</th>\n",
282
+ " <th>WFGM_max</th>\n",
283
+ " <th>WFGA_mean</th>\n",
284
+ " <th>WFGA_sum</th>\n",
285
+ " <th>WFGA_std</th>\n",
286
+ " <th>...</th>\n",
287
+ " <th>tot_FGA3</th>\n",
288
+ " <th>tot_FTM</th>\n",
289
+ " <th>tot_FTA</th>\n",
290
+ " <th>tot_OR</th>\n",
291
+ " <th>tot_DR</th>\n",
292
+ " <th>tot_Ast</th>\n",
293
+ " <th>tot_TO</th>\n",
294
+ " <th>tot_Stl</th>\n",
295
+ " <th>tot_Blk</th>\n",
296
+ " <th>tot_PF</th>\n",
297
+ " </tr>\n",
298
+ " </thead>\n",
299
+ " <tbody>\n",
300
+ " <tr>\n",
301
+ " <th>0</th>\n",
302
+ " <td>2014</td>\n",
303
+ " <td>26.000000</td>\n",
304
+ " <td>52</td>\n",
305
+ " <td>1.414214</td>\n",
306
+ " <td>26.0</td>\n",
307
+ " <td>25</td>\n",
308
+ " <td>27</td>\n",
309
+ " <td>48.500000</td>\n",
310
+ " <td>97</td>\n",
311
+ " <td>6.363961</td>\n",
312
+ " <td>...</td>\n",
313
+ " <td>375.0</td>\n",
314
+ " <td>332.0</td>\n",
315
+ " <td>445.0</td>\n",
316
+ " <td>168.0</td>\n",
317
+ " <td>427.0</td>\n",
318
+ " <td>210.0</td>\n",
319
+ " <td>315.0</td>\n",
320
+ " <td>121.0</td>\n",
321
+ " <td>31.0</td>\n",
322
+ " <td>453.0</td>\n",
323
+ " </tr>\n",
324
+ " <tr>\n",
325
+ " <th>1</th>\n",
326
+ " <td>2015</td>\n",
327
+ " <td>27.000000</td>\n",
328
+ " <td>189</td>\n",
329
+ " <td>5.291503</td>\n",
330
+ " <td>24.0</td>\n",
331
+ " <td>22</td>\n",
332
+ " <td>34</td>\n",
333
+ " <td>53.000000</td>\n",
334
+ " <td>371</td>\n",
335
+ " <td>5.773503</td>\n",
336
+ " <td>...</td>\n",
337
+ " <td>537.0</td>\n",
338
+ " <td>305.0</td>\n",
339
+ " <td>419.0</td>\n",
340
+ " <td>231.0</td>\n",
341
+ " <td>550.0</td>\n",
342
+ " <td>332.0</td>\n",
343
+ " <td>359.0</td>\n",
344
+ " <td>166.0</td>\n",
345
+ " <td>33.0</td>\n",
346
+ " <td>577.0</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>2</th>\n",
350
+ " <td>2016</td>\n",
351
+ " <td>25.666667</td>\n",
352
+ " <td>231</td>\n",
353
+ " <td>2.872281</td>\n",
354
+ " <td>27.0</td>\n",
355
+ " <td>21</td>\n",
356
+ " <td>28</td>\n",
357
+ " <td>54.000000</td>\n",
358
+ " <td>486</td>\n",
359
+ " <td>4.555217</td>\n",
360
+ " <td>...</td>\n",
361
+ " <td>509.0</td>\n",
362
+ " <td>415.0</td>\n",
363
+ " <td>587.0</td>\n",
364
+ " <td>221.0</td>\n",
365
+ " <td>608.0</td>\n",
366
+ " <td>348.0</td>\n",
367
+ " <td>362.0</td>\n",
368
+ " <td>182.0</td>\n",
369
+ " <td>66.0</td>\n",
370
+ " <td>604.0</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>3</th>\n",
374
+ " <td>2017</td>\n",
375
+ " <td>24.000000</td>\n",
376
+ " <td>216</td>\n",
377
+ " <td>3.162278</td>\n",
378
+ " <td>25.0</td>\n",
379
+ " <td>19</td>\n",
380
+ " <td>28</td>\n",
381
+ " <td>49.555556</td>\n",
382
+ " <td>446</td>\n",
383
+ " <td>5.981453</td>\n",
384
+ " <td>...</td>\n",
385
+ " <td>477.0</td>\n",
386
+ " <td>298.0</td>\n",
387
+ " <td>464.0</td>\n",
388
+ " <td>189.0</td>\n",
389
+ " <td>572.0</td>\n",
390
+ " <td>340.0</td>\n",
391
+ " <td>362.0</td>\n",
392
+ " <td>175.0</td>\n",
393
+ " <td>69.0</td>\n",
394
+ " <td>554.0</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <th>4</th>\n",
398
+ " <td>2018</td>\n",
399
+ " <td>27.416667</td>\n",
400
+ " <td>329</td>\n",
401
+ " <td>3.964807</td>\n",
402
+ " <td>27.0</td>\n",
403
+ " <td>22</td>\n",
404
+ " <td>34</td>\n",
405
+ " <td>57.250000</td>\n",
406
+ " <td>687</td>\n",
407
+ " <td>4.731423</td>\n",
408
+ " <td>...</td>\n",
409
+ " <td>539.0</td>\n",
410
+ " <td>355.0</td>\n",
411
+ " <td>504.0</td>\n",
412
+ " <td>244.0</td>\n",
413
+ " <td>627.0</td>\n",
414
+ " <td>375.0</td>\n",
415
+ " <td>389.0</td>\n",
416
+ " <td>193.0</td>\n",
417
+ " <td>98.0</td>\n",
418
+ " <td>568.0</td>\n",
419
+ " </tr>\n",
420
+ " <tr>\n",
421
+ " <th>...</th>\n",
422
+ " <td>...</td>\n",
423
+ " <td>...</td>\n",
424
+ " <td>...</td>\n",
425
+ " <td>...</td>\n",
426
+ " <td>...</td>\n",
427
+ " <td>...</td>\n",
428
+ " <td>...</td>\n",
429
+ " <td>...</td>\n",
430
+ " <td>...</td>\n",
431
+ " <td>...</td>\n",
432
+ " <td>...</td>\n",
433
+ " <td>...</td>\n",
434
+ " <td>...</td>\n",
435
+ " <td>...</td>\n",
436
+ " <td>...</td>\n",
437
+ " <td>...</td>\n",
438
+ " <td>...</td>\n",
439
+ " <td>...</td>\n",
440
+ " <td>...</td>\n",
441
+ " <td>...</td>\n",
442
+ " <td>...</td>\n",
443
+ " </tr>\n",
444
+ " <tr>\n",
445
+ " <th>7600</th>\n",
446
+ " <td>2023</td>\n",
447
+ " <td>24.153846</td>\n",
448
+ " <td>314</td>\n",
449
+ " <td>5.063697</td>\n",
450
+ " <td>25.0</td>\n",
451
+ " <td>16</td>\n",
452
+ " <td>31</td>\n",
453
+ " <td>51.461538</td>\n",
454
+ " <td>669</td>\n",
455
+ " <td>6.118488</td>\n",
456
+ " <td>...</td>\n",
457
+ " <td>649.0</td>\n",
458
+ " <td>384.0</td>\n",
459
+ " <td>506.0</td>\n",
460
+ " <td>149.0</td>\n",
461
+ " <td>676.0</td>\n",
462
+ " <td>357.0</td>\n",
463
+ " <td>384.0</td>\n",
464
+ " <td>209.0</td>\n",
465
+ " <td>85.0</td>\n",
466
+ " <td>454.0</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>7601</th>\n",
470
+ " <td>2024</td>\n",
471
+ " <td>23.000000</td>\n",
472
+ " <td>46</td>\n",
473
+ " <td>2.828427</td>\n",
474
+ " <td>23.0</td>\n",
475
+ " <td>21</td>\n",
476
+ " <td>25</td>\n",
477
+ " <td>45.500000</td>\n",
478
+ " <td>91</td>\n",
479
+ " <td>4.949747</td>\n",
480
+ " <td>...</td>\n",
481
+ " <td>684.0</td>\n",
482
+ " <td>233.0</td>\n",
483
+ " <td>330.0</td>\n",
484
+ " <td>168.0</td>\n",
485
+ " <td>565.0</td>\n",
486
+ " <td>287.0</td>\n",
487
+ " <td>336.0</td>\n",
488
+ " <td>171.0</td>\n",
489
+ " <td>57.0</td>\n",
490
+ " <td>395.0</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>7602</th>\n",
494
+ " <td>2023</td>\n",
495
+ " <td>25.583333</td>\n",
496
+ " <td>307</td>\n",
497
+ " <td>3.800917</td>\n",
498
+ " <td>26.0</td>\n",
499
+ " <td>19</td>\n",
500
+ " <td>31</td>\n",
501
+ " <td>57.000000</td>\n",
502
+ " <td>684</td>\n",
503
+ " <td>6.208499</td>\n",
504
+ " <td>...</td>\n",
505
+ " <td>827.0</td>\n",
506
+ " <td>359.0</td>\n",
507
+ " <td>513.0</td>\n",
508
+ " <td>240.0</td>\n",
509
+ " <td>675.0</td>\n",
510
+ " <td>443.0</td>\n",
511
+ " <td>398.0</td>\n",
512
+ " <td>178.0</td>\n",
513
+ " <td>92.0</td>\n",
514
+ " <td>600.0</td>\n",
515
+ " </tr>\n",
516
+ " <tr>\n",
517
+ " <th>7603</th>\n",
518
+ " <td>2024</td>\n",
519
+ " <td>27.166667</td>\n",
520
+ " <td>163</td>\n",
521
+ " <td>4.875107</td>\n",
522
+ " <td>28.5</td>\n",
523
+ " <td>21</td>\n",
524
+ " <td>32</td>\n",
525
+ " <td>60.166667</td>\n",
526
+ " <td>361</td>\n",
527
+ " <td>6.823977</td>\n",
528
+ " <td>...</td>\n",
529
+ " <td>626.0</td>\n",
530
+ " <td>250.0</td>\n",
531
+ " <td>363.0</td>\n",
532
+ " <td>164.0</td>\n",
533
+ " <td>448.0</td>\n",
534
+ " <td>289.0</td>\n",
535
+ " <td>253.0</td>\n",
536
+ " <td>163.0</td>\n",
537
+ " <td>105.0</td>\n",
538
+ " <td>403.0</td>\n",
539
+ " </tr>\n",
540
+ " <tr>\n",
541
+ " <th>7604</th>\n",
542
+ " <td>2024</td>\n",
543
+ " <td>28.285714</td>\n",
544
+ " <td>198</td>\n",
545
+ " <td>5.154748</td>\n",
546
+ " <td>31.0</td>\n",
547
+ " <td>19</td>\n",
548
+ " <td>34</td>\n",
549
+ " <td>57.142857</td>\n",
550
+ " <td>400</td>\n",
551
+ " <td>3.976119</td>\n",
552
+ " <td>...</td>\n",
553
+ " <td>576.0</td>\n",
554
+ " <td>226.0</td>\n",
555
+ " <td>292.0</td>\n",
556
+ " <td>155.0</td>\n",
557
+ " <td>459.0</td>\n",
558
+ " <td>318.0</td>\n",
559
+ " <td>231.0</td>\n",
560
+ " <td>155.0</td>\n",
561
+ " <td>61.0</td>\n",
562
+ " <td>332.0</td>\n",
563
+ " </tr>\n",
564
+ " </tbody>\n",
565
+ "</table>\n",
566
+ "<p>7605 rows × 171 columns</p>\n",
567
+ "</div>"
568
+ ],
569
+ "text/plain": [
570
+ " Season WFGM_mean WFGM_sum WFGM_std WFGM_median WFGM_min WFGM_max \\\n",
571
+ "0 2014 26.000000 52 1.414214 26.0 25 27 \n",
572
+ "1 2015 27.000000 189 5.291503 24.0 22 34 \n",
573
+ "2 2016 25.666667 231 2.872281 27.0 21 28 \n",
574
+ "3 2017 24.000000 216 3.162278 25.0 19 28 \n",
575
+ "4 2018 27.416667 329 3.964807 27.0 22 34 \n",
576
+ "... ... ... ... ... ... ... ... \n",
577
+ "7600 2023 24.153846 314 5.063697 25.0 16 31 \n",
578
+ "7601 2024 23.000000 46 2.828427 23.0 21 25 \n",
579
+ "7602 2023 25.583333 307 3.800917 26.0 19 31 \n",
580
+ "7603 2024 27.166667 163 4.875107 28.5 21 32 \n",
581
+ "7604 2024 28.285714 198 5.154748 31.0 19 34 \n",
582
+ "\n",
583
+ " WFGA_mean WFGA_sum WFGA_std ... tot_FGA3 tot_FTM tot_FTA tot_OR \\\n",
584
+ "0 48.500000 97 6.363961 ... 375.0 332.0 445.0 168.0 \n",
585
+ "1 53.000000 371 5.773503 ... 537.0 305.0 419.0 231.0 \n",
586
+ "2 54.000000 486 4.555217 ... 509.0 415.0 587.0 221.0 \n",
587
+ "3 49.555556 446 5.981453 ... 477.0 298.0 464.0 189.0 \n",
588
+ "4 57.250000 687 4.731423 ... 539.0 355.0 504.0 244.0 \n",
589
+ "... ... ... ... ... ... ... ... ... \n",
590
+ "7600 51.461538 669 6.118488 ... 649.0 384.0 506.0 149.0 \n",
591
+ "7601 45.500000 91 4.949747 ... 684.0 233.0 330.0 168.0 \n",
592
+ "7602 57.000000 684 6.208499 ... 827.0 359.0 513.0 240.0 \n",
593
+ "7603 60.166667 361 6.823977 ... 626.0 250.0 363.0 164.0 \n",
594
+ "7604 57.142857 400 3.976119 ... 576.0 226.0 292.0 155.0 \n",
595
+ "\n",
596
+ " tot_DR tot_Ast tot_TO tot_Stl tot_Blk tot_PF \n",
597
+ "0 427.0 210.0 315.0 121.0 31.0 453.0 \n",
598
+ "1 550.0 332.0 359.0 166.0 33.0 577.0 \n",
599
+ "2 608.0 348.0 362.0 182.0 66.0 604.0 \n",
600
+ "3 572.0 340.0 362.0 175.0 69.0 554.0 \n",
601
+ "4 627.0 375.0 389.0 193.0 98.0 568.0 \n",
602
+ "... ... ... ... ... ... ... \n",
603
+ "7600 676.0 357.0 384.0 209.0 85.0 454.0 \n",
604
+ "7601 565.0 287.0 336.0 171.0 57.0 395.0 \n",
605
+ "7602 675.0 443.0 398.0 178.0 92.0 600.0 \n",
606
+ "7603 448.0 289.0 253.0 163.0 105.0 403.0 \n",
607
+ "7604 459.0 318.0 231.0 155.0 61.0 332.0 \n",
608
+ "\n",
609
+ "[7605 rows x 171 columns]"
610
+ ]
611
+ },
612
+ "execution_count": 40,
613
+ "metadata": {},
614
+ "output_type": "execute_result"
615
+ }
616
+ ],
617
+ "source": [
618
+ "summarize_teams(reg_games_df)"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "code",
623
+ "execution_count": 20,
624
+ "metadata": {},
625
+ "outputs": [],
626
+ "source": [
627
+ "# def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
628
+ "# other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
629
+ "# agg_funcs = [np.sum, np.mean, np.median, np.std, np.min, np.max]\n",
630
+ "# dfs = {}\n",
631
+ "# subsets = [\"W\", \"L\"]\n",
632
+ "# for subset in subsets:\n",
633
+ "# sub = df[[col for col in df.columns if subset in col or col in other_cols]]\n",
634
+ "# agg_df = sub \\\n",
635
+ "# .groupby([f\"{subset}TeamID\", \"Season\"]) \\\n",
636
+ "# .agg({col: agg_funcs for col in sub.columns if col not in other_cols}) \\\n",
637
+ "# .reset_index()\n",
638
  " \n",
639
+ "# flatten_multi_idx(agg_df)\n",
640
+ "# agg_df[f\"total{subset}\"] = df \\\n",
641
+ "# .groupby([f\"{subset}TeamID\", \"Season\"])[f\"{subset}TeamID\"] \\\n",
642
+ "# .transform(\"count\")\n",
643
+ "# dfs[subset] = agg_df\n",
644
  "\n",
645
+ "# merged = pd.merge(\n",
646
+ "# left=dfs[\"W\"],\n",
647
+ "# right=dfs[\"L\"],\n",
648
+ "# left_on=[\"WTeamID\", \"Season\"],\n",
649
+ "# right_on=[\"LTeamID\", \"Season\"],\n",
650
+ "# )\n",
651
  "\n",
652
+ "# merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
653
+ "# merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
654
+ "# merged.drop([\"WTeamID\", \"LTeamID\"], axis=1, inplace=True)\n",
655
+ "# return merged\n",
656
  "\n",
657
+ "# # overall_stats_df = merged[[\"TeamID\", \"Season\", \"total_games\", \"WPA_sum\", \"LPA_sum\", \"total_games\"]]\n",
658
+ "# # # Combine stats from games won and games lost\n",
659
+ "# # overall_stats_df[\"TotalPA\"] = overall_stats_df[\"WPA_sum\"] + overall_stats_df[\"LPA_sum\"]\n",
660
+ "# return merged"
661
  ]
662
  },
663
  {
664
  "cell_type": "code",
665
+ "execution_count": null,
666
+ "metadata": {},
667
+ "outputs": [],
668
+ "source": []
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 18,
673
  "metadata": {},
674
  "outputs": [],
675
  "source": [
 
678
  },
679
  {
680
  "cell_type": "code",
681
+ "execution_count": 19,
682
  "metadata": {},
683
  "outputs": [
684
  {
 
1051
  "[7605 rows x 203 columns]"
1052
  ]
1053
  },
1054
+ "execution_count": 19,
1055
  "metadata": {},
1056
  "output_type": "execute_result"
1057
  }
src/mens_monte_carlo.ipynb DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "import numpy as np\n",
11
- "import os\n",
12
- "\n",
13
- "DATA_DIR = os.path.join(\"..\", \"data\")"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": null,
19
- "metadata": {},
20
- "outputs": [],
21
- "source": []
22
- }
23
- ],
24
- "metadata": {
25
- "kernelspec": {
26
- "display_name": "Python 3",
27
- "language": "python",
28
- "name": "python3"
29
- },
30
- "language_info": {
31
- "codemirror_mode": {
32
- "name": "ipython",
33
- "version": 3
34
- },
35
- "file_extension": ".py",
36
- "mimetype": "text/x-python",
37
- "name": "python",
38
- "nbconvert_exporter": "python",
39
- "pygments_lexer": "ipython3",
40
- "version": "3.11.7"
41
- }
42
- },
43
- "nbformat": 4,
44
- "nbformat_minor": 2
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/mens_pre_processing.ipynb CHANGED
The diff for this file is too large to render. See raw diff