import pandas as pd
import numpy as np
dr = '/Users/karlgregory/Desktop/stat540_data/'
trcols = ['NA','Place','First','Last','City','State','Age','Division','DP','Time','Rank']   # column names for the race files
tr23 = pd.read_table(dr + 'tr50k_2023.txt', header = None, names = trcols, usecols = range(1,len(trcols)))   # usecols drops the first column (the placeholder 'NA')
tr24 = pd.read_table(dr + 'tr50k_2024.txt', header = None, names = trcols, usecols = range(1,len(trcols)))
tr23
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 |
| 3 | 4 | Alex | Black | Marshall | NC | 33 | M | 4 | 4:56:19 | 77.94 |
| 4 | 5 | Mark | Rebholz | Charlotte | NC | 33 | M | 5 | 4:56:44 | 88.10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 182 | 183 | Angie | Cato | Wilson | NC | 47 | F | 40 | 9:41:57 | 58.85 |
| 183 | 184 | Irina | Kalashnik | Matthews | NC | 41 | F | 41 | 9:48:40 | 64.52 |
| 184 | 185 | Jay | Weatherington | Chapel Hill | NC | 33 | M | 144 | 9:48:48 | 47.95 |
| 185 | 186 | Rick | Gray | Johnson City | TN | 62 | M | 145 | 9:58:03 | 64.00 |
| 186 | 187 | Emily | Floess | Raleigh | NC | 35 | F | 42 | 10:30:53 | 62.58 |
187 rows × 10 columns
Merging data sets¶
Let's build a data set containing the runners who participated in both years. Keeping only the rows that match in both data sets is an "inner join".
tr2324 = pd.merge(tr23, tr24, on = ['First','Last'], suffixes = [' 2023',' 2024'])   # match runners on first and last name
tr2324 = tr2324[['First','Last','Time 2023','Time 2024']]   # keep just the names and the two years' times
tr2324
| | First | Last | Time 2023 | Time 2024 |
|---|---|---|---|---|
| 0 | Adam | Corey | 5:29:05 | 5:31:55 |
| 1 | Patrick | Donoghue | 5:50:24 | 5:55:23 |
| 2 | Travis | Pace | 5:53:17 | 5:46:06 |
| 3 | David | Nielsen | 5:55:03 | 6:01:44 |
| 4 | Matthew | Steed | 5:57:55 | 5:41:34 |
| 5 | Michelle | Gray | 6:17:49 | 6:48:08 |
| 6 | Andrew | Hall | 6:29:50 | 6:25:33 |
| 7 | Bryson | Peter | 6:31:10 | 6:44:51 |
| 8 | Tyson | Jeffus | 6:32:04 | 6:07:53 |
| 9 | Daniel | Merlin | 6:33:04 | 5:42:52 |
| 10 | Hunter | Steed | 6:35:50 | 6:52:47 |
| 11 | Ivan | Ng | 6:36:15 | 7:13:36 |
| 12 | Jake | Hinshaw | 6:39:17 | 6:25:35 |
| 13 | Ryan | Ford | 6:50:58 | 7:54:12 |
| 14 | Timothy | Newell | 7:20:10 | 6:55:19 |
| 15 | Jared | Wingler | 7:29:25 | 7:27:15 |
| 16 | Liana | Hastings | 7:33:00 | 8:41:13 |
| 17 | Kati | Cushman | 7:55:59 | 6:59:52 |
| 18 | Daniel | Wells | 7:57:52 | 8:35:15 |
| 19 | Spencer | Wells | 7:57:55 | 8:38:07 |
| 20 | Rio | Kira | 8:01:30 | 8:13:46 |
| 21 | Amy | McDowell | 9:15:15 | 9:44:09 |
| 22 | Rick | Gray | 9:58:03 | 10:18:03 |
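As an aside, pd.merge does an inner join by default; passing how = 'outer' would also keep runners who ran in only one of the two years, and indicator = True adds a '_merge' column recording which data set each row came from. A minimal sketch (the name tr_any is just for this sketch):

tr_any = pd.merge(tr23, tr24, on = ['First','Last'], suffixes = [' 2023',' 2024'], how = 'outer', indicator = True)
tr_any[['First','Last','Time 2023','Time 2024','_merge']]   # '_merge' is 'left_only', 'right_only', or 'both'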
Concatenating or stacking data sets¶
Putting data sets one on top of the other; the columns are matched by name.
# stack two data sets:
pd.concat([tr23,tr24])
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 |
| 3 | 4 | Alex | Black | Marshall | NC | 33 | M | 4 | 4:56:19 | 77.94 |
| 4 | 5 | Mark | Rebholz | Charlotte | NC | 33 | M | 5 | 4:56:44 | 88.10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 |
392 rows × 10 columns
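Note that the stacked result keeps each data set's original row index, which is why the index above runs up to 186 and then up to 204 again. Two sketches of common fixes:

pd.concat([tr23, tr24], ignore_index = True)   # build a fresh 0, 1, 2, ... index for the stacked result
pd.concat([tr23, tr24], keys = ['2023','2024'])   # or label each block, giving a MultiIndex that records the source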
# run a loop and stack many data sets
li = list()   # create an empty list. We want to put data sets into it.
for i in range(21,25):
    file = dr + 'tr50k_20' + str(i) + '.txt'   # construct the current file name
    tr_yr = pd.read_table(file, header = None, names = trcols, usecols = range(1,len(trcols)))   # read the data set
    tr_yr['Year'] = '20' + str(i)   # make a new column containing the year
    li.append(tr_yr)
tr = pd.concat(li)   # stack all four years
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 |
741 rows × 11 columns
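An alternative sketch: rather than constructing the file names by hand, you can let the glob module find every file matching the pattern (this assumes the directory contains exactly the tr50k files you want). It builds the same tr as above.

import glob
li = list()
for file in sorted(glob.glob(dr + 'tr50k_*.txt')):   # all matching files, in name order
    tr_yr = pd.read_table(file, header = None, names = trcols, usecols = range(1,len(trcols)))
    tr_yr['Year'] = file[-8:-4]   # the four characters before '.txt' are the year
    li.append(tr_yr)
tr = pd.concat(li)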
Aggregating data¶
Suppose we want to get the average finishing time in each division for each year of the race.
First, convert the hh:mm:ss character strings in the Time column to numbers of seconds.
def secs(ch):
    ch = ch.replace('`','')   # replace tick mark (indicating a tie) with nothing
    h, m, s = ch.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)
secs('7:05:21`')
25521
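An equivalent sketch using pandas' own parser: pd.to_timedelta understands hh:mm:ss strings, so after stripping the tick marks a whole column can be converted at once.

pd.to_timedelta(tr['Time'].str.replace('`','')).dt.total_seconds()   # same values as tr['Time'].map(secs), but as floats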
tr['secs'] = tr['Time'].map(secs)
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 | 18762 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 | 19314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 | 36748 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 | 36748 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 | 37083 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 | 37482 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 | 37484 |
741 rows × 12 columns
avgtm = tr.pivot_table(values = 'secs', index = ['Year','Division'], aggfunc = {'secs' : 'mean'})
avgtm
| Year | Division | secs |
|---|---|---|
| 2021 | F | 29119.214286 |
| | M | 26696.552846 |
| 2022 | F | 28396.342105 |
| | M | 26423.602740 |
| 2023 | F | 28869.833333 |
| | M | 25676.268966 |
| 2024 | F | 29381.733333 |
| | M | 27385.306250 |
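The same averages can be computed with groupby, which is interchangeable with pivot_table here; a sketch:

tr.groupby(['Year','Division'])[['secs']].mean()   # same Year/Division means as the pivot table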
We get a MultiIndex on the rows whenever we pivot on more than one index column.
avgtm.loc[('2023','M')]
secs    25676.268966
Name: (2023, M), dtype: float64
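Other cross-sections of the MultiIndex are available too; for example, .loc with just the first level, or .xs to fix one level. A sketch:

avgtm.loc['2023']   # both divisions in 2023
avgtm.xs('M', level = 'Division')   # the men's division in every year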
Sometimes you want to get rid of the MultiIndex. Do it with reset_index():
avgtm_nomulti = avgtm.reset_index()   # move Year and Division out of the index and back into columns
avgtm_nomulti
| | Year | Division | secs |
|---|---|---|---|
| 0 | 2021 | F | 29119.214286 |
| 1 | 2021 | M | 26696.552846 |
| 2 | 2022 | F | 28396.342105 |
| 3 | 2022 | M | 26423.602740 |
| 4 | 2023 | F | 28869.833333 |
| 5 | 2023 | M | 25676.268966 |
| 6 | 2024 | F | 29381.733333 |
| 7 | 2024 | M | 27385.306250 |
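Alternatively, groupby with as_index = False returns Year and Division as ordinary columns in the first place, so there is no MultiIndex to reset; a sketch:

tr.groupby(['Year','Division'], as_index = False)['secs'].mean()   # flat table, same numbers as avgtm_nomulti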
Now convert the numbers of seconds back to an HH:MM:SS representation.
import time
def secs2time(x):
    return time.strftime("%H:%M:%S", time.gmtime(x))   # interpret x as a number of seconds and format it as HH:MM:SS
secs2time(27903)
'07:45:03'
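One caveat: time.gmtime wraps around at 24 hours, so this trick works here only because every value is under a day. A sketch of a formatter built from divmod that does not wrap (secs2time_nowrap is just a name for this sketch):

def secs2time_nowrap(x):
    h, rem = divmod(int(x), 3600)   # whole hours and the remaining seconds
    m, s = divmod(rem, 60)          # whole minutes and leftover seconds
    return f'{h:02d}:{m:02d}:{s:02d}'
secs2time_nowrap(27903)   # '07:45:03', matching secs2time above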
avgtm_nomulti['Ave. Time'] = avgtm_nomulti['secs'].map(secs2time)
avgtm_nomulti.drop(['secs'],axis = 1) # axis = 1 for dropping a column
| | Year | Division | Ave. Time |
|---|---|---|---|
| 0 | 2021 | F | 08:05:19 |
| 1 | 2021 | M | 07:24:56 |
| 2 | 2022 | F | 07:53:16 |
| 3 | 2022 | M | 07:20:23 |
| 4 | 2023 | F | 08:01:09 |
| 5 | 2023 | M | 07:07:56 |
| 6 | 2024 | F | 08:09:41 |
| 7 | 2024 | M | 07:36:25 |
Instead of dropping the columns you don't want, you can name the columns you DO want:
avgtm_nomulti[['Year','Division','Ave. Time']]
| | Year | Division | Ave. Time |
|---|---|---|---|
| 0 | 2021 | F | 08:05:19 |
| 1 | 2021 | M | 07:24:56 |
| 2 | 2022 | F | 07:53:16 |
| 3 | 2022 | M | 07:20:23 |
| 4 | 2023 | F | 08:01:09 |
| 5 | 2023 | M | 07:07:56 |
| 6 | 2024 | F | 08:09:41 |
| 7 | 2024 | M | 07:36:25 |
Now say I want summaries of the runners' ages (the minimum, median, and maximum) for each year of the race.
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 | 18762 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 | 19314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 | 36748 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 | 36748 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 | 37083 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 | 37482 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 | 37484 |
741 rows × 12 columns
age = tr.pivot_table( values = 'Age', index = ['Year'], aggfunc = {'Age' : ['min','median','max']})
age
| Year | max | median | min |
|---|---|---|---|
| 2021 | 72 | 39.0 | 20 |
| 2022 | 72 | 39.0 | 20 |
| 2023 | 73 | 39.0 | 20 |
| 2024 | 78 | 39.0 | 19 |
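The same summaries can also be obtained from groupby with .agg; a sketch:

tr.groupby('Year')['Age'].agg(['min','median','max'])   # youngest, median, and oldest age by year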
Say you have your own custom aggregation function. How do you use it?
def p90(x):   # '90th percentile' performance: since a faster time is better, this is the 10th percentile of finishing times
    return np.quantile(x, 0.1)
def p75(x):   # '75th percentile' performance: the 25th percentile of finishing times
    return np.quantile(x, 0.25)
percentiles = tr.pivot_table(values = 'secs',
                             index = ['Year','Division'],
                             aggfunc = {'secs' : [('90%',p90), ('75%',p75)]})
percentiles[['75%','90%']].map(secs2time)
| Year | Division | 75% | 90% |
|---|---|---|---|
| 2021 | F | 07:25:50 | 06:25:55 |
| | M | 06:42:29 | 05:59:25 |
| 2022 | F | 07:11:55 | 06:43:28 |
| | M | 06:32:51 | 05:51:35 |
| 2023 | F | 07:31:28 | 06:30:28 |
| | M | 06:23:16 | 05:37:44 |
| 2024 | F | 07:25:11 | 06:55:23 |
| | M | 06:38:52 | 05:43:57 |
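For comparison, a sketch that skips the helper functions and asks groupby for the quantiles directly (the quantile becomes an extra index level):

tr.groupby(['Year','Division'])['secs'].quantile([0.25, 0.10]).map(secs2time)   # 25th and 10th percentiles of finishing time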
For each division, make a plot of the percentiles over time.
percentiles_M = percentiles.loc[(slice(None),'M'),]   # slice(None) keeps every Year; 'M' keeps only the men's rows
percentiles_F = percentiles.loc[(slice(None),'F'),]   # slice(None) keeps every Year; 'F' keeps only the women's rows
percentiles_M.plot()
percentiles_F.plot()
<Axes: xlabel='Year,Division'>
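The x-axis labels above are the full (Year, Division) pairs; a sketch that drops the now-constant Division level so the x-axis shows just the year (the titles are ours):

percentiles_M.droplevel('Division').plot(title = 'Men')
percentiles_F.droplevel('Division').plot(title = 'Women')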
Now pull out the overall top three finishers (the podium) from each year.
podium = tr['Place'] <= 3   # Boolean mask: True for the top three overall places
tr_podium = tr.loc[podium]   # keep only the rows where the mask is True
tr_podium
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 0 | 1 | James | Thompson | Asheville | NC | 38 | M | 1 | 4:38:01 | 93.55 | 2022 | 16681 |
| 1 | 2 | Lee | Jarvis | London | | 40 | M | 2 | 4:51:39 | 88.77 | 2022 | 17499 |
| 2 | 3 | Amelie | Roy | Kill devil hills | NC | 29 | F | 1 | 5:08:57 | 99.74 | 2022 | 18537 |
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 | 2023 | 16938 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 | 2023 | 17182 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 | 2023 | 17232 |
| 0 | 1 | Drew | Marshall | Charlotte | NC | 30 | M | 1 | 4:15:08 | 93.78 | 2024 | 15308 |
| 1 | 2 | Brent | Bookwalter | Montreat | NC | 40 | M | 2 | 4:18:06 | 99.61 | 2024 | 15486 |
| 2 | 3 | Devon | Brodmyer | Brevard | NC | 22 | M | 3 | 4:42:09 | 93.33 | 2024 | 16929 |
# use the .pivot method to put Place on the rows and spread the years across the columns
tr_podium.pivot(index = 'Place', values = 'Time', columns = 'Year')
| Place | 2021 | 2022 | 2023 | 2024 |
|---|---|---|---|---|
| 1 | 4:46:32 | 4:38:01 | 4:42:18 | 4:15:08 |
| 2 | 4:47:37 | 4:51:39 | 4:46:22 | 4:18:06 |
| 3 | 4:50:14 | 5:08:57 | 4:47:12 | 4:42:09 |
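Unlike .pivot_table, .pivot does no aggregating: it only reshapes, and it raises an error if any (index, column) pair occurs more than once. A sketch of the same reshape done with pivot_table instead, using 'first' as a do-nothing aggregator:

tr_podium.pivot_table(index = 'Place', values = 'Time', columns = 'Year', aggfunc = 'first')   # each cell has exactly one value, so 'first' just returns it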
You can pivot a table on multiple index columns:
div_podium = tr['DP'] <= 3   # top three within each division (DP is the place within the division)
tr_div_podium = tr.loc[div_podium]
tr_div_podium.pivot(index = ['Division','DP'], values = 'Time', columns = 'Year')
| Division | DP | 2021 | 2022 | 2023 | 2024 |
|---|---|---|---|---|---|
| F | 1 | 5:24:21 | 5:08:57 | 5:29:03 | 5:23:39 |
| | 2 | 5:45:59 | 6:23:25 | 5:30:29 | 5:51:21 |
| | 3 | 5:47:59 | 6:32:22 | 5:53:12 | 5:53:37 |
| M | 1 | 4:46:32 | 4:38:01 | 4:42:18 | 4:15:08 |
| | 2 | 4:47:37 | 4:51:39 | 4:46:22 | 4:18:06 |
| | 3 | 4:50:14 | 5:12:04 | 4:47:12 | 4:42:09 |
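Going the other way, .melt unpivots a wide table back into long form; a sketch applied to the podium table above (wide is just a name for this sketch):

wide = tr_podium.pivot(index = 'Place', values = 'Time', columns = 'Year')
wide.reset_index().melt(id_vars = 'Place', var_name = 'Year', value_name = 'Time')   # back to one row per (Place, Year) pair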