import pandas as pd
import numpy as np
dr = '/Users/karlgregory/Desktop/stat540_data/'
trcols = ['NA','Place','First','Last','City','State','Age','Division','DP','Time','Rank']   # column names for the race files
tr23 = pd.read_table(dr + 'tr50k_2023.txt', header = None, names = trcols, usecols = range(1,len(trcols)))   # usecols drops the first column (the placeholder 'NA')
tr24 = pd.read_table(dr + 'tr50k_2024.txt', header = None, names = trcols, usecols = range(1,len(trcols)))
tr23
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 |
| 3 | 4 | Alex | Black | Marshall | NC | 33 | M | 4 | 4:56:19 | 77.94 |
| 4 | 5 | Mark | Rebholz | Charlotte | NC | 33 | M | 5 | 4:56:44 | 88.10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 182 | 183 | Angie | Cato | Wilson | NC | 47 | F | 40 | 9:41:57 | 58.85 |
| 183 | 184 | Irina | Kalashnik | Matthews | NC | 41 | F | 41 | 9:48:40 | 64.52 |
| 184 | 185 | Jay | Weatherington | Chapel Hill | NC | 33 | M | 144 | 9:48:48 | 47.95 |
| 185 | 186 | Rick | Gray | Johnson City | TN | 62 | M | 145 | 9:58:03 | 64.00 |
| 186 | 187 | Emily | Floess | Raleigh | NC | 35 | F | 42 | 10:30:53 | 62.58 |
187 rows × 10 columns
Merging data sets¶
Let's build a data set containing the runners who participated in both years. Keeping only the rows that match in both data sets is an "inner join".
tr2324 = pd.merge(tr23, tr24, on = ['First','Last'], suffixes = [' 2023',' 2024'])   # match runners on first and last name
tr2324 = tr2324[['First','Last','Time 2023','Time 2024']]   # keep just the names and the two years' times
tr2324
| | First | Last | Time 2023 | Time 2024 |
|---|---|---|---|---|
| 0 | Adam | Corey | 5:29:05 | 5:31:55 |
| 1 | Patrick | Donoghue | 5:50:24 | 5:55:23 |
| 2 | Travis | Pace | 5:53:17 | 5:46:06 |
| 3 | David | Nielsen | 5:55:03 | 6:01:44 |
| 4 | Matthew | Steed | 5:57:55 | 5:41:34 |
| 5 | Michelle | Gray | 6:17:49 | 6:48:08 |
| 6 | Andrew | Hall | 6:29:50 | 6:25:33 |
| 7 | Bryson | Peter | 6:31:10 | 6:44:51 |
| 8 | Tyson | Jeffus | 6:32:04 | 6:07:53 |
| 9 | Daniel | Merlin | 6:33:04 | 5:42:52 |
| 10 | Hunter | Steed | 6:35:50 | 6:52:47 |
| 11 | Ivan | Ng | 6:36:15 | 7:13:36 |
| 12 | Jake | Hinshaw | 6:39:17 | 6:25:35 |
| 13 | Ryan | Ford | 6:50:58 | 7:54:12 |
| 14 | Timothy | Newell | 7:20:10 | 6:55:19 |
| 15 | Jared | Wingler | 7:29:25 | 7:27:15 |
| 16 | Liana | Hastings | 7:33:00 | 8:41:13 |
| 17 | Kati | Cushman | 7:55:59 | 6:59:52 |
| 18 | Daniel | Wells | 7:57:52 | 8:35:15 |
| 19 | Spencer | Wells | 7:57:55 | 8:38:07 |
| 20 | Rio | Kira | 8:01:30 | 8:13:46 |
| 21 | Amy | McDowell | 9:15:15 | 9:44:09 |
| 22 | Rick | Gray | 9:58:03 | 10:18:03 |
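As an aside, pd.merge does an inner join by default; passing how = 'outer' would also keep runners who ran in only one of the two years, and indicator = True adds a '_merge' column recording which data set each row came from. A minimal sketch (the name tr_any is just for this sketch):

tr_any = pd.merge(tr23, tr24, on = ['First','Last'], suffixes = [' 2023',' 2024'], how = 'outer', indicator = True)
tr_any[['First','Last','Time 2023','Time 2024','_merge']]   # '_merge' is 'left_only', 'right_only', or 'both'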
Concatenating or stacking data sets¶
Putting data sets one on top of the other; the columns are matched by name.
# stack two data sets:
pd.concat([tr23,tr24])
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 |
| 3 | 4 | Alex | Black | Marshall | NC | 33 | M | 4 | 4:56:19 | 77.94 |
| 4 | 5 | Mark | Rebholz | Charlotte | NC | 33 | M | 5 | 4:56:44 | 88.10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 |
392 rows × 10 columns
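Note that the stacked result keeps each data set's original row index, which is why the index above runs up to 186 and then up to 204 again. Two sketches of common fixes:

pd.concat([tr23, tr24], ignore_index = True)   # build a fresh 0, 1, 2, ... index for the stacked result
pd.concat([tr23, tr24], keys = ['2023','2024'])   # or label each block, giving a MultiIndex that records the source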
# run a loop and stack many data sets
li = list()   # create an empty list. We want to put data sets into it.
for i in range(21,25):
    file = dr + 'tr50k_20' + str(i) + '.txt'   # construct the current file name
    tr_yr = pd.read_table(file, header = None, names = trcols, usecols = range(1,len(trcols)))   # read the data set
    tr_yr['Year'] = '20' + str(i)   # make a new column containing the year
    li.append(tr_yr)
tr = pd.concat(li)   # stack all four years
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 |
741 rows × 11 columns
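An alternative sketch: rather than constructing the file names by hand, you can let the glob module find every file matching the pattern (this assumes the directory contains exactly the tr50k files you want). It builds the same tr as above.

import glob
li = list()
for file in sorted(glob.glob(dr + 'tr50k_*.txt')):   # all matching files, in name order
    tr_yr = pd.read_table(file, header = None, names = trcols, usecols = range(1,len(trcols)))
    tr_yr['Year'] = file[-8:-4]   # the four characters before '.txt' are the year
    li.append(tr_yr)
tr = pd.concat(li)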
Aggregating data¶
Suppose we want to get the average finishing time in each division for each year of the race.
First, convert the hh:mm:ss character strings in the Time column to numbers of seconds.
def secs(ch):
    ch = ch.replace('`','')   # replace tick mark (indicating a tie) with nothing
    h, m, s = ch.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)
secs('7:05:21`')
25521
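An equivalent sketch using pandas' own parser: pd.to_timedelta understands hh:mm:ss strings, so after stripping the tick marks a whole column can be converted at once.

pd.to_timedelta(tr['Time'].str.replace('`','')).dt.total_seconds()   # same values as tr['Time'].map(secs), but as floats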
tr['secs'] = tr['Time'].map(secs)
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 | 18762 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 | 19314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 | 36748 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 | 36748 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 | 37083 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 | 37482 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 | 37484 |
741 rows × 12 columns
avgtm = tr.pivot_table(values = 'secs', index = ['Year','Division'], aggfunc = {'secs' : 'mean'})
avgtm
| Year | Division | secs |
|---|---|---|
| 2021 | F | 29119.214286 |
| | M | 26696.552846 |
| 2022 | F | 28396.342105 |
| | M | 26423.602740 |
| 2023 | F | 28869.833333 |
| | M | 25676.268966 |
| 2024 | F | 29381.733333 |
| | M | 27385.306250 |
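The same averages can be computed with groupby, which is interchangeable with pivot_table here; a sketch:

tr.groupby(['Year','Division'])[['secs']].mean()   # same Year/Division means as the pivot table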
We get a MultiIndex on the rows whenever we pivot on more than one index column.
avgtm.loc[('2023','M')]
secs    25676.268966
Name: (2023, M), dtype: float64
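Other cross-sections of the MultiIndex are available too; for example, .loc with just the first level, or .xs to fix one level. A sketch:

avgtm.loc['2023']   # both divisions in 2023
avgtm.xs('M', level = 'Division')   # the men's division in every year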
Sometimes you want to get rid of the MultiIndex. Do it with reset_index():
avgtm_nomulti = avgtm.reset_index()   # move Year and Division out of the index and back into columns
avgtm_nomulti
| | Year | Division | secs |
|---|---|---|---|
| 0 | 2021 | F | 29119.214286 |
| 1 | 2021 | M | 26696.552846 |
| 2 | 2022 | F | 28396.342105 |
| 3 | 2022 | M | 26423.602740 |
| 4 | 2023 | F | 28869.833333 |
| 5 | 2023 | M | 25676.268966 |
| 6 | 2024 | F | 29381.733333 |
| 7 | 2024 | M | 27385.306250 |
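Alternatively, groupby with as_index = False returns Year and Division as ordinary columns in the first place, so there is no MultiIndex to reset; a sketch:

tr.groupby(['Year','Division'], as_index = False)['secs'].mean()   # flat table, same numbers as avgtm_nomulti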
Now convert the numbers of seconds back to an HH:MM:SS representation.
import time
def secs2time(x):
    return time.strftime("%H:%M:%S", time.gmtime(x))   # interpret x as a number of seconds and format it as HH:MM:SS
secs2time(27903)
'07:45:03'
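One caveat: time.gmtime wraps around at 24 hours, so this trick works here only because every value is under a day. A sketch of a formatter built from divmod that does not wrap (secs2time_nowrap is just a name for this sketch):

def secs2time_nowrap(x):
    h, rem = divmod(int(x), 3600)   # whole hours and the remaining seconds
    m, s = divmod(rem, 60)          # whole minutes and leftover seconds
    return f'{h:02d}:{m:02d}:{s:02d}'
secs2time_nowrap(27903)   # '07:45:03', matching secs2time above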
avgtm_nomulti['Ave. Time'] = avgtm_nomulti['secs'].map(secs2time)
avgtm_nomulti.drop(['secs'],axis = 1) # axis = 1 for dropping a column
| | Year | Division | Ave. Time |
|---|---|---|---|
| 0 | 2021 | F | 08:05:19 |
| 1 | 2021 | M | 07:24:56 |
| 2 | 2022 | F | 07:53:16 |
| 3 | 2022 | M | 07:20:23 |
| 4 | 2023 | F | 08:01:09 |
| 5 | 2023 | M | 07:07:56 |
| 6 | 2024 | F | 08:09:41 |
| 7 | 2024 | M | 07:36:25 |
Instead of dropping the columns you don't want, you can name the columns you DO want:
avgtm_nomulti[['Year','Division','Ave. Time']]
| | Year | Division | Ave. Time |
|---|---|---|---|
| 0 | 2021 | F | 08:05:19 |
| 1 | 2021 | M | 07:24:56 |
| 2 | 2022 | F | 07:53:16 |
| 3 | 2022 | M | 07:20:23 |
| 4 | 2023 | F | 08:01:09 |
| 5 | 2023 | M | 07:07:56 |
| 6 | 2024 | F | 08:09:41 |
| 7 | 2024 | M | 07:36:25 |
Now say I want summaries of the runners' ages (the minimum, median, and maximum) for each year of the race.
tr
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 3 | 4 | Garrett | Murgatroyd | McHenry | IL | 34 | M | 4 | 5:12:42 | 78.97 | 2021 | 18762 |
| 4 | 5 | Jeremy | Gardner | Baltimore | MD | 43 | M | 5 | 5:21:54 | 89.44 | 2021 | 19314 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 200 | 201 | Jessica | Jasinski | LINCOLNTON | NC | 35 | F | 44 | 10:12:28 | 62.33 | 2024 | 36748 |
| 201 | 202 | Matt | Kirchner | Atlanta | GA | 41 | M | 158 | 10:12:28 | 51.43 | 2024 | 36748 |
| 202 | 203 | Rick | Gray | Johnson City | TN | 63 | M | 159 | 10:18:03 | 64.00 | 2024 | 37083 |
| 203 | 204 | Kimber | Jones | Mars Hill | NC | 33 | F | 45 | 10:24:42 | 56.73 | 2024 | 37482 |
| 204 | 205 | Brandon | King | Weaverville | NC | 36 | M | 160 | 10:24:44 | 68.37 | 2024 | 37484 |
741 rows × 12 columns
age = tr.pivot_table( values = 'Age', index = ['Year'], aggfunc = {'Age' : ['min','median','max']})
age
| Year | max | median | min |
|---|---|---|---|
| 2021 | 72 | 39.0 | 20 |
| 2022 | 72 | 39.0 | 20 |
| 2023 | 73 | 39.0 | 20 |
| 2024 | 78 | 39.0 | 19 |
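The same summaries can also be obtained from groupby with .agg; a sketch:

tr.groupby('Year')['Age'].agg(['min','median','max'])   # youngest, median, and oldest age by year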
Say you have your own custom aggregation function. How do you use it?
def p90(x):   # '90th percentile' performance: since a faster time is better, this is the 10th percentile of finishing times
    return np.quantile(x, 0.1)
def p75(x):   # '75th percentile' performance: the 25th percentile of finishing times
    return np.quantile(x, 0.25)
percentiles = tr.pivot_table(values = 'secs',
                             index = ['Year','Division'],
                             aggfunc = {'secs' : [('90%',p90), ('75%',p75)]})
percentiles[['75%','90%']].map(secs2time)
| Year | Division | 75% | 90% |
|---|---|---|---|
| 2021 | F | 07:25:50 | 06:25:55 |
| | M | 06:42:29 | 05:59:25 |
| 2022 | F | 07:11:55 | 06:43:28 |
| | M | 06:32:51 | 05:51:35 |
| 2023 | F | 07:31:28 | 06:30:28 |
| | M | 06:23:16 | 05:37:44 |
| 2024 | F | 07:25:11 | 06:55:23 |
| | M | 06:38:52 | 05:43:57 |
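For comparison, a sketch that skips the helper functions and asks groupby for the quantiles directly (the quantile becomes an extra index level):

tr.groupby(['Year','Division'])['secs'].quantile([0.25, 0.10]).map(secs2time)   # 25th and 10th percentiles of finishing time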
For each division, make a plot of the percentiles over time.
percentiles_M = percentiles.loc[(slice(None),'M'),]   # slice(None) keeps every Year; 'M' keeps only the men's rows
percentiles_F = percentiles.loc[(slice(None),'F'),]   # slice(None) keeps every Year; 'F' keeps only the women's rows
percentiles_M.plot()
percentiles_F.plot()
<Axes: xlabel='Year,Division'>
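The x-axis labels above are the full (Year, Division) pairs; a sketch that drops the now-constant Division level so the x-axis shows just the year (the titles are ours):

percentiles_M.droplevel('Division').plot(title = 'Men')
percentiles_F.droplevel('Division').plot(title = 'Women')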
Now pull out the overall top three finishers (the podium) from each year.
podium = tr['Place'] <= 3   # Boolean mask: True for the top three overall places
tr_podium = tr.loc[podium]   # keep only the rows where the mask is True
tr_podium
| | Place | First | Last | City | State | Age | Division | DP | Time | Rank | Year | secs |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Peter | Schlachte | Washington | DC | 24 | M | 1 | 4:46:32 | 82.88 | 2021 | 17192 |
| 1 | 2 | Randy | Elliott | Brevard | NC | 37 | M | 2 | 4:47:37 | 82.29 | 2021 | 17257 |
| 2 | 3 | Anthony | Fagundes | Fair Oaks | CA | 31 | M | 3 | 4:50:14 | 95.08 | 2021 | 17414 |
| 0 | 1 | James | Thompson | Asheville | NC | 38 | M | 1 | 4:38:01 | 93.55 | 2022 | 16681 |
| 1 | 2 | Lee | Jarvis | London | | 40 | M | 2 | 4:51:39 | 88.77 | 2022 | 17499 |
| 2 | 3 | Amelie | Roy | Kill devil hills | NC | 29 | F | 1 | 5:08:57 | 99.74 | 2022 | 18537 |
| 0 | 1 | James | Thompson | Asheville | NC | 39 | M | 1 | 4:42:18 | 93.55 | 2023 | 16938 |
| 1 | 2 | Robbie | Harms | Fletcher | NC | 30 | M | 2 | 4:46:22 | 87.05 | 2023 | 17182 |
| 2 | 3 | Chris | Ingram | Charlotte | NC | 43 | M | 3 | 4:47:12 | 81.69 | 2023 | 17232 |
| 0 | 1 | Drew | Marshall | Charlotte | NC | 30 | M | 1 | 4:15:08 | 93.78 | 2024 | 15308 |
| 1 | 2 | Brent | Bookwalter | Montreat | NC | 40 | M | 2 | 4:18:06 | 99.61 | 2024 | 15486 |
| 2 | 3 | Devon | Brodmyer | Brevard | NC | 22 | M | 3 | 4:42:09 | 93.33 | 2024 | 16929 |
# use the .pivot method to put Place on the rows and spread the years across the columns
tr_podium.pivot(index = 'Place', values = 'Time', columns = 'Year')
| Place | 2021 | 2022 | 2023 | 2024 |
|---|---|---|---|---|
| 1 | 4:46:32 | 4:38:01 | 4:42:18 | 4:15:08 |
| 2 | 4:47:37 | 4:51:39 | 4:46:22 | 4:18:06 |
| 3 | 4:50:14 | 5:08:57 | 4:47:12 | 4:42:09 |
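Unlike .pivot_table, .pivot does no aggregating: it only reshapes, and it raises an error if any (index, column) pair occurs more than once. A sketch of the same reshape done with pivot_table instead, using 'first' as a do-nothing aggregator:

tr_podium.pivot_table(index = 'Place', values = 'Time', columns = 'Year', aggfunc = 'first')   # each cell has exactly one value, so 'first' just returns it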
You can pivot a table on multiple index columns:
div_podium = tr['DP'] <= 3   # top three within each division (DP is the place within the division)
tr_div_podium = tr.loc[div_podium]
tr_div_podium.pivot(index = ['Division','DP'], values = 'Time', columns = 'Year')
| Division | DP | 2021 | 2022 | 2023 | 2024 |
|---|---|---|---|---|---|
| F | 1 | 5:24:21 | 5:08:57 | 5:29:03 | 5:23:39 |
| | 2 | 5:45:59 | 6:23:25 | 5:30:29 | 5:51:21 |
| | 3 | 5:47:59 | 6:32:22 | 5:53:12 | 5:53:37 |
| M | 1 | 4:46:32 | 4:38:01 | 4:42:18 | 4:15:08 |
| | 2 | 4:47:37 | 4:51:39 | 4:46:22 | 4:18:06 |
| | 3 | 4:50:14 | 5:12:04 | 4:47:12 | 4:42:09 |
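Going the other way, .melt unpivots a wide table back into long form; a sketch applied to the podium table above (wide is just a name for this sketch):

wide = tr_podium.pivot(index = 'Place', values = 'Time', columns = 'Year')
wide.reset_index().melt(id_vars = 'Place', var_name = 'Year', value_name = 'Time')   # back to one row per (Place, Year) pair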