In [1]:
import pandas as pd
# windows: 'C:\\Users\\karlgregory\\Desktop\\stat540_data\\'
dr = '/Users/karlgregory/Desktop/stat540_data/'
filename = 'safari_comma_missing.dat'
file = open(dr + filename) # create connection to a file
contents = file.read()
print(contents)
file.close
Some make-believe safari data Woohoo!! date,wildebeest,laughing hyena,crocodile,weather,start,end,fun,guide 1/13/1999,12,none,2,sunny,7:21 am,4:14 pm,yes,Joshua Tebbs 4/28/2001,3,1,1,cloudy,6:25 am,12:33 pm,Y,Edsel Peña 10/15/2010,3,.,6,rainy,8:12 am,,no,Karl Bruce Gregory 3/02/2006,1,14,5,hot/sunny,7:15 am,3:12 pm,y,Lianming Wang 2/28/1988,2,6,3,partly cloudy,4:53 am,2:16 pm,Yes,Brian Habing 7/14/2015,3,12,0,cloudy,5:47 am,3:46 pm,No,Edwards
Out[1]:
<function TextIOWrapper.close()>
In [2]:
safari = pd.read_table(dr + filename,sep=',',skiprows = 3, na_values = ['.',' '])
safari
Out[2]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards |
In [3]:
type(safari)
Out[3]:
pandas.core.frame.DataFrame
In [4]:
print(safari['wildebeest'])
print(safari.wildebeest) # columns of Pandas data frames are called Series.
0 12 1 3 2 3 3 1 4 2 5 3 Name: wildebeest, dtype: int64 0 12 1 3 2 3 3 1 4 2 5 3 Name: wildebeest, dtype: int64
Reading in fixed width format data
In [5]:
filename = 'safari_fwf_missing.dat'
pd.read_fwf(dr + filename,
skiprows = 4,
widths = [11,3,5,3,14,9,9,4,18],
header = None,
names = ['Date','W','LH','Croc','Weather','start','end','fun','guide'])
Out[5]:
| Date | W | LH | Croc | Weather | start | end | fun | guide | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2.0 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs |
| 1 | 4/28/2001 | 3 | 1 | 1.0 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña |
| 2 | 10/15/2010 | 3 | 6 | NaN | rainy | 8:12 am | NaN | no | Karl Bruce Gregory |
| 3 | 3/02/2006 | 1 | 14 | 5.0 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang |
| 4 | 2/28/1988 | 2 | 6 | 3.0 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing |
| 5 | 7/14/2015 | 3 | 12 | 0.0 | cloudy | 5:47 am | 3:46 pm | No | Edwards |
Dates in Python¶
In [6]:
import datetime
now = datetime.datetime.today()
print(now)
type(now)
print(now.year)
print(now.month)
print(now.microsecond)
2025-10-28 14:41:55.887099 2025 10 887099
Two nice function are these:
In [7]:
viva = datetime.datetime.strptime('11/22/2015 02:23', '%m/%d/%Y %H:%M' ) # converting a character string to a real date time value
print(viva)
viva_reformat = datetime.datetime.strftime(viva,'%B %d Anno Domini %Y, at %H:%M')
print(viva_reformat)
2015-11-22 02:23:00 November 22 Anno Domini 2015, at 02:23
What about converting strings to dates in a Pandas data frame???
In [8]:
dates = pd.to_datetime(safari.date,format = '%m/%d/%Y')
print(dates)
safari['newdate'] = dates
safari
0 1999-01-13 1 2001-04-28 2 2010-10-15 3 2006-03-02 4 1988-02-28 5 2015-07-14 Name: date, dtype: datetime64[ns]
Out[8]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 |
In [9]:
weekday = safari.newdate.dt.weekday
safari['weekday'] = weekday # make a new column of the safari data set
safari
Out[9]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 |
Working with text in Python¶
In [10]:
import re # re stands for regular expression
ch = 'Karl Bruce Gregory'
print(re.sub('e','*',ch)) # replace a substring with another substring
print(re.sub('[aeiou]','*',ch)) # replace any vowels with stars
print(re.sub('[a-z]','-',ch))
print(re.search('go',ch)) # find location of a substring (first match)
print(re.findall('e',ch))
print(re.findall('[A-Z]',ch))
print(re.split(' ',ch))
Karl Bruc* Gr*gory K*rl Br*c* Gr*g*ry K--- B---- G------ <re.Match object; span=(14, 16), match='go'> ['e', 'e'] ['K', 'B', 'G'] ['Karl', 'Bruce', 'Gregory']
In [11]:
init = re.findall('[A-Z]',ch)
'.'.join(init) + '.' # collapse the init list with '.' between
Out[11]:
'K.B.G.'
In [12]:
ch1 = 'Matthew Frederick Thaddeus Bailey' # convert to M. F. T. Bailey
ch2 = 'Karl Gregory' # want to say K. Gregory
In [13]:
def nameabb(ch):
full = re.split(' ',ch)
n = len(full)
if(n == 1):
abb = ch
else:
abb = ''
for i in range(n-1): abb += full[i][0] + '. '
abb += full[n-1]
return abb
In [14]:
print(nameabb('Matthew Frederick Thaddeus Bailey'))
print(nameabb('Karl Gregory'))
print(nameabb('Karl'))
M. F. T. Bailey K. Gregory Karl
Let's abbreviate the names in the data set:
In [15]:
safari['name_abb'] = safari['guide'].map(nameabb)
safari
Out[15]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards |
Use a dictionary to make a "mapping" of the values in the fun column to standardized values.
In [16]:
fundict = {'Yes': True, 'yes': True, 'y': True, 'Y' : True, 'no' : False, 'No' : False}
safari['funTF'] = safari.fun.map(fundict)
safari
Out[16]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | funTF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs | True |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña | True |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory | False |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang | True |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing | True |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards | False |
Accessing entries of a Pandas data frame.
In [17]:
safari
Out[17]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | funTF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | none | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs | True |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña | True |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory | False |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang | True |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing | True |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards | False |
In [18]:
pos = safari['laughing hyena'] == 'none'
print(pos)
0 True 1 False 2 False 3 False 4 False 5 False Name: laughing hyena, dtype: bool
In [19]:
safari.loc[pos,'laughing hyena'] = 0 # instead of using square brackets
safari
Out[19]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | funTF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1/13/1999 | 12 | 0 | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs | True |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña | True |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory | False |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang | True |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing | True |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards | False |
How to sort a Pandas dataframe.
In [20]:
safari.sort_values(by = 'newdate')
Out[20]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | funTF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing | True |
| 0 | 1/13/1999 | 12 | 0 | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs | True |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña | True |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang | True |
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory | False |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards | False |
In [21]:
safari.sort_values(by = ['funTF','newdate'], inplace = True) # add inplace = True to overwrite the table
safari
Out[21]:
| date | wildebeest | laughing hyena | crocodile | weather | start | end | fun | guide | newdate | weekday | name_abb | funTF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 10/15/2010 | 3 | NaN | 6 | rainy | 8:12 am | NaN | no | Karl Bruce Gregory | 2010-10-15 | 4 | K. B. Gregory | False |
| 5 | 7/14/2015 | 3 | 12 | 0 | cloudy | 5:47 am | 3:46 pm | No | Edwards | 2015-07-14 | 1 | Edwards | False |
| 4 | 2/28/1988 | 2 | 6 | 3 | partly cloudy | 4:53 am | 2:16 pm | Yes | Brian Habing | 1988-02-28 | 6 | B. Habing | True |
| 0 | 1/13/1999 | 12 | 0 | 2 | sunny | 7:21 am | 4:14 pm | yes | Joshua Tebbs | 1999-01-13 | 2 | J. Tebbs | True |
| 1 | 4/28/2001 | 3 | 1 | 1 | cloudy | 6:25 am | 12:33 pm | Y | Edsel Peña | 2001-04-28 | 5 | E. Peña | True |
| 3 | 3/02/2006 | 1 | 14 | 5 | hot/sunny | 7:15 am | 3:12 pm | y | Lianming Wang | 2006-03-02 | 3 | L. Wang | True |