In [1]:
import pandas as pd

# windows: 'C:\\Users\\karlgregory\\Desktop\\stat540_data\\'

dr = '/Users/karlgregory/Desktop/stat540_data/'
filename = 'safari_comma_missing.dat'

file = open(dr + filename) # create connection to a file
contents = file.read()
print(contents)
file.close
Some make-believe safari data
Woohoo!!

date,wildebeest,laughing hyena,crocodile,weather,start,end,fun,guide
1/13/1999,12,none,2,sunny,7:21 am,4:14 pm,yes,Joshua Tebbs
4/28/2001,3,1,1,cloudy,6:25 am,12:33 pm,Y,Edsel Peña
10/15/2010,3,.,6,rainy,8:12 am,,no,Karl Bruce Gregory
3/02/2006,1,14,5,hot/sunny,7:15 am,3:12 pm,y,Lianming Wang
2/28/1988,2,6,3,partly cloudy,4:53 am,2:16 pm,Yes,Brian Habing
7/14/2015,3,12,0,cloudy,5:47 am,3:46 pm,No,Edwards

Out[1]:
<function TextIOWrapper.close()>
In [2]:
safari = pd.read_table(dr + filename,sep=',',skiprows = 3, na_values = ['.',' '])
safari
Out[2]:
date wildebeest laughing hyena crocodile weather start end fun guide
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards
In [3]:
type(safari)
Out[3]:
pandas.core.frame.DataFrame
In [4]:
print(safari['wildebeest'])
print(safari.wildebeest)  # columns of Pandas data frames are called Series. 
0    12
1     3
2     3
3     1
4     2
5     3
Name: wildebeest, dtype: int64
0    12
1     3
2     3
3     1
4     2
5     3
Name: wildebeest, dtype: int64

Reading in fixed width format data

In [5]:
filename = 'safari_fwf_missing.dat'

pd.read_fwf(dr + filename,
            skiprows = 4,
            widths = [11,3,5,3,14,9,9,4,18],
            header = None,
            names = ['Date','W','LH','Croc','Weather','start','end','fun','guide'])
Out[5]:
Date W LH Croc Weather start end fun guide
0 1/13/1999 12 none 2.0 sunny 7:21 am 4:14 pm yes Joshua Tebbs
1 4/28/2001 3 1 1.0 cloudy 6:25 am 12:33 pm Y Edsel Peña
2 10/15/2010 3 6 NaN rainy 8:12 am NaN no Karl Bruce Gregory
3 3/02/2006 1 14 5.0 hot/sunny 7:15 am 3:12 pm y Lianming Wang
4 2/28/1988 2 6 3.0 partly cloudy 4:53 am 2:16 pm Yes Brian Habing
5 7/14/2015 3 12 0.0 cloudy 5:47 am 3:46 pm No Edwards

Dates in Python¶

In [6]:
import datetime

now = datetime.datetime.today()
print(now)

type(now)
print(now.year)
print(now.month)
print(now.microsecond)
2025-10-28 14:41:55.887099
2025
10
887099

Two nice function are these:

In [7]:
viva = datetime.datetime.strptime('11/22/2015 02:23', '%m/%d/%Y %H:%M'  )  # converting a character string to a real date time value
print(viva)

viva_reformat = datetime.datetime.strftime(viva,'%B %d Anno Domini %Y, at %H:%M')
print(viva_reformat)
2015-11-22 02:23:00
November 22 Anno Domini 2015, at 02:23

What about converting strings to dates in a Pandas data frame???

In [8]:
dates = pd.to_datetime(safari.date,format = '%m/%d/%Y')
print(dates)
safari['newdate'] = dates
safari
0   1999-01-13
1   2001-04-28
2   2010-10-15
3   2006-03-02
4   1988-02-28
5   2015-07-14
Name: date, dtype: datetime64[ns]
Out[8]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14
In [9]:
weekday = safari.newdate.dt.weekday
safari['weekday'] = weekday  # make a new column of the safari data set
safari
Out[9]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1

Working with text in Python¶

In [10]:
import re # re stands for regular expression

ch = 'Karl Bruce Gregory'

print(re.sub('e','*',ch))  # replace a substring with another substring
print(re.sub('[aeiou]','*',ch)) # replace any vowels with stars
print(re.sub('[a-z]','-',ch))

print(re.search('go',ch)) # find location of a substring (first match)
print(re.findall('e',ch))
print(re.findall('[A-Z]',ch))

print(re.split(' ',ch))
Karl Bruc* Gr*gory
K*rl Br*c* Gr*g*ry
K--- B---- G------
<re.Match object; span=(14, 16), match='go'>
['e', 'e']
['K', 'B', 'G']
['Karl', 'Bruce', 'Gregory']
In [11]:
init = re.findall('[A-Z]',ch)
'.'.join(init) + '.' # collapse the init list with '.' between
Out[11]:
'K.B.G.'
In [12]:
ch1 = 'Matthew Frederick Thaddeus Bailey' # convert to M. F. T. Bailey
ch2 = 'Karl Gregory' # want to say K. Gregory
In [13]:
def nameabb(ch):
    
    full =   re.split(' ',ch) 
    n = len(full)
    
    if(n == 1):
        
        abb = ch
        
    else:
    
        abb = ''
        
        for i in range(n-1): abb += full[i][0] + '. '
            
        abb += full[n-1]
        
    return abb
In [14]:
print(nameabb('Matthew Frederick Thaddeus Bailey'))
print(nameabb('Karl Gregory'))
print(nameabb('Karl'))
M. F. T. Bailey
K. Gregory
Karl

Let's abbreviate the names in the data set:

In [15]:
safari['name_abb'] = safari['guide'].map(nameabb)
safari
Out[15]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards

Use a dictionary to make a "mapping" of the values in the fun column to standardized values.

In [16]:
fundict = {'Yes': True, 'yes': True, 'y': True, 'Y' : True, 'no' : False, 'No' : False}
safari['funTF'] = safari.fun.map(fundict)
safari
Out[16]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb funTF
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs True
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña True
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory False
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang True
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing True
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards False

Accessing entries of a Pandas data frame.

In [17]:
safari
Out[17]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb funTF
0 1/13/1999 12 none 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs True
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña True
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory False
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang True
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing True
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards False
In [18]:
pos = safari['laughing hyena'] == 'none'
print(pos)
0     True
1    False
2    False
3    False
4    False
5    False
Name: laughing hyena, dtype: bool
In [19]:
safari.loc[pos,'laughing hyena'] = 0 # instead of using square brackets
safari
Out[19]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb funTF
0 1/13/1999 12 0 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs True
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña True
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory False
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang True
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing True
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards False

How to sort a Pandas dataframe.

In [20]:
safari.sort_values(by = 'newdate')
Out[20]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb funTF
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing True
0 1/13/1999 12 0 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs True
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña True
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang True
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory False
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards False
In [21]:
safari.sort_values(by = ['funTF','newdate'], inplace = True) # add inplace = True to overwrite the table
safari
Out[21]:
date wildebeest laughing hyena crocodile weather start end fun guide newdate weekday name_abb funTF
2 10/15/2010 3 NaN 6 rainy 8:12 am NaN no Karl Bruce Gregory 2010-10-15 4 K. B. Gregory False
5 7/14/2015 3 12 0 cloudy 5:47 am 3:46 pm No Edwards 2015-07-14 1 Edwards False
4 2/28/1988 2 6 3 partly cloudy 4:53 am 2:16 pm Yes Brian Habing 1988-02-28 6 B. Habing True
0 1/13/1999 12 0 2 sunny 7:21 am 4:14 pm yes Joshua Tebbs 1999-01-13 2 J. Tebbs True
1 4/28/2001 3 1 1 cloudy 6:25 am 12:33 pm Y Edsel Peña 2001-04-28 5 E. Peña True
3 3/02/2006 1 14 5 hot/sunny 7:15 am 3:12 pm y Lianming Wang 2006-03-02 3 L. Wang True