#================================================#
# Machine Learning Lecture 3 in Python
# Author: Chong Ma
# Date : June 26, 2017
#================================================#
#================================================#
# import Python library (just like library in R)
# that will be used in this lecture
#================================================#
# update jupyter notebook: pip install -U jupyter
import numpy as np
import pandas as pd
# pandas.tools was removed in pandas 0.25; scatter_matrix now lives in pandas.plotting
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# one way to load iris data set
from sklearn import datasets
# Load iris and assemble one DataFrame with R-style column names.
iris = datasets.load_iris()
iris_data = pd.DataFrame(iris.data)
# Map each numeric target back to its species name rather than assuming
# the classes appear as three contiguous runs of 50 rows.
iris_class = pd.DataFrame(iris.target_names[iris.target])
dataset = pd.concat([iris_data, iris_class], axis=1)
dataset.columns = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"]
# Another way to load iris data set
# dataset = pd.read_csv("iris.csv", index_col=False,mangle_dupe_cols=True)
# Print the first and last 5 subjects: a bare tuple expression only displays
# in a notebook; when run as a script its value is silently discarded.
print(dataset.head(5))
print(dataset.tail(5))
# Box-and-whisker plot per feature. DataFrame.plot creates its own figure,
# as do DataFrame.hist and scatter_matrix below, so no plt.figure() calls
# are needed — the originals opened blank extra windows.
dataset.plot(kind='box', subplots=True, layout=(2, 2),
             figsize=(8, 8),
             sharex=False, sharey=False)
plt.show()
# Histogram of each feature.
dataset.hist(figsize=(8, 8))
plt.show()
# Scatter plot matrix of all pairwise feature combinations.
scatter_matrix(dataset, figsize=(10, 10))
plt.show()
# Hold out 20% of the rows as a validation set; the remaining 80% is used
# for model selection via cross-validation below.
array = dataset.values
X = array[:, :4]   # the four measurement columns
Y = array[:, 4]    # the species label
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot-check a suite of classifiers with 10-fold cross-validation.
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    # shuffle=True is required alongside random_state: scikit-learn >= 0.24
    # raises ValueError for a seeded but unshuffled KFold, and without
    # shuffling the seed had no effect anyway.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Per-model (mean, std) summary; print it explicitly — a bare expression
# is discarded outside a notebook.
print(pd.DataFrame(results).apply(lambda x: (np.mean(x), np.sqrt(np.var(x))), axis=1))
# Compare the algorithms visually: one box-and-whisker per model, drawn
# from its ten cross-validation accuracy scores.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)          # draw on the axes handle directly
ax.set_xticklabels(names)    # label each box with its model's short name
plt.show()
# Under a general hypothesis test $\mathrm{H}_0: \text{null}$ versus $\mathrm{H}_1: \text{non-null}$,
# $\mathrm{Precision}=P(\mathrm{H}_1\mid \text{Rejection})=\frac{\text{True Positive}}{\text{True Positive}+\text{False Positive}}$
# $\mathrm{Recall}=P(\text{Rejection}\mid\mathrm{H}_1)=\frac{\text{True Positive}}{\text{True Positive}+\text{False Negative}}$
# $\mathrm{F}_{1}\text{-score}=\frac{2\,\mathrm{Precision}\cdot\mathrm{Recall}}{\mathrm{Precision}+\mathrm{Recall}}$
# Final check: fit k-nearest neighbours on the training split and report
# its performance on the held-out validation set.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
for report in (accuracy_score(Y_validation, predictions),
               confusion_matrix(Y_validation, predictions),
               classification_report(Y_validation, predictions)):
    print(report)