#import warnings
#from sklearn.datasets import load_iris
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23; use the standalone joblib package
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#https://scikit-learn.org/stable/ for images of classifiers
#p.128, "What each is good for":
#When working with a new dataset, it is in general a good idea to start with a
#simple model, such as a linear model or a naive Bayes or nearest neighbors
#classifier, and see how far you can get (see the baseline sketch after the
#train/test split below). After understanding more about the data, you can
#consider moving to an algorithm that can build more complex models, such as
#random forests, gradient boosted decision trees, SVMs, or neural networks.
#The numbers after the imports below are page references from the same text.
#Supervised
from sklearn.neighbors import KNeighborsClassifier #37
from sklearn.neighbors import KNeighborsRegressor #42
from sklearn.linear_model import LinearRegression #47
from sklearn.linear_model import Ridge #49
from sklearn.linear_model import Lasso #53
from sklearn.linear_model import LogisticRegression #57
from sklearn.svm import LinearSVC #57
from sklearn.tree import DecisionTreeClassifier #75
from sklearn.tree import DecisionTreeRegressor #81
from sklearn.ensemble import RandomForestClassifier #85
from sklearn.ensemble import GradientBoostingClassifier #89
from sklearn.svm import SVC #98
from sklearn.neural_network import MLPClassifier #108
#Unsupervised
from sklearn.decomposition import PCA #144
from sklearn.decomposition import NMF #159
#Digits
from sklearn.manifold import TSNE #166
from sklearn.cluster import KMeans #170
from sklearn.cluster import AgglomerativeClustering #183
from sklearn.cluster import DBSCAN #188
#https://machinelearningmastery.com/how-to-fix-futurewarning-messages-in-scikit-learn/
from sklearn.utils import check_random_state
from sklearn import svm
# Run a block of code and catch warnings (requires "import warnings" above):
#with warnings.catch_warnings():
#    warnings.filterwarnings("ignore")  # ignore all caught warnings
#    ...  # execute code that would generate warnings here
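# Alternative, per the linked article: suppress FutureWarning noise globally
# instead of per-block (a minimal sketch, assuming the warnings are safe to
# hide; uncomment "import warnings" above before enabling this line):
#warnings.simplefilter(action='ignore', category=FutureWarning)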
# Load and split the data
# Turn down for faster convergence
train_size = 500
test_size = 100
### load MNIST data from https://www.openml.org/d/554 (cached locally by scikit-learn after the first download)
# as_frame=False keeps X and y as NumPy arrays; newer scikit-learn versions
# otherwise return a DataFrame, which breaks the positional indexing below
X, y = datasets.fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
# shuffle data
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))  # no-op here: fetch_openml already returns (n_samples, 784)
# pick training and test data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=random_state)  # seeded so results are reproducible
#mnist = datasets.fetch_mldata("MNIST Original")  # fetch_mldata was removed in scikit-learn 0.22; use fetch_openml as above
#mnist = datasets.fetch_openml('mnist_784', version=1, return_X_y=False, as_frame=False)
#iris = load_iris()
#X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, test_size=0.2, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
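# Per the "start simple" advice above: a quick nearest-neighbors baseline
# before any grid search (a minimal sketch; n_neighbors=3 is an arbitrary choice)
knn_baseline = KNeighborsClassifier(n_neighbors=3)
knn_baseline.fit(X_train, y_train)
print('KNN baseline test accuracy: %.3f' % knn_baseline.score(X_test, y_test))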
# Construct some pipelines
# multi_class='auto' is the LogisticRegression default (and the parameter is deprecated in recent scikit-learn), so it is omitted
pipe_lr = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression(random_state=42, solver='lbfgs'))])
pipe_lr_pca = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('clf', LogisticRegression(random_state=42, solver='lbfgs'))])
pipe_rf = Pipeline([('scl', StandardScaler()), ('clf', RandomForestClassifier(random_state=42, n_estimators=100))])
pipe_rf_pca = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('clf', RandomForestClassifier(random_state=42, n_estimators=100))])
pipe_svm = Pipeline([('scl', StandardScaler()), ('clf', svm.SVC(random_state=42, gamma='scale'))])
pipe_svm_pca = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('clf', svm.SVC(random_state=42, gamma='scale'))])
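# Note: PCA(n_components=2) is a drastic reduction for 784-dimensional MNIST
# (2 components are mainly useful for visualization), so the *_pca pipelines
# will typically score well below their full-dimensional counterparts.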
# Set grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]
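# GridSearchCV addresses pipeline steps as '<step name>__<parameter>', so
# 'clf__C' sets C on the step registered as 'clf' above. The LR grid pins
# clf__solver to 'liblinear' (overriding the pipeline's 'lbfgs') because
# lbfgs supports only the 'l2' penalty, while liblinear handles 'l1' and 'l2'.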
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
'clf__C': param_range_fl,
'clf__solver': ['liblinear']}]
grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
'clf__min_samples_leaf': param_range,
'clf__max_depth': param_range,
'clf__min_samples_split': param_range[1:]}]
grid_params_svm = [{'clf__kernel': ['linear', 'rbf'],
'clf__C': param_range}]
# Construct grid searches
jobs = -1  # use all available CPU cores for the grid searches
gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=grid_params_lr, scoring='accuracy', cv=10, n_jobs=jobs)
gs_lr_pca = GridSearchCV(estimator=pipe_lr_pca, param_grid=grid_params_lr, scoring='accuracy', cv=10, n_jobs=jobs)
gs_rf = GridSearchCV(estimator=pipe_rf, param_grid=grid_params_rf, scoring='accuracy', cv=10, n_jobs=jobs)
gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca, param_grid=grid_params_rf, scoring='accuracy', cv=10, n_jobs=jobs)
gs_svm = GridSearchCV(estimator=pipe_svm, param_grid=grid_params_svm, scoring='accuracy', cv=10, n_jobs=jobs)
gs_svm_pca = GridSearchCV(estimator=pipe_svm_pca, param_grid=grid_params_svm, scoring='accuracy', cv=10, n_jobs=jobs)
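# With refit=True (the GridSearchCV default), each fitted search exposes the
# best model refit on the full training set, so gs.predict() below uses it.
# cv=10 on 500 training samples means each fold trains on 450 and validates on 50.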
# List of pipelines for ease of iteration
grids = [gs_lr, gs_lr_pca, gs_rf, gs_rf_pca, gs_svm, gs_svm_pca]
# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression',
1: 'Logistic Regression w/PCA',
2: 'Random Forest',
3: 'Random Forest w/PCA',
4: 'Support Vector Machine',
5: 'Support Vector Machine w/PCA'}
# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = None
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best score (mean cross-validated accuracy of the best params on the training data)
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params (computed once, reused below)
    acc = accuracy_score(y_test, y_pred)
    print('Test set accuracy score for best params: %.3f' % acc)
    # Track best (highest test accuracy) model
    if acc > best_acc:
        best_acc = acc
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])
# Save best grid search pipeline to file
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
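# Demonstrate reloading the saved pipeline, per the linked article
# (a minimal sketch using the file written above):
loaded_gs = joblib.load(dump_file)
print('Reloaded pipeline test accuracy: %.3f' % accuracy_score(y_test, loaded_gs.predict(X_test)))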