Diabetes Onset Detection
This project demonstrates a deep neural network combined with grid search for detecting the onset of diabetes. GridSearchCV is used for hyperparameter tuning. The original data (the PIMA Indian diabetes dataset) comes from the UCI Machine Learning Repository.
- Required Packages
- Version check
- Prepare Dataset
- Preprocess Dataset
- Build Neural Network
- Define grid Search
- Applying Dropout, Optimizing learning rate
- Weight Initialization, Activation function
- Number of Neurons
- Predict with Optimal hyperparameters
import sys
import datetime
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
import tensorflow as tf
# Use a consistent 8x8-inch default figure size for all plots in this project.
plt.rcParams['figure.figsize'] = (8, 8)

# Report the version of every library used, for reproducibility.
print(f'Python: {sys.version}')
print(f'Numpy: {np.__version__}')
print(f'Matplotlib: {mpl.__version__}')
print(f'Seaborn: {sns.__version__}')
print(f'Pandas: {pd.__version__}')
print(f'Scikit-learn: {sklearn.__version__}')
print(f'Tensorflow: {tf.__version__}')
Prepare Dataset
The PIMA Indian diabetes dataset is currently also available on Kaggle.
# Load the PIMA Indian diabetes dataset from disk.
df = pd.read_csv('./dataset/datasets_228_482_diabetes.csv')
df.head()
df.describe()

# A value of zero is physiologically impossible for these measurements,
# so it encodes a missing value; inspect the affected rows first.
df[df['Glucose'] == 0]
columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Replace the zero placeholders with NaN. Assigning the result back avoids
# the chained `inplace=True` call on a column selection, which is deprecated
# in pandas 2.x and removed in pandas 3.0.
df[columns] = df[columns].replace(0, np.nan)
df.describe()

# Drop every row that still contains at least one missing value.
df.dropna(inplace=True)
# Summarize the number of rows and columns in df
df.describe()

# Split into the feature matrix X (first 8 columns) and the integer
# target vector y (last column: 1 = diabetes onset, 0 = none).
dataset = df.values
print(dataset)
print(dataset.shape)
X = dataset[:, :-1]
y = dataset[:, -1].astype(int)
print(X.shape, y.shape)

# Standardize every feature to zero mean / unit variance so the network
# trains on comparable scales.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_standard = scaler.transform(X)
data = pd.DataFrame(X_standard)
data.describe()
from sklearn.model_selection import GridSearchCV, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam
def create_model():
    """Build and compile the baseline feed-forward binary classifier.

    Architecture: 8 -> 4 -> 1 (sigmoid output) matching the 8 PIMA features.

    Returns:
        A compiled tf.keras ``Sequential`` model using Adam and
        binary cross-entropy loss.
    """
    # Create model
    model = Sequential()
    model.add(Dense(8, input_shape=(8, ), kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model. `lr` is deprecated in TF2 — `learning_rate` is the
    # supported keyword.
    model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Instantiate once outside the grid search to inspect the architecture.
model = create_model()
print(model.summary())

# Fix the RNG seed so the cross-validation splits are reproducible.
seed = 6
np.random.seed(seed)

# Wrap the Keras builder so scikit-learn's GridSearchCV can drive it.
model = KerasClassifier(build_fn=create_model, verbose=False)

# Training hyperparameters to explore.
batch_size = [10, 20, 40]
epochs = [10, 50, 100]
# Make a dictionary of the grid search parameters
param_grid = {
    'batch_size': batch_size,
    'epochs': epochs
}

# Build and fit the GridSearchCV. KFold needs shuffle=True for
# random_state to take effect (sklearn >= 0.24 raises ValueError otherwise).
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    cv=KFold(shuffle=True, random_state=seed), verbose=10)
grid_results = grid.fit(X_standard, y)

# Summarize the best configuration and the full CV results table.
print('Best: {0}, using {1} '.format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))
from tensorflow.keras.layers import Dropout
def create_model(learning_rate, dropout_rate):
    """Build the classifier with tunable learning rate and dropout.

    Args:
        learning_rate: Adam optimizer learning rate.
        dropout_rate: Dropout probability applied after each hidden layer.

    Returns:
        A compiled tf.keras ``Sequential`` model (8 -> 4 -> 1, sigmoid output).
    """
    # Create model
    model = Sequential()
    model.add(Dense(8, input_shape=(8, ), kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model. `lr` is deprecated in TF2 — use `learning_rate`.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Create model with KerasClassifier; batch size / epochs fixed to the
# values found in the previous grid search.
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=20, verbose=False)

# Define Grid Search parameters: learning rate and dropout probability.
learning_rates = [0.001, 0.01, 0.1]
dropout_rates = [0.0, 0.1, 0.2]
# Make a dictionary of the grid search parameters
param_grid = {
    'learning_rate': learning_rates,
    'dropout_rate': dropout_rates
}

# Build and fit the GridSearchCV. KFold needs shuffle=True for
# random_state to take effect (sklearn >= 0.24 raises ValueError otherwise).
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    cv=KFold(shuffle=True, random_state=seed), verbose=10)
grid_results = grid.fit(X_standard, y)

# Summarize the best configuration and the full CV results table.
print('Best: {0}, using {1} '.format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))
def create_model(activation, initializer):
    """Build the classifier with tunable activation and weight initializer.

    Args:
        activation: Activation function name for the hidden layers.
        initializer: Kernel initializer name for the hidden layers.

    Returns:
        A compiled tf.keras ``Sequential`` model (8 -> 4 -> 1, sigmoid output).
    """
    # Create model
    model = Sequential()
    model.add(Dense(8, input_shape=(8, ), kernel_initializer=initializer, activation=activation))
    model.add(Dense(4, kernel_initializer=initializer, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model. `lr` is deprecated in TF2 — use `learning_rate`.
    model.compile(optimizer=Adam(learning_rate=0.1), loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Create model with KerasClassifier; training schedule fixed from the
# earlier searches.
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=20, verbose=False)

# Define Grid Search parameters: activation functions and initializers.
activations = ['softmax', 'relu', 'tanh', 'linear']
initializers = ['normal', 'uniform', 'zero']
# Make a dictionary of the grid search parameters
param_grid = {
    'activation': activations,
    'initializer': initializers
}

# Build and fit the GridSearchCV. KFold needs shuffle=True for
# random_state to take effect (sklearn >= 0.24 raises ValueError otherwise).
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    cv=KFold(shuffle=True, random_state=seed), verbose=10)
grid_results = grid.fit(X_standard, y)

# Summarize the best configuration and the full CV results table.
print('Best: {0}, using {1} '.format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))
def create_model(neuron1, neuron2):
    """Build the classifier with tunable hidden-layer widths.

    Uses the 'uniform' initializer and 'linear' activation found best in
    the previous search.

    Args:
        neuron1: Number of units in the first hidden layer.
        neuron2: Number of units in the second hidden layer.

    Returns:
        A compiled tf.keras ``Sequential`` model with a sigmoid output.
    """
    # Create model
    model = Sequential()
    model.add(Dense(neuron1, input_shape=(8, ), kernel_initializer='uniform', activation='linear'))
    model.add(Dense(neuron2, kernel_initializer='uniform', activation='linear'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model. `lr` is deprecated in TF2 — use `learning_rate`.
    model.compile(optimizer=Adam(learning_rate=0.1), loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Create model with KerasClassifier; training schedule fixed from the
# earlier searches.
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=20, verbose=False)

# Define Grid Search parameters: widths of the two hidden layers.
neuron1 = [4, 8, 16]
neuron2 = [2, 4, 8]
# Make a dictionary of the grid search parameters
param_grid = {
    'neuron1': neuron1,
    'neuron2': neuron2
}

# Build and fit the GridSearchCV. refit=True keeps the best estimator for
# prediction afterwards. KFold needs shuffle=True for random_state to take
# effect (sklearn >= 0.24 raises ValueError otherwise).
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    cv=KFold(shuffle=True, random_state=seed), refit=True,
                    verbose=10)
grid_results = grid.fit(X_standard, y)

# Summarize the best configuration and the full CV results table.
print('Best: {0}, using {1} '.format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))
# Predict on the full (standardized) dataset with the refitted best model.
# NOTE(review): this evaluates on the training data, so the accuracy below
# is optimistic — a held-out test split would give an honest estimate.
y_pred = grid.predict(X_standard)
y_pred.shape
y_pred[:5]
from sklearn.metrics import classification_report, accuracy_score
# Overall accuracy plus per-class precision/recall/F1.
print(accuracy_score(y, y_pred))
print(classification_report(y, y_pred))
# Single-sample sanity check: show the first record and its prediction.
example = df.iloc[0]
example
# reshape(1, -1) turns the 1-D feature row into the (1, n_features)
# 2-D array that predict() expects.
prediction = grid.predict(X_standard[0].reshape(1, -1))
prediction