Required Packages

import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn

# Use an 8 x 8 default size for every figure drawn below.
mpl.rcParams['figure.figsize'] = (8, 8)

Version check

# Report the interpreter and core library versions this notebook ran with.
for label, version in (
    ('Python', sys.version),
    ('Numpy', np.__version__),
    ('Pandas', pd.__version__),
):
    print(f'{label}: {version}')
Python: 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]
Numpy: 1.18.1
Pandas: 1.0.1

Prepare dataset

The original data comes from the UCI Machine Learning Repository.

# Column labels for the raw file: class sign, promoter id, raw sequence.
names = ['Class', 'id', 'Sequence']
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'molecular-biology/promoter-gene-sequences/promoters.data'
)

# The file carries no header row, so the labels are supplied explicitly.
data = pd.read_csv(url, header=None, names=names)
data.head()
Class id Sequence
0 + S10 \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1 + AMPC \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2 + AROH \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3 + DEOP2 \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4 + LEU1_TRNA \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...

Preprocess Data

# Pull the label column out as a pandas Series and tally each class.
classes = data['Class']
classes.value_counts()
+    53
-    53
Name: Class, dtype: int64
sequences = data.loc[:, 'Sequence'].tolist()

# Map each sample number to its list of nucleotides followed by its class label.
dataset = {}

# Loop through the sequences and split them into individual nucleotides.
# Sequences and labels are paired positionally with zip() rather than looked
# up as classes[i], which is a label-based lookup that only matches the
# position while the Series keeps its default 0..n-1 index.
for i, (seq, label) in enumerate(zip(sequences, classes)):
    # Split into single characters, dropping the tab padding in the raw field.
    nucleotides = [base for base in seq if base != '\t']

    # The class assignment goes last, after the nucleotides.
    nucleotides.append(label)

    dataset[i] = nucleotides

print(dataset[0])
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
# Each dict entry becomes a column, so transpose to get one row per sample
# and one column per nucleotide position (plus the trailing class label).
frame_by_sample = pd.DataFrame(dataset)
df = frame_by_sample.transpose()
df
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 57
0 t a c t a g c a a t ... g c t t g t c g t +
1 t g c t a t c c t g ... c a t c g c c a a +
2 g t a c t a g a g a ... c a c c c g g c g +
3 a a t t g t g a t g ... a a c a a a c t c +
4 t c g a t a a t t a ... c c g t g g t a g +
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
101 c c t c a a t g g c ... g a a c t a t a t -
102 g t a t t c t c a a ... t c a a c a t t g -
103 c g c g a c t a c g ... a a g g c t t c c -
104 c t c g t c c t c a ... a g g a g g a a c -
105 t a a c a t t a a t ... t c a a g a a c t -

106 rows × 58 columns

# Column 57 holds the class label appended after the nucleotides; name it.
df = df.rename(columns={57: 'Class'})
df
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class
0 t a c t a g c a a t ... g c t t g t c g t +
1 t g c t a t c c t g ... c a t c g c c a a +
2 g t a c t a g a g a ... c a c c c g g c g +
3 a a t t g t g a t g ... a a c a a a c t c +
4 t c g a t a a t t a ... c c g t g g t a g +
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
101 c c t c a a t g g c ... g a a c t a t a t -
102 g t a t t c t c a a ... t c a a c a t t g -
103 c g c g a c t a c g ... a a g g c t t c c -
104 c t c g t c c t c a ... a g g a g g a a c -
105 t a a c a t t a a t ... t c a a g a a c t -

106 rows × 58 columns

Describe Dataset

# Summarise every column: count, number of unique values, most frequent
# value and how often that value occurs.
df.describe()
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class
count 106 106 106 106 106 106 106 106 106 106 ... 106 106 106 106 106 106 106 106 106 106
unique 4 4 4 4 4 4 4 4 4 4 ... 4 4 4 4 4 4 4 4 4 2
top t a a c a a a a a a ... c c c t t c c c t +
freq 38 34 30 30 36 42 38 34 33 36 ... 36 42 31 33 35 32 29 29 34 53

4 rows × 58 columns

# Count how often each symbol occurs in every column (one Series per column).
series = [df[column].value_counts() for column in df.columns]

# Stack the per-column counts as rows, then flip so that symbols index the
# rows and the original columns run across the top.
info = pd.DataFrame(series)
details = info.transpose()
details
0 1 2 3 4 5 6 7 8 9 ... 48 49 50 51 52 53 54 55 56 Class
t 38.0 26.0 27.0 26.0 22.0 24.0 30.0 32.0 32.0 28.0 ... 21.0 22.0 23.0 33.0 35.0 30.0 23.0 29.0 34.0 NaN
c 27.0 22.0 21.0 30.0 19.0 18.0 21.0 20.0 22.0 22.0 ... 36.0 42.0 31.0 32.0 21.0 32.0 29.0 29.0 17.0 NaN
a 26.0 34.0 30.0 22.0 36.0 42.0 38.0 34.0 33.0 36.0 ... 23.0 24.0 28.0 27.0 25.0 22.0 26.0 24.0 27.0 NaN
g 15.0 24.0 28.0 28.0 29.0 22.0 17.0 20.0 19.0 20.0 ... 26.0 18.0 24.0 14.0 25.0 22.0 28.0 24.0 28.0 NaN
+ NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 53.0
- NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 53.0

6 rows × 58 columns

# One-hot encode every column: each (column, symbol) pair becomes its own
# indicator column named "<column>_<symbol>".
# The dtype is pinned so the indicators stay 0/1 integers; newer pandas
# releases default to booleans, which would make the frame non-numeric.
numerical_df = pd.get_dummies(df, dtype=np.uint8)
numerical_df.head()
0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class_+ Class_-
0 0 0 0 1 1 0 0 0 0 1 ... 0 0 1 0 0 0 0 1 1 0
1 0 0 0 1 0 0 1 0 0 1 ... 1 0 0 0 1 0 0 0 1 0
2 0 0 1 0 0 0 0 1 1 0 ... 0 1 0 0 0 0 1 0 1 0
3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 1 0 1 0 0 1 0
4 0 0 0 1 0 1 0 0 0 0 ... 1 0 0 0 0 0 1 0 1 0

5 rows × 230 columns

# The two class indicators are redundant: keep only 'Class_+' and expose it
# as a single binary 'Class' column (1 = '+', 0 = '-').
df = (
    numerical_df
    .drop(columns=['Class_-'])
    .rename(columns={'Class_+': 'Class'})
)
df
0_a 0_c 0_g 0_t 1_a 1_c 1_g 1_t 2_a 2_c ... 54_t 55_a 55_c 55_g 55_t 56_a 56_c 56_g 56_t Class
0 0 0 0 1 1 0 0 0 0 1 ... 0 0 0 1 0 0 0 0 1 1
1 0 0 0 1 0 0 1 0 0 1 ... 0 1 0 0 0 1 0 0 0 1
2 0 0 1 0 0 0 0 1 1 0 ... 0 0 1 0 0 0 0 1 0 1
3 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 1 0 1 0 0 1
4 0 0 0 1 0 1 0 0 0 0 ... 1 1 0 0 0 0 0 1 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
101 0 1 0 0 0 1 0 0 0 0 ... 1 1 0 0 0 0 0 0 1 0
102 0 0 1 0 0 0 0 1 1 0 ... 1 0 0 0 1 0 0 1 0 0
103 0 1 0 0 0 0 1 0 0 1 ... 1 0 1 0 0 0 1 0 0 0
104 0 1 0 0 0 0 0 1 0 1 ... 0 1 0 0 0 0 1 0 0 0
105 0 0 0 1 1 0 0 0 1 0 ... 0 0 1 0 0 0 0 0 1 0

106 rows × 229 columns

Build the Machine Learning Model

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
# Separate the one-hot encoded features from the binary class label.
X = df.drop(['Class'], axis=1).to_numpy()
y = df['Class'].to_numpy()

# Split the data into training and test datasets (75% / 25%).
# A fixed random_state makes the shuffled split reproducible, so reruns
# compare the models on the same held-out samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=1
)

# Metric used when cross-validating the models.
scoring = 'accuracy'

# Define models to train; names[i] is the display label for classifiers[i].
names = ['K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest',
         'Neural Network', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid']

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=500),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]

models = zip(names, classifiers)

# Evaluate each model in turn with 10-fold cross-validation on the training
# data. `names` is left intact (not cleared and rebuilt) so it always stays
# aligned with `classifiers`.
results = []

# One splitter with a fixed seed: every model is scored on exactly the same
# folds, so differences in the scores come from the models rather than from
# a different random partition per model.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    # Report the mean accuracy and its standard deviation across the folds.
    msg = '{0}:  {1}  ({2})'.format(name, cv_results.mean(), cv_results.std())
    print(msg)
K Nearest Neighbors:  0.7946428571428571  (0.1569499627789906)
Gaussian Process:  0.9125  (0.08003905296791061)
Decision Tree:  0.7839285714285714  (0.15980895801307413)
Random Forest:  0.6339285714285714  (0.187329854774395)
Neural Network:  0.8732142857142857  (0.0969726713027533)
AdaBoost:  0.8482142857142858  (0.12222689256176861)
Naive Bayes:  0.8607142857142858  (0.11785714285714285)
SVM Linear:  0.8964285714285714  (0.08253014291636673)
SVM RBF:  0.8607142857142858  (0.13044273119821195)
SVM Sigmoid:  0.95  (0.09999999999999999)
models = zip(names, classifiers)

# Fit every classifier on the full training split, then report its accuracy
# and per-class metrics on the held-out test split.
for name, model in models:
    predictions = model.fit(X_train, y_train).predict(X_test)
    print(
        name,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        sep='\n',
    )
K Nearest Neighbors
0.8518518518518519
              precision    recall  f1-score   support

           0       1.00      0.73      0.85        15
           1       0.75      1.00      0.86        12

    accuracy                           0.85        27
   macro avg       0.88      0.87      0.85        27
weighted avg       0.89      0.85      0.85        27

Gaussian Process
0.9259259259259259
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.83      0.91        12

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.92        27
weighted avg       0.93      0.93      0.92        27

Decision Tree
0.8148148148148148
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.77      0.83      0.80        12

    accuracy                           0.81        27
   macro avg       0.81      0.82      0.81        27
weighted avg       0.82      0.81      0.82        27

Random Forest
0.8148148148148148
              precision    recall  f1-score   support

           0       0.92      0.73      0.81        15
           1       0.73      0.92      0.81        12

    accuracy                           0.81        27
   macro avg       0.82      0.82      0.81        27
weighted avg       0.84      0.81      0.81        27

Neural Network
0.9259259259259259
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.83      0.91        12

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.92        27
weighted avg       0.93      0.93      0.92        27

AdaBoost
0.9259259259259259
              precision    recall  f1-score   support

           0       1.00      0.87      0.93        15
           1       0.86      1.00      0.92        12

    accuracy                           0.93        27
   macro avg       0.93      0.93      0.93        27
weighted avg       0.94      0.93      0.93        27

Naive Bayes
0.9259259259259259
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.83      0.91        12

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.92        27
weighted avg       0.93      0.93      0.92        27

SVM Linear
0.8888888888888888
              precision    recall  f1-score   support

           0       0.88      0.93      0.90        15
           1       0.91      0.83      0.87        12

    accuracy                           0.89        27
   macro avg       0.89      0.88      0.89        27
weighted avg       0.89      0.89      0.89        27

SVM RBF
0.9259259259259259
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.83      0.91        12

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.92        27
weighted avg       0.93      0.93      0.92        27

SVM Sigmoid
0.8518518518518519
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        15
           1       0.83      0.83      0.83        12

    accuracy                           0.85        27
   macro avg       0.85      0.85      0.85        27
weighted avg       0.85      0.85      0.85        27