DNA Classification
This project builds machine learning models for classifying DNA sequences. K-Nearest Neighbors, Support Vector Machines, and several other classification algorithms are used. The original data is from the UCI Machine Learning Repository.
- Required Packages
- Version check
- Prepare dataset
- Preprocess Data
- Describe Dataset
- Build the Machine Learning Model
import sys
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn
# Default every figure drawn below to an (8, 8) figure size.
plt.rcParams.update({'figure.figsize': (8, 8)})

# Report the interpreter and core library versions in use for this run.
for version_line in (
    'Python: {}'.format(sys.version),
    'Numpy: {}'.format(np.__version__),
    'Pandas: {}'.format(pd.__version__),
):
    print(version_line)
Prepare dataset
The original data is from the UCI Machine Learning Repository.
# Column labels assigned to the downloaded table.
names = ['Class', 'id', 'Sequence']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
data = pd.read_csv(filepath_or_buffer=url, names=names)
data.head()

# A single DataFrame column is a Series; take the label column and tally
# how many rows fall into each class.
classes = data['Class']
classes.value_counts()
sequences = data['Sequence'].tolist()

# Map each row number to that row's individual nucleotides (tab characters
# dropped), followed by the row's class label as the final element.
dataset = {
    row: [base for base in sequence if base != '\t'] + [classes[row]]
    for row, sequence in enumerate(sequences)
}
print(dataset[0])
# Each dict entry becomes a column, so transpose to get one row per sequence.
df = pd.DataFrame(dataset).transpose()
df
# Column 57 is treated as the class label; give it a readable name.
df = df.rename(columns={57: 'Class'})
df
df.describe()
# Count how often each distinct value occurs, one column of df at a time.
series = [df[name].value_counts() for name in df.columns]
# Stack the per-column counts into a frame, then transpose it for display.
info = pd.DataFrame(series)
details = info.transpose()
details
# One-hot encode every column of df into indicator columns.
numerical_df = pd.get_dummies(data=df)
numerical_df.head()
# Drop the 'Class_-' indicator and keep 'Class_+' alone, renamed to 'Class',
# as the single target column.
df = numerical_df.drop(columns=['Class_-']).rename(columns={'Class_+': 'Class'})
df
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score
# Feature matrix: every column except the target. Target vector: 'Class'.
X = df.drop(columns=['Class']).to_numpy()
y = df['Class'].to_numpy()

# Split the data into training and test datasets (25% held out).
# The shuffle is seeded so the same rows land in each split on every run;
# without a seed the partition, and every score reported below, changes
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=1
)
# Metric passed to cross_val_score when the models are evaluated.
scoring = 'accuracy'

# Define models to train. `names` and `classifiers` are parallel lists:
# entry k of `names` labels entry k of `classifiers`.
names = [
    'K Nearest Neighbors', 'Gaussian Process', 'Decision Tree', 'Random Forest',
    'Neural Network', 'AdaBoost', 'Naive Bayes', 'SVM Linear', 'SVM RBF', 'SVM Sigmoid',
]
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=500),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]
# Materialise the (name, classifier) pairs as a list. A bare zip() is a
# one-shot iterator: once a loop has consumed it, any later loop over
# `models` (e.g. re-running a notebook cell) would silently do nothing.
models = list(zip(names, classifiers))
# Evaluate each model in turn with 10-fold cross-validation on the training
# split, printing the mean and standard deviation of the fold scores.
results = []
names = []
# One seeded splitter shared by every model. Building an unseeded shuffled
# KFold per model would score each model on different random folds, making
# the scores neither comparable across models nor reproducible across runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    # `names` is rebuilt here in evaluation order, parallel to `results`.
    names.append(name)
    msg = '{0}: {1} ({2})'.format(name, cv_results.mean(), cv_results.std())
    print(msg)
# Pair the names with the classifiers again, then fit each classifier on the
# training split and report its accuracy and per-class metrics on the
# held-out test split.
models = zip(names, classifiers)
for name, model in models:
    predictions = model.fit(X_train, y_train).predict(X_test)
    print(name)
    accuracy = accuracy_score(y_test, predictions)
    print(accuracy)
    report = classification_report(y_test, predictions)
    print(report)