Stock Market Clustering with a KMeans algorithm
This project uses a clustering algorithm to detect similar companies based on their stock market movements. The original data comes from Yahoo Finance.
- Required Packages
- Version check
- Prepare dataset
- Visualization
- Preprocessing dataset
- Build Pipeline
- Clustering
- Dimensionality Reduction with PCA
- Visualize cluster
import sys
import datetime
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas_datareader import data
import sklearn
# Default size (in inches) for every figure created below.
plt.rcParams['figure.figsize'] = (8, 8)

# Report the interpreter and library versions in use, one per line.
for template, version in (
    ('Python: {}', sys.version),
    ('Numpy: {}', np.__version__),
    ('Matplotlib: {}', mpl.__version__),
    ('Seaborn: {}', sns.__version__),
    ('Pandas: {}', pd.__version__),
    ('Scikit-learn: {}', sklearn.__version__),
):
    print(template.format(version))
# Display name -> ticker symbol for every company included in the study.
# NOTE(review): 'Mistubishi' and 'General Electrics' look like misspellings;
# the keys are kept as-is because they are printed and stored verbatim below.
companies_dict = {
    'Amazon': 'AMZN',
    'Apple': 'AAPL',
    'Walgreen': 'WBA',
    'Northrop Grumman': 'NOC',
    'Boeing': 'BA',
    'Lockheed Martin': 'LMT',
    'McDonalds': 'MCD',
    'Intel': 'INTC',
    'Navistar': 'NAV',
    'IBM': 'IBM',
    'Texas Instruments': 'TXN',
    'MasterCard': 'MA',
    'Microsoft': 'MSFT',
    'General Electrics': 'GE',
    'American Express': 'AXP',
    'Pepsi': 'PEP',
    'Coca Cola': 'KO',
    'Johnson & Johnson': 'JNJ',
    'Toyota': 'TM',
    'Honda': 'HMC',
    'Mistubishi': 'MSBHY',
    'Sony': 'SNE',
    'Exxon': 'XOM',
    'Chevron': 'CVX',
    'Valero Energy': 'VLO',
    'Ford': 'F',
    'Bank of America': 'BAC',
}

# (name, ticker) pairs ordered alphabetically by ticker symbol.
companies = sorted(companies_dict.items(), key=lambda pair: pair[1])
print(companies)

# The ticker symbols alone, in that same order.
companies_sorted = [ticker for _, ticker in companies]
companies_sorted
# Name of the remote source passed to pandas_datareader.
# NOTE(review): whether the 'yahoo' source still works depends on the
# installed pandas_datareader version - confirm before relying on it.
data_source = 'yahoo'
# Define the start and end dates of the requested price history
start_date = '2015-01-01'
end_date = '2017-12-31'
# Use pandas_datareader.data.DataReader to load the stock data for every
# ticker in companies_sorted over the date range above.
# NOTE(review): the two unstack() calls followed by .T presumably round-trip
# the result back to a frame that can be indexed by 'Close' / 'Open' below,
# with one column per ticker - confirm against the DataReader return layout.
panel_data = data.DataReader(companies_sorted, data_source, start_date, end_date).unstack().unstack().T
# Print Axes labels
print(panel_data.axes)
# Select the closing and opening prices used to compute daily movements.
stock_close = panel_data['Close']
stock_open = panel_data['Open']
# Show the first row of the closing prices as a quick sanity check.
print(stock_close.iloc[0])
# Convert both price tables to plain arrays and transpose them, so that each
# row corresponds to one column of the original table.
# (presumably one row per ticker and one column per trading day - verify
# against the layout of panel_data)
stock_close = np.array(stock_close).transpose()
stock_open = np.array(stock_open).transpose()

row, col = stock_close.shape
print(row, col)

# Movement = close - open, computed for the whole table in one step and
# stored in a float buffer of the same shape.
movements = np.zeros((row, col))
movements[...] = stock_close - stock_open

# Print the summed movement of each row next to the matching company name.
for idx, company in enumerate(companies):
    print('Company: {}, Change: {}'.format(company[0], sum(movements[idx, :])))
# Plot rows 0 and 1 of `movements` side by side on a shared y-axis, in the
# top two cells of a 2x2 grid.
plt.figure(figsize=(18, 16))

ax1 = plt.subplot(221)
plt.plot(movements[0][:])
# Each entry of `companies` is a (name, ticker) tuple; format it explicitly
# so the title reads "Name (TICKER)" rather than the tuple's repr.
plt.title('{} ({})'.format(*companies[0]))

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(movements[1][:])
plt.title('{} ({})'.format(*companies[1]))
from sklearn.preprocessing import Normalizer

# Fit a normalizer on the raw movements and keep the rescaled copy in `new`.
normalizer = Normalizer()
new = normalizer.fit_transform(movements)

# Report the largest, smallest and mean value of the normalized data.
for summary in (new.max(), new.min(), new.mean()):
    print(summary)
# Plot rows 0 and 1 of the normalized data `new` side by side on a shared
# y-axis, in the top two cells of a 2x2 grid.
plt.figure(figsize=(18, 16))

ax1 = plt.subplot(221)
plt.plot(new[0][:])
# Each entry of `companies` is a (name, ticker) tuple; format it explicitly
# so the title reads "Name (TICKER)" rather than the tuple's repr.
plt.title('{} ({})'.format(*companies[0]))

ax2 = plt.subplot(222, sharey=ax1)
plt.plot(new[1][:])
plt.title('{} ({})'.format(*companies[1]))
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

# Define normalizer (a fresh instance, fitted as the first pipeline step)
normalizer = Normalizer()

# Create a KMeans model - 10 clusters.
# random_state pins the centroid initialisation so repeated runs produce the
# same clusters; n_init is spelled out because its default differs between
# scikit-learn releases, which would otherwise change the result silently.
kmeans = KMeans(n_clusters=10, max_iter=1000, n_init=10, random_state=0)

# Make a pipeline chaining normalizer and kmeans, then fit it on the raw
# movements (the pipeline normalizes them before clustering).
pipeline = make_pipeline(normalizer, kmeans)
pipeline.fit(movements)

# make_pipeline keeps the estimator objects it was given, so the fitted
# inertia can be read directly from `kmeans`.
print(kmeans.inertia_)

# Cluster index assigned to every row of `movements`.
labels = pipeline.predict(movements)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display df sorted by cluster label
print(df.sort_values('labels'))
from sklearn.decomposition import PCA

# Project the normalized movements onto their first two principal components
# so the clusters can be drawn in a plane.
reduced_data = PCA(n_components=2).fit_transform(new)

# Run kmeans on the reduced data.
# random_state pins the centroid initialisation so repeated runs produce the
# same clusters; n_init is spelled out because its default differs between
# scikit-learn releases, which would otherwise change the result silently.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
# fit_predict fits the model and returns the cluster index of every sample.
labels = kmeans.fit_predict(reduced_data)
print(kmeans.inertia_)

# Create a DataFrame aligning labels and companies
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values('labels'))
# Step size of the background grid used to paint the cluster regions.
h = 0.01

# Bounding box of the 2-D points, padded by 1 on every side.
x_min, y_min = reduced_data.min(axis=0) - 1
x_max, y_max = reduced_data.max(axis=0) + 1

# Regular grid covering the padded bounding box.
grid_x = np.arange(x_min, x_max, h)
grid_y = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(grid_x, grid_y)

# Ask the fitted model which cluster each grid point belongs to, then fold
# the flat answer back into the shape of the grid for display.
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = kmeans.predict(mesh_points).reshape(xx.shape)

# Colour map for the cluster regions.
cmap = plt.cm.Paired

# Draw the regions as an image, with the data points on top as black dots.
plt.figure(figsize=(10, 10))
plt.imshow(
    Z,
    interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=cmap,
    aspect='auto',
    origin='lower',
)
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)

# Mark every cluster centre with a white X drawn above everything else.
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='x',
    s=169,
    linewidths=3,
    color='w',
    zorder=10,
)

plt.title('K-Means clustering on Stock Market Movements (PCA-Reduced Data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)