Board Game Review Prediction
Using Linear Regression and Random Forest Regression on game information, we predict a game's average user rating. The data comes from BoardGameGeek and was reorganized by a scraper.
- Required Packages
- Version Check
- Dataset Load
- Exploratory Data Analysis
- Preprocess Dataset
- Build Linear Regression Model
- Build Non-Linear Regression (RandomForestRegressor) Model
- Validate the model with individual test set
# Standard-library imports
import sys

# Third-party imports, grouped per PEP 8 (all imports precede executable code;
# the original set plt.rcParams before the sklearn import)
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

# Default figure size for every plot in this analysis
plt.rcParams['figure.figsize'] = (8, 8)

# Record the environment so results can be reproduced with the same versions
print('Python: {}'.format(sys.version))
print('Numpy: {}'.format(np.__version__))
print('Matplotlib: {}'.format(mpl.__version__))
print('Seaborn: {}'.format(sns.__version__))
print('Pandas: {}'.format(pd.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
Dataset Load
More information about the dataset and its columns is available on BoardGameGeek.
# Load the scraped BoardGameGeek dataset into a DataFrame.
games = pd.read_csv('./dataset/games.csv')
# Peek at the first rows (rendered in a notebook; a no-op in a plain script)
games.head()
# Summary statistics for the numeric columns
games.describe()
# List the available feature columns and the dataset dimensions
print(games.columns)
print(games.shape)
Our goal is to predict average_rating, but some rows contain a rating of 0, so we should remove them.
# Distribution of average ratings; note the spike of zero-rated games
plt.hist(games['average_rating']);
# Inspect the first game whose average rating is exactly zero
print(games.loc[games['average_rating'] == 0].iloc[0])
This row has no user ratings at all, so its zero average rating is meaningless.
# Compare with the first game that has a real (non-zero) rating
print(games[games['average_rating'] > 0].iloc[0])

# A zero average rating just means nobody voted, so keep only games that
# actually received user ratings.
games = games[games['users_rated'] > 0]

# Remove any rows with missing values; reassigning instead of
# dropna(inplace=True) avoids chained-assignment warnings on a filtered frame.
games = games.dropna(axis=0)

# Make a histogram of all the average ratings
plt.hist(games['average_rating']);

# Correlation matrix of the numeric features only — pandas >= 2.0 raises if
# non-numeric columns ('name', 'type') reach corr() without numeric_only=True.
corrmat = games.corr(numeric_only=True)
fig = plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
# Keep every column except identifiers, free text, and the rating targets
# (bayes_average_rating would leak the answer into the features).
excluded = {'bayes_average_rating', 'average_rating', 'type', 'name', 'id'}
columns = [c for c in games.columns.tolist() if c not in excluded]

# Store the variable we'll be predicting on
target = 'average_rating'

# 80/20 train/test split; the fixed random_state pins the sample so the
# split is reproducible across runs.
train = games.sample(frac=0.8, random_state=1)
test = games.loc[~games.index.isin(train.index)]

# Print shapes
print(train.shape)
print(test.shape)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize the model class
lr = LinearRegression()
# Fit the model with training data
lr.fit(train[columns], train[target])

# Predict on the held-out test set
predictions = lr.predict(test[columns])

# Compute error between actual values and test predictions.
# mean_squared_error is documented as (y_true, y_pred); the original passed
# them swapped — MSE is symmetric so the value is identical, but the
# conventional order keeps the call correct if the metric is ever changed.
print('MSE : {}'.format(mean_squared_error(test[target], predictions)))
from sklearn.ensemble import RandomForestRegressor

# Initialize the model class: 100 trees, leaves of >= 10 samples to limit
# overfitting, fixed seed for reproducibility
rfr = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model with training data
rfr.fit(train[columns], train[target])

# Predict on the held-out test set
predictions = rfr.predict(test[columns])

# Compute error between actual values and test predictions — y_true first,
# per the mean_squared_error signature (value unchanged; MSE is symmetric).
print('MSE : {}'.format(mean_squared_error(test[target], predictions)))
As a result, the non-linear Random Forest model achieves a better (lower MSE) result than the linear regression model.
# Sanity-check both fitted models on a single held-out game.
# Inspect the feature values for that game (rendered in a notebook)
test[columns].iloc[1]

# iloc[[1]] keeps a one-row DataFrame, so the feature names the models were
# fitted with are preserved — sklearn would otherwise warn about predicting
# on a raw array without valid feature names. Predictions are identical to
# the .values.reshape(1, -1) form.
rating_lr = lr.predict(test[columns].iloc[[1]])
rating_rfr = rfr.predict(test[columns].iloc[[1]])

# Print out the predictions alongside the true rating
print(rating_lr)
print(rating_rfr)
print(test[target].iloc[1])