Relationships
A summary of the lecture "Exploratory Data Analysis in Python", via DataCamp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from empiricaldist import Pmf, Cdf
from scipy.stats import linregress
# Load the 'brfss' table from the HDF5 file into a DataFrame.
brfss_original = pd.read_hdf('./dataset/brfss.hdf5', key='brfss')

# Build the PMF of the 'AGE' column and draw it as a bar chart.
age = Pmf.from_seq(brfss_original['AGE'])
age.bar()

# Axis labels for the PMF chart.
plt.ylabel('PMF')
plt.xlabel('Age in years')
# Restrict the data to its first 1000 rows.
brfss = brfss_original[:1000]

# Pull out the two columns to compare: age and weight.
age, weight = brfss['AGE'], brfss['WTKG3']

# Scatter plot drawn with translucent circle markers (alpha=0.1), so that
# overlapping points show up darker.
plt.plot(age, weight, 'o', alpha=0.1)
plt.ylabel('Weight in kg')
plt.xlabel('Age in years')
# Restrict the data to its first 1000 rows.
brfss = brfss_original[:1000]

# Jitter the ages with Gaussian noise (mean 0, standard deviation 2.5), one
# draw per row; the weights are used unchanged.
age = brfss['AGE'] + np.random.normal(loc=0, scale=2.5, size=len(brfss))
weight = brfss['WTKG3']

# Scatter plot of weight against jittered age, using small translucent markers.
plt.plot(age, weight, 'o', alpha=0.2, markersize=5)
plt.ylabel('Weight in kg')
plt.xlabel('Age in years')
# Drop rows missing either height ('_HTMG10') or weight ('WTKG3').
data = brfss_original.dropna(subset=['_HTMG10', 'WTKG3'])

# Start a fresh figure so the image saved below contains only this box plot.
# Without it, pyplot draws onto whatever figure is already current, and the
# log scale set below would be applied to those earlier plots as well.
plt.figure()

# Make a box plot of weight for each height value. whis=10 lets the whiskers
# reach up to 10x the IQR beyond the quartiles, so far fewer points are drawn
# as individual outliers.
sns.boxplot(x='_HTMG10', y='WTKG3', data=data, whis=10)

# Plot the y-axis on a log scale
plt.yscale('log')

# Remove unneeded lines and label axes
sns.despine(left=True, bottom=True)
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')

# Write the finished figure to disk.
plt.savefig('../images/brfss-boxplot.png')
# Select the income column.
income = brfss_original['INCOME2']

# Build its PMF, then draw it as a bar chart.
income_pmf = Pmf.from_seq(income)
income_pmf.bar()

# Axis labels for the PMF chart.
plt.ylabel('PMF')
plt.xlabel('Income level')
# Drop rows missing either income ('INCOME2') or height ('HTM4').
data = brfss_original.dropna(subset=['INCOME2', 'HTM4'])

# Violin plot of height for each income level; inner=None leaves the violin
# interiors empty.
sns.violinplot(data=data, x='INCOME2', y='HTM4', inner=None)

# Label the axes, then strip the left and bottom spines.
plt.ylabel('Height in cm')
plt.xlabel('Income level')
sns.despine(left=True, bottom=True)
# Pick the three columns whose pairwise correlations are of interest.
columns = ['AGE', 'INCOME2', '_VEGESU1']
subset = brfss_original.loc[:, columns]

# Show the correlation matrix of those columns.
correlations = subset.corr()
print(correlations)
# Drop rows missing either variable so the regression only sees complete
# (income, vegetable servings) pairs.
subset = brfss_original.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']

# Fit vegetable servings as a linear function of income code and print the
# result object (the plot further down reads its slope and intercept).
res = linregress(xs, ys)
print(res)
# Draw on a new 10x10-inch figure.
plt.figure(figsize=(10, 10))

# Scatter plot, with Gaussian jitter (standard deviation 0.15) added to x.
x_jitter = xs + np.random.normal(loc=0, scale=0.15, size=len(xs))
plt.plot(x_jitter, ys, 'o', alpha=0.2)

# Fitted line, evaluated at the two ends of the x range.
fx = np.array([xs.min(), xs.max()])
fy = res.slope * fx + res.intercept
plt.plot(fx, fy, '-', alpha=0.7)

# Label the axes and limit the y-axis to the 0-6 range.
plt.ylabel('Vegetable servings per day')
plt.xlabel('Income code')
plt.ylim([0, 6])