Distributions
A Summary of lecture "Exploratory Data Analysis in Python", via datacamp
- Probability mass functions
- Cumulative distribution functions
- Comparing distribution
- Modeling distributions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from empiricaldist import Pmf, Cdf
from scipy.stats import norm
gss = pd.read_hdf('./dataset/gss.hdf5', 'gss')
gss.head()
year = gss['year']
# Compute the PMF for year
pmf_year = Pmf.from_seq(year, normalize=False)
# Print the result
print(pmf_year)
age = gss['age']
# Make a PMF of age
pmf_age = Pmf.from_seq(age)
# Plot the PMF
pmf_age.bar()
# Label the axes
plt.xlabel('Age')
plt.ylabel('PMF')
age = gss['age']
# Compute the CDF of age
cdf_age = Cdf.from_seq(age)
# Calculate the CDF of 30
print(cdf_age(30))
print(1 - cdf_age(30))
income = gss['realinc']
cdf_income = Cdf.from_seq(income)
percentile_75th = cdf_income.inverse(0.75)
# Calculate the 25th percentile
percentile_25th = cdf_income.inverse(0.25)
# Calculate the interquartile range
iqr = percentile_75th - percentile_25th
# Print the interquartile range
print(iqr)
cdf_income.plot()
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
educ = gss['educ']
# Bachelor`s degree
bach = (educ >= 16)
# Associate degree
assc = ((educ >= 14) & (educ < 16))
# High school (12 or fewer years of education)
high = (educ <= 12)
print(high.mean())
income = gss['realinc']
# Plot the CDFs
Cdf.from_seq(income[high]).plot(label='High school')
Cdf.from_seq(income[assc]).plot(label='Associate')
Cdf.from_seq(income[bach]).plot(label='Bachelor')
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
plt.legend()
plt.savefig('../images/income-cdf.png')
log_income = np.log10(income)
# Compute mean and standard deviation
mean = np.mean(log_income)
std = np.std(log_income)
print(mean, std)
# Make a norm object
dist = norm(mean, std)
xs = np.linspace(2, 5.5)
ys = dist.cdf(xs)
# Plot the model CDF
plt.plot(xs, ys, color='gray')
# Create and plot the Cdf of log_income
Cdf.from_seq(log_income).plot()
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('CDF')
xs = np.linspace(2, 5.5)
ys = dist.pdf(xs)
# Plot the model CDF
plt.plot(xs, ys, color='gray')
# Plot the data KDE
sns.kdeplot(log_income)
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('CDF')