Read, clean, and validate
A summary of the lecture "Exploratory Data Analysis in Python", via DataCamp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the table stored under the 'nsfg' key of the HDF5 file.
nsfg = pd.read_hdf('./dataset/nsfg.hdf5', key='nsfg')
# Report the (rows, columns) dimensions of the table.
print(nsfg.shape)
# List every column label.
print(nsfg.columns)
# Pull out the birthwgt_oz1 column (ounces).
ounces = nsfg.loc[:, 'birthwgt_oz1']
# Show the leading five values of ounces.
print(ounces.head(n=5))
# Frequency table of the raw nbrnaliv codes. Run as a plain script the value
# is discarded; it is only displayed in an interactive session.
nsfg['nbrnaliv'].value_counts()
# Replace the code 8 with NaN. The result is assigned back to the column
# rather than calling replace(..., inplace=True) on the selected column:
# that chained in-place call operates on a temporary object and does not
# update nsfg when pandas Copy-on-Write is active.
nsfg['nbrnaliv'] = nsfg['nbrnaliv'].replace([8], np.nan)
# Print the values and their frequencies
print(nsfg['nbrnaliv'].value_counts())
# Summary statistics of the two raw age columns. Run as a plain script these
# values are discarded; they are only displayed in an interactive session.
nsfg['agecon'].describe()
nsfg['agepreg'].describe()
# Scale both columns down by a factor of 100
# (presumably converting hundredths of a year to years -- confirm against
# the dataset codebook).
agecon = nsfg['agecon'].div(100)
agepreg = nsfg['agepreg'].div(100)
# Difference between the two scaled ages
preg_length = agepreg.sub(agecon)
# Summary statistics of that difference
print(preg_length.describe())
# pyplot functions draw on the current figure, so each histogram gets its own
# figure; otherwise the second histogram would be drawn over the first.
plt.figure()
# Filled histogram of agecon with 20 bins
plt.hist(agecon, bins=20)
# Label the axes
plt.xlabel("Age at conception")
plt.ylabel('Number of pregnancies')
# Write the filled histogram to disk
plt.savefig('../images/conception.png')

plt.figure()
# Same data drawn as an unfilled outline (histtype='step')
plt.hist(agecon, bins=20, histtype='step')
# Label the axes
plt.xlabel("Age at conception")
plt.ylabel('Number of pregnancies')
def resample_rows_weighted(df, column='wgt2013_2015'):
    """Resample the rows of a DataFrame with replacement, weighted by a column.

    Each draw picks a row with probability proportional to that row's value
    in ``column``; ``len(df)`` rows are drawn in total using NumPy's global
    random state (``np.random``).

    Args:
        df: DataFrame to resample. It is not modified.
        column: Name of the column holding the sampling weights.

    Returns:
        DataFrame with ``len(df)`` rows drawn from ``df``. Rows keep their
        original index labels, so labels may repeat in the result.

    Raises:
        ValueError: If the weights do not add up to a positive, finite
            total, or if NumPy rejects the resulting probabilities
            (for example because one of them is negative or NaN).
    """
    # Work on a float array so the input column is never touched and the
    # normalisation is a plain vectorised division.
    weights = df[column].to_numpy(dtype=float)
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError(
            f"weights in column {column!r} must sum to a positive, "
            f"finite value (got {total})"
        )
    probabilities = weights / total

    # Draw row *positions* rather than index labels: selecting by label
    # returns every row sharing that label, so a non-unique index (such as
    # the one a previous resample produces) would yield extra rows and skew
    # the weighting.
    positions = np.random.choice(
        len(df), size=len(df), replace=True, p=probabilities
    )
    return df.iloc[positions]
# Replace nsfg with a sample of its own rows, drawn with the wgt2013_2015
# column as weights.
nsfg = resample_rows_weighted(nsfg, column='wgt2013_2015')
# Clean the weight variables: the codes 98 and 99 become NaN
ounces = nsfg['birthwgt_oz1'].replace([98, 99], np.nan)
pounds = nsfg['birthwgt_lb1'].replace([98, 99], np.nan)
# Total birth weight in pounds (16 ounces to the pound)
birth_weight = pounds + ounces / 16
# Boolean mask of pregnancies with prglngth of at least 37
# (presumably weeks -- confirm against the dataset codebook)
full_term = nsfg['prglngth'].ge(37)
# Birth weights restricted to the full-term rows
full_term_weight = birth_weight.loc[full_term]
# Mean weight of full-term babies
print(full_term_weight.mean())
# Boolean mask of pregnancies with prglngth of at least 37
full_term = nsfg['prglngth'] >= 37
# Filter single birth
single = nsfg['nbrnaliv'] == 1
# Filter multiple births with an explicit comparison instead of ~single:
# comparisons against NaN are False, so ~single would also be True for rows
# whose nbrnaliv is missing and would count them as multiple births.
multiple = nsfg['nbrnaliv'] > 1
# Compute birth weight for single full-term babies
single_full_term_weight = birth_weight[single & full_term]
print('Single full-term mean:', single_full_term_weight.mean())
# Compute birth weight for multiple full-term babies
mult_full_term_weight = birth_weight[multiple & full_term]
print('Multiple full-term mean:', mult_full_term_weight.mean())