<!DOCTYPE html>
import pandas as pd
import io
import requests
from IPython.core.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import scipy
%matplotlib inline
pd.options.display.max_columns = 50
def sample_size_from_stats(mean1, mean2, std1, std2, alpha=0.05, beta=0.2, two_sided=True, k=None):
    '''Sample size needed for comparing the means of two normally distributed
    samples with significance level alpha and power 1 - beta (Rosner Eq. 8.24/8.25).

    Parameters:
        mean1, mean2 -- {float} anticipated group means
        std1, std2 -- {float} anticipated group standard deviations
        alpha -- {float} type-I error rate
        beta -- {float} type-II error rate (power = 1 - beta)
        two_sided -- {bool} whether the test is two-sided
        k -- {float or None} ratio n2/n1; None means equal group sizes

    Returns:
        n {int} -- per-group size (rounded up) when k is None,
        otherwise (n1, n2) {tuple of float}.
    '''
    from math import ceil
    from scipy.stats import norm  # was relying on a notebook-global `norm`

    power = 1 - beta
    if two_sided:
        z_alpha = norm.ppf(q=1 - alpha / 2, loc=0, scale=1)
    else:
        z_alpha = norm.ppf(q=1 - alpha, loc=0, scale=1)
    z_power = norm.ppf(q=power, loc=0, scale=1)
    # squared effect size; the original read an undefined global `diff` here
    delta2 = (mean1 - mean2) ** 2
    if k is None:
        # equal group sizes, equation 8.24; sample sizes are always rounded UP
        n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / delta2
        return ceil(n)
    else:
        # unequal group sizes with n2 = k * n1
        n1 = (std1**2 + std2**2 / k) * (z_alpha + z_power)**2 / delta2
        n2 = (k * std1**2 + std2**2) * (z_alpha + z_power)**2 / delta2
        return (n1, n2)
def power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=True):
    '''Power for Comparing the Means of Two Normally Distributed Samples
    Using a Significance Level α (Rosner Eq. 8.28).

    Parameters:
        mean1, mean2 -- {float} group means
        nobs1, nobs2 -- {int} group sizes
        std1, std2 -- {float} group standard deviations
        alpha -- {float} significance level
        two_sided -- {bool} whether the test is two-sided

    Returns:
        power {float} -- value between [0, 1]
    '''
    from scipy.stats import norm
    from numpy import sqrt

    if two_sided:
        z_alpha = norm.ppf(q=1-alpha/2, loc=0, scale=1)
    else:
        z_alpha = norm.ppf(q=1-alpha, loc=0, scale=1)
    # Eq. 8.28 uses the ABSOLUTE difference in means; the signed difference
    # drove power toward 0 whenever mean1 < mean2 (as in the cells below).
    effect = abs(mean1 - mean2) / sqrt((std1**2 / nobs1) + (std2**2 / nobs2))
    statistic = -1 * z_alpha + effect
    # standard normal CDF at the statistic gives the power
    # (use the locally imported norm; top-level `import scipy` does not
    # guarantee scipy.stats is loaded)
    power = norm.cdf(statistic)
    print("z_alpha=", z_alpha)
    print("statistic=", statistic)
    print("power=", power)
    return power
def ci_ind(a, b, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the difference in
    means (b - a) of two independent samples, assuming equal variances.

    Parameters:
        a -- {np.array}
        b -- {np.array}
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats  # consistent with ci_ind_from_stats; `import scipy` alone is not enough
    # independent samples need not have equal length, so no length assert
    n1 = len(a)
    n2 = len(b)
    # sample (ddof=1) standard deviations; np.std defaults to the biased
    # population estimate, which distorts the pooled variance
    s1 = np.std(a, ddof=1)
    s2 = np.std(b, ddof=1)
    # pooled variance estimate
    var = ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)
    std = np.sqrt(var)
    # point estimate of the difference in means
    diff = b.mean() - a.mean()
    dof = n1 + n2 - 2  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # standard error is pooled_std * sqrt(1/n1 + 1/n2);
    # the original DIVIDED by the sqrt term, inflating the interval
    se = std * np.sqrt((1 / n1) + (1 / n2))
    ci = (diff - t * se, diff + t * se)
    return diff, ci
def ci_ind_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the difference in
    means (mean1 - mean2) of two independent samples, from summary statistics,
    assuming equal variances.

    Parameters:
        mean1, mean2 -- {float} sample means
        nobs1, nobs2 -- {int} sample sizes
        std1, std2 -- {float} sample standard deviations
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats
    import numpy as np
    # point estimate of the difference in means
    diff = mean1 - mean2
    # pooled variance estimate
    var = ((nobs1 - 1) * std1**2 + (nobs2 - 1) * std2**2) / (nobs1 + nobs2 - 2)
    std = np.sqrt(var)
    dof = nobs1 + nobs2 - 2  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # standard error is pooled_std * sqrt(1/n1 + 1/n2); the original divided
    # by the sqrt term AND repeated nobs2 twice in the upper bound
    se = std * np.sqrt((1 / nobs1) + (1 / nobs2))
    ci = (diff - t * se, diff + t * se)
    return diff, ci
def std_paired(a, b):
    '''Sample standard deviation (n - 1 denominator) of the paired
    differences d = b - a.'''
    assert(len(a) == len(b))
    diffs = b - a
    # unbiased variance of the differences, then its square root
    return np.sqrt(((diffs - diffs.mean())**2).sum() / (len(diffs) - 1))
def ci_paired(a, b, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the mean of the
    paired differences d = b - a of two related (i.e. "paired") vectors.

    Parameters:
        a -- {np.array}
        b -- {np.array}
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats  # consistent with ci_ind_from_stats; `import scipy` alone is not enough
    # pairing requires the vectors to have the same length
    assert(len(a) == len(b))
    n = len(a)
    # mean of the paired differences
    mean_diff = (b - a).mean()
    # sample standard deviation of the paired differences
    s_d = std_paired(a, b)
    dof = n - 1  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    # two-sided critical value for the requested confidence level
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # (removed a stray debug print of the intermediate values)
    se = s_d / np.sqrt(n)
    ci = (mean_diff - t * se, mean_diff + t * se)
    return mean_diff, ci
Data for Problems 8.2 - 8.13
# Summary statistics shared by Problems 8.2 - 8.13.
# Presumably group 1 = girls below the poverty level and group 2 = above it
# -- TODO confirm the group labels against the textbook problem statement.
nobs1 = 25
mean1 = 6.56
std1 = 0.64
nobs2 = 40
mean2 = 6.8
std2 = 0.76
8.2 Test for a significant difference between the variances.
H0: var1 == var2
Ha: var1 != var2
from scipy.stats import f

# F statistic for the variance-ratio test: F = s1^2 / s2^2
f_statistic = std1**2 / std2**2
print(f_statistic)  # fixed: was print(statistic) -- a typo/NameError at this point
# parameters of the F distribution under H0
alpha = 0.05
dfn = nobs1 - 1  # numerator degrees of freedom
dfd = nobs2 - 1  # denominator degrees of freedom
loc = 0
scale = 1
The question is, what's the F statistic of the F distribution with those parameters (alpha/2 and 1- alpha/2)?
We want a function where we can input the probability (alpha) and get out the corresponding F-statistic... That means we want the PPF.
# critical values: PPF (inverse CDF) of the F distribution at alpha/2 and 1 - alpha/2
f_lower = f.ppf(alpha/2, dfn, dfd, loc, scale)
f_upper = f.ppf(1-(alpha/2), dfn, dfd, loc, scale)
print(f_lower, f_upper)
# True -> the statistic lies inside the acceptance region
print(f_lower < f_statistic < f_upper)
0.46491092633494036 2.0166484942971676 True
So our test statistic falls within the acceptance region of the F distribution. This means the observed variance ratio is consistent with the null hypothesis, so we fail to reject the null hypothesis that the two variances are equal (failing to reject is not the same as proving H0 true).
8.3 What is the appropriate procedure to test for a significant difference in means between the two groups?
Since we discovered that the two samples come from populations with equal variances, the Two samples independent t-test with equal variances is the most appropriate test here.
8.4 Implement the procedure in Problem 8.3 using the critical-value method.
8.5 What is the p-value corresponding to your answer to Problem 8.4?
# scipy's two-sample t-test helpers (from raw data and from summary stats)
from scipy.stats import ttest_ind, ttest_ind_from_stats
print(mean1, mean2, mean1 - mean2)
6.56 6.8 -0.2400000000000002
# equal-variance (pooled) two-sample t-test from the summary statistics
statistic, pvalue = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True)
print(statistic, pvalue)
-1.3135362295391815 0.19376598097353762
So the p value is 0.19, which means we don't have a significant difference in means between the two samples; we cannot conclude that they come from populations with distinct means (given a 5% significance threshold).
8.6 Compute a 95% CI for the difference in means between the two groups.
# 95% CI for the difference in means (Problem 8.6)
# NOTE(review): the printed interval below is implausibly wide for these
# statistics -- verify ci_ind_from_stats (its standard-error computation).
diff, ci = ci_ind_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.95)
print(diff, ci)
-0.2400000000000002 (-5.857270379229926, 6.164673642535278)
*8.7 Suppose an equal number of 12- to 14-year-old girls below and above the poverty level are recruited to study differences in calcium intake. How many girls should be recruited to have an 80% chance of detecting a significant difference using a two-sided test with α = .05?
Equation for computing the Sample Size Needed for Comparing the Means of Two Normally Distributed Samples of Equal Size Using a Two-Sided Test with Significance Level α and Power 1 − β:
$$n = \frac{(\sigma_1^2 + \sigma_2^2)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2}, \qquad \Delta = |\mu_1 - \mu_2| \quad \text{(Rosner Eq. 8.24)}$$
# problem setup: two-sided test, 80% power, alpha = 0.05
power = 0.8
alpha = 0.05
diff = mean1 - mean2  # anticipated difference in means (effect size)
print(diff)
-0.2400000000000002
from scipy.stats import norm
# two-sided critical value and the power quantile of the standard normal
z_alpha = norm.ppf(q=1-alpha/2, loc=0, scale=1)
z_power = norm.ppf(q=power, loc=0, scale=1)
# from equation 8.24
n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / diff**2
print(n)
So n ≈ 134.5, i.e. 135 girls per group should be recruited (sample sizes are always rounded up) to have an 80% chance of detecting an effect of this size, assuming there is one. Our earlier test found no significant difference between the means, though.
*8.8 Answer Problem 8.7 if a one-sided rather than a two- sided test is used.
# one-sided version: alpha is not halved, so the critical value is smaller
z_alpha = norm.ppf(q=1-alpha, loc=0, scale=1)
z_power = norm.ppf(q=power, loc=0, scale=1)
# from equation 8.24
n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / diff**2
print(n)
105.96216144878302
Intuitively it makes sense that we require fewer people: a two-sided test splits α across both tails, which yields a larger critical value, so it needs more subjects to achieve the same statistical power.
*8.9 Using a two-sided test with α = .05, answer Problem 8.7, anticipating that two girls above the poverty level will be recruited for every one girl below the poverty level who is recruited.
Sample Size Needed for Comparing the Means of Two Normally Distributed Samples of Unequal Size Using a Two-Sided Test with Significance Level α and Power 1 − β:
$$n_1 = \frac{(\sigma_1^2 + \sigma_2^2/k)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2}, \qquad n_2 = \frac{(k\,\sigma_1^2 + \sigma_2^2)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2} = k\,n_1$$
# Problem 8.9: two girls above the poverty level per girl below it,
# so the allocation ratio is k = n2/n1 = 2
k = 2
alpha = 0.05
power = 0.8
two_sided = True
n1, n2 = sample_size_from_stats(mean1, mean2, std1, std2, alpha=0.05, beta=0.2, two_sided=True, k=2)
print(n1, n2)
95.16766677898254 190.33533355796507
*8.10 Suppose 50 girls above the poverty level and 50 girls below the poverty level are recruited for the study. How much power will the study have of finding a significant difference using a two-sided test with α = .05, assuming that the population parameters are the same as the sample estimates in Problem 8.2?
# problem parameters
# NOTE(review): Problem 8.10 specifies 50 girls per group, but the original
# sample sizes (25 and 40) are reused here -- confirm which was intended.
nobs1 = 25
mean1 = 6.56
std1 = 0.64
nobs2 = 40
mean2 = 6.8
std2 = 0.76
power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=True)
z_alpha= 1.959963984540054 statistic= -3.326958410536717 power= 0.00043899738172182765
0.00043899738172182765
NOTE(review): this near-zero power comes from using the signed difference mean1 − mean2 (negative here) in the power formula; Rosner Eq. 8.28 uses the absolute difference, which gives a power of roughly 0.28 for these inputs. The value above is an artifact of the computation, not a property of the data.
*8.11 Answer Problem 8.10 assuming a one-sided rather than a two-sided test is used.
power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=False)
z_alpha= 1.6448536269514722 statistic= -3.0118480529481353 power= 0.001298312684353761
0.001298312684353761
*8.12 Suppose 50 girls above the poverty level and 25 girls below the poverty level are recruited for the study. How much power will the study have if a two-sided test is used with α = .05?
power = power_from_stats(mean1, 50, std1, mean2, 25, std2, alpha=0.05, two_sided=True)
z_alpha= 1.959963984540054 statistic= -3.316610879478459 power= 0.000455582119197921
*8.13 Answer Problem 8.12 assuming a one-sided test is used with α = .05.
power = power_from_stats(mean1, 50, std1, mean2, 25, std2, alpha=0.05, two_sided=False)
z_alpha= 1.6448536269514722 statistic= -3.001500521889877 power= 0.0013432628940264794