<!DOCTYPE html>
import pandas as pd
import io
import requests
from IPython.core.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import scipy
%matplotlib inline
pd.options.display.max_columns = 50
def sample_size_from_stats(mean1, mean2, std1, std2, alpha=0.05, beta=0.2, two_sided=True, k=None):
    '''Sample size needed for comparing the means of two normally distributed
    samples with significance level alpha and power 1 - beta (Rosner Eq. 8.24/8.25).

    Parameters:
        mean1, mean2 -- {float} anticipated group means
        std1, std2 -- {float} anticipated group standard deviations
        alpha -- {float} type-I error rate
        beta -- {float} type-II error rate (power = 1 - beta)
        two_sided -- {bool} whether the test is two-sided
        k -- {float or None} ratio n2/n1; None means equal group sizes

    Returns:
        n {int} -- per-group size (rounded up) when k is None,
        otherwise (n1, n2) {tuple of float}.
    '''
    from math import ceil
    from scipy.stats import norm  # was relying on a notebook-global `norm`

    power = 1 - beta
    if two_sided:
        z_alpha = norm.ppf(q=1 - alpha / 2, loc=0, scale=1)
    else:
        z_alpha = norm.ppf(q=1 - alpha, loc=0, scale=1)
    z_power = norm.ppf(q=power, loc=0, scale=1)
    # squared effect size; the original read an undefined global `diff` here
    delta2 = (mean1 - mean2) ** 2
    if k is None:
        # equal group sizes, equation 8.24; sample sizes are always rounded UP
        n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / delta2
        return ceil(n)
    else:
        # unequal group sizes with n2 = k * n1
        n1 = (std1**2 + std2**2 / k) * (z_alpha + z_power)**2 / delta2
        n2 = (k * std1**2 + std2**2) * (z_alpha + z_power)**2 / delta2
        return (n1, n2)
def power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=True):
    '''Power for Comparing the Means of Two Normally Distributed Samples
    Using a Significance Level α (Rosner Eq. 8.28).

    Parameters:
        mean1, mean2 -- {float} group means
        nobs1, nobs2 -- {int} group sizes
        std1, std2 -- {float} group standard deviations
        alpha -- {float} significance level
        two_sided -- {bool} whether the test is two-sided

    Returns:
        power {float} -- value between [0, 1]
    '''
    from scipy.stats import norm
    from numpy import sqrt

    if two_sided:
        z_alpha = norm.ppf(q=1-alpha/2, loc=0, scale=1)
    else:
        z_alpha = norm.ppf(q=1-alpha, loc=0, scale=1)
    # Eq. 8.28 uses the ABSOLUTE difference in means; the signed difference
    # drove power toward 0 whenever mean1 < mean2 (as in the cells below).
    effect = abs(mean1 - mean2) / sqrt((std1**2 / nobs1) + (std2**2 / nobs2))
    statistic = -1 * z_alpha + effect
    # standard normal CDF at the statistic gives the power
    # (use the locally imported norm; top-level `import scipy` does not
    # guarantee scipy.stats is loaded)
    power = norm.cdf(statistic)
    print("z_alpha=", z_alpha)
    print("statistic=", statistic)
    print("power=", power)
    return power
def ci_ind(a, b, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the difference in
    means (b - a) of two independent samples, assuming equal variances.

    Parameters:
        a -- {np.array}
        b -- {np.array}
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats  # consistent with ci_ind_from_stats; `import scipy` alone is not enough
    # independent samples need not have equal length, so no length assert
    n1 = len(a)
    n2 = len(b)
    # sample (ddof=1) standard deviations; np.std defaults to the biased
    # population estimate, which distorts the pooled variance
    s1 = np.std(a, ddof=1)
    s2 = np.std(b, ddof=1)
    # pooled variance estimate
    var = ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)
    std = np.sqrt(var)
    # point estimate of the difference in means
    diff = b.mean() - a.mean()
    dof = n1 + n2 - 2  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # standard error is pooled_std * sqrt(1/n1 + 1/n2);
    # the original DIVIDED by the sqrt term, inflating the interval
    se = std * np.sqrt((1 / n1) + (1 / n2))
    ci = (diff - t * se, diff + t * se)
    return diff, ci
def ci_ind_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the difference in
    means (mean1 - mean2) of two independent samples, from summary statistics,
    assuming equal variances.

    Parameters:
        mean1, mean2 -- {float} sample means
        nobs1, nobs2 -- {int} sample sizes
        std1, std2 -- {float} sample standard deviations
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats
    import numpy as np
    # point estimate of the difference in means
    diff = mean1 - mean2
    # pooled variance estimate
    var = ((nobs1 - 1) * std1**2 + (nobs2 - 1) * std2**2) / (nobs1 + nobs2 - 2)
    std = np.sqrt(var)
    dof = nobs1 + nobs2 - 2  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # standard error is pooled_std * sqrt(1/n1 + 1/n2); the original divided
    # by the sqrt term AND repeated nobs2 twice in the upper bound
    se = std * np.sqrt((1 / nobs1) + (1 / nobs2))
    ci = (diff - t * se, diff + t * se)
    return diff, ci
def std_paired(a, b):
    '''Sample standard deviation (n - 1 denominator) of the paired
    differences d = b - a.'''
    assert(len(a) == len(b))
    diffs = b - a
    # unbiased variance of the differences, then its square root
    return np.sqrt(((diffs - diffs.mean())**2).sum() / (len(diffs) - 1))
def ci_paired(a, b, alpha=0.95):
    '''Confidence interval at confidence level {alpha} for the mean of the
    paired differences d = b - a of two related (i.e. "paired") vectors.

    Parameters:
        a -- {np.array}
        b -- {np.array}
        alpha -- {float} confidence level, e.g. 0.95

    Returns:
        mean_diff -- {float}
        ci -- {tuple}
    '''
    import scipy.stats  # consistent with ci_ind_from_stats; `import scipy` alone is not enough
    # pairing requires the vectors to have the same length
    assert(len(a) == len(b))
    n = len(a)
    # mean of the paired differences
    mean_diff = (b - a).mean()
    # sample standard deviation of the paired differences
    s_d = std_paired(a, b)
    dof = n - 1  # degrees of freedom
    rv = scipy.stats.t(dof)  # Student's t continuous random variable
    # two-sided critical value for the requested confidence level
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # (removed a stray debug print of the intermediate values)
    se = s_d / np.sqrt(n)
    ci = (mean_diff - t * se, mean_diff + t * se)
    return mean_diff, ci
Data for Problems 8.2 - 8.13
# Summary statistics shared by Problems 8.2 - 8.13.
# Presumably group 1 = girls below the poverty level and group 2 = above it
# -- TODO confirm the group labels against the textbook problem statement.
nobs1 = 25
mean1 = 6.56
std1 = 0.64
nobs2 = 40
mean2 = 6.8
std2 = 0.76
8.2 Test for a significant difference between the variances.
H0: var1 == var2
Ha: var1 != var2
from scipy.stats import f

# F statistic for the variance-ratio test: F = s1^2 / s2^2
f_statistic = std1**2 / std2**2
print(f_statistic)  # fixed: was print(statistic) -- a typo/NameError at this point
# parameters of the F distribution under H0
alpha = 0.05
dfn = nobs1 - 1  # numerator degrees of freedom
dfd = nobs2 - 1  # denominator degrees of freedom
loc = 0
scale = 1
The question is, what's the F statistic of the F distribution with those parameters (alpha/2 and 1- alpha/2)?
We want a function where we can input the probability (alpha) and get out the corresponding F-statistic... That means we want the PPF.
# critical values: PPF (inverse CDF) of the F distribution at alpha/2 and 1 - alpha/2
f_lower = f.ppf(alpha/2, dfn, dfd, loc, scale)
f_upper = f.ppf(1-(alpha/2), dfn, dfd, loc, scale)
print(f_lower, f_upper)
# True -> the statistic lies inside the acceptance region
print(f_lower < f_statistic < f_upper)
0.46491092633494036 2.0166484942971676 True
So our test statistic falls within the acceptance region of the F distribution. This means the observed variance ratio is consistent with the null hypothesis, so we fail to reject the null hypothesis that the two variances are equal (failing to reject is not the same as proving H0 true).
8.3 What is the appropriate procedure to test for a significant difference in means between the two groups?
Since we discovered that the two samples come from populations with equal variances, the Two samples independent t-test with equal variances is the most appropriate test here.
8.4 Implement the procedure in Problem 8.3 using the critical-value method.
8.5 What is the p-value corresponding to your answer to Problem 8.4?
# scipy's two-sample t-test helpers (from raw data and from summary stats)
from scipy.stats import ttest_ind, ttest_ind_from_stats
print(mean1, mean2, mean1 - mean2)
6.56 6.8 -0.2400000000000002
# equal-variance (pooled) two-sample t-test from the summary statistics
statistic, pvalue = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True)
print(statistic, pvalue)
-1.3135362295391815 0.19376598097353762
So the p value is 0.19, which means we don't have a significant difference in means between the two samples; we cannot conclude that they come from populations with distinct means (given a 5% significance threshold).
8.6 Compute a 95% CI for the difference in means between the two groups.
# 95% CI for the difference in means (Problem 8.6)
# NOTE(review): the printed interval below is implausibly wide for these
# statistics -- verify ci_ind_from_stats (its standard-error computation).
diff, ci = ci_ind_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.95)
print(diff, ci)
-0.2400000000000002 (-5.857270379229926, 6.164673642535278)
*8.7 Suppose an equal number of 12- to 14-year-old girls below and above the poverty level are recruited to study differences in calcium intake. How many girls should be recruited to have an 80% chance of detecting a significant difference using a two-sided test with α = .05?
Equation for computing the Sample Size Needed for Comparing the Means of Two Normally Distributed Samples of Equal Size Using a Two-Sided Test with Significance Level α and Power 1 − β:
$$n = \frac{(\sigma_1^2 + \sigma_2^2)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2}, \qquad \Delta = |\mu_1 - \mu_2| \quad \text{(Rosner Eq. 8.24)}$$
# problem setup: two-sided test, 80% power, alpha = 0.05
power = 0.8
alpha = 0.05
diff = mean1 - mean2  # anticipated difference in means (effect size)
print(diff)
-0.2400000000000002
from scipy.stats import norm
# two-sided critical value and the power quantile of the standard normal
z_alpha = norm.ppf(q=1-alpha/2, loc=0, scale=1)
z_power = norm.ppf(q=power, loc=0, scale=1)
# from equation 8.24
n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / diff**2
print(n)
So n ≈ 134.5, i.e. 135 girls per group should be recruited (sample sizes are always rounded up) to have an 80% chance of detecting an effect of this size, assuming there is one. Our earlier test found no significant difference between the means, though.
*8.8 Answer Problem 8.7 if a one-sided rather than a two- sided test is used.
# one-sided version: alpha is not halved, so the critical value is smaller
z_alpha = norm.ppf(q=1-alpha, loc=0, scale=1)
z_power = norm.ppf(q=power, loc=0, scale=1)
# from equation 8.24
n = (std1**2 + std2**2) * (z_alpha + z_power)**2 / diff**2
print(n)
105.96216144878302
Intuitively it makes sense that we require fewer people: a two-sided test splits α across both tails, which yields a larger critical value, so it needs more subjects to achieve the same statistical power.
*8.9 Using a two-sided test with α = .05, answer Problem 8.7, anticipating that two girls above the poverty level will be recruited for every one girl below the poverty level who is recruited.
Sample Size Needed for Comparing the Means of Two Normally Distributed Samples of Unequal Size Using a Two-Sided Test with Significance Level α and Power 1 − β:
$$n_1 = \frac{(\sigma_1^2 + \sigma_2^2/k)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2}, \qquad n_2 = \frac{(k\,\sigma_1^2 + \sigma_2^2)\,(z_{1-\alpha/2} + z_{1-\beta})^2}{\Delta^2} = k\,n_1$$
# Problem 8.9: two girls above the poverty level per girl below it,
# so the allocation ratio is k = n2/n1 = 2
k = 2
alpha = 0.05
power = 0.8
two_sided = True
n1, n2 = sample_size_from_stats(mean1, mean2, std1, std2, alpha=0.05, beta=0.2, two_sided=True, k=2)
print(n1, n2)
95.16766677898254 190.33533355796507
*8.10 Suppose 50 girls above the poverty level and 50 girls below the poverty level are recruited for the study. How much power will the study have of finding a significant difference using a two-sided test with α = .05, assuming that the population parameters are the same as the sample estimates in Problem 8.2?
# problem parameters
# NOTE(review): Problem 8.10 specifies 50 girls per group, but the original
# sample sizes (25 and 40) are reused here -- confirm which was intended.
nobs1 = 25
mean1 = 6.56
std1 = 0.64
nobs2 = 40
mean2 = 6.8
std2 = 0.76
power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=True)
z_alpha= 1.959963984540054 statistic= -3.326958410536717 power= 0.00043899738172182765
0.00043899738172182765
NOTE(review): this near-zero power comes from using the signed difference mean1 − mean2 (negative here) in the power formula; Rosner Eq. 8.28 uses the absolute difference, which gives a power of roughly 0.28 for these inputs. The value above is an artifact of the computation, not a property of the data.
*8.11 Answer Problem 8.10 assuming a one-sided rather than a two-sided test is used.
power_from_stats(mean1, nobs1, std1, mean2, nobs2, std2, alpha=0.05, two_sided=False)
z_alpha= 1.6448536269514722 statistic= -3.0118480529481353 power= 0.001298312684353761
0.001298312684353761
*8.12 Suppose 50 girls above the poverty level and 25 girls below the poverty level are recruited for the study. How much power will the study have if a two-sided test is used with α = .05?
power = power_from_stats(mean1, 50, std1, mean2, 25, std2, alpha=0.05, two_sided=True)
z_alpha= 1.959963984540054 statistic= -3.316610879478459 power= 0.000455582119197921
*8.13 Answer Problem 8.12 assuming a one-sided test is used with α = .05.
power = power_from_stats(mean1, 50, std1, mean2, 25, std2, alpha=0.05, two_sided=False)
z_alpha= 1.6448536269514722 statistic= -3.001500521889877 power= 0.0013432628940264794