In this notebook we'll go over some of the theory and mechanics of calculating p-values and confidence intervals for hypothesis testing in the two-sample case involving continuous (cardinal) data.
import pandas as pd
import io
import requests
from IPython.display import display, HTML
import numpy as np
import matplotlib.pyplot as plt
import scipy
%matplotlib inline
pd.options.display.max_columns = 50
From Section 8.2 of Fundamentals of Biostatistics by Bernard Rosner, 8th Edition
x1 = np.array([115,112,107,119,115,138,126,105,104,115])
x2 = np.array([128,115,106,128,122,145,132,109,102,117])
diff = x2 - x1
print(diff)
[13 3 -1 9 7 7 6 4 -2 2]
# note: np.std defaults to the population standard deviation (ddof=0);
# pass ddof=1 for the sample standard deviation
print(np.std(x1), np.std(x2))
9.77957054271812 12.547509713086498
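One caveat worth flagging (my own aside, not from the text): `np.std` divides by n by default, which gives the population standard deviation; the sample standard deviation divides by n − 1. A quick demonstration:

```python
import numpy as np

x1 = np.array([115, 112, 107, 119, 115, 138, 126, 105, 104, 115])

pop_sd = np.std(x1)             # divides by n (ddof=0, the default)
sample_sd = np.std(x1, ddof=1)  # divides by n - 1, the usual sample estimator
print(pop_sd, sample_sd)

# the sample estimator is always slightly larger for finite n
assert sample_sd > pop_sd
```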
from scipy.stats import ttest_rel
For a paired t-test
H0: ∆ = 0
H1: ∆ != 0
Perform a two-sided t test
statistic, pvalue = ttest_rel(x1, x2)
print(statistic, pvalue)
-3.324651095085193 0.008874336881492044
p < 0.05, so at the 5% significance level we can conclude that the two (related) samples come from different populations, and that, in this case, the oral contraceptive does indeed seem to affect blood pressure levels. In other words, we reject the null hypothesis in favor of the alternative.
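A useful identity worth checking here (my own sanity check, not from the text): a paired t-test on x1 and x2 is exactly a one-sample t-test on the vector of differences against a population mean of 0.

```python
import numpy as np
from scipy.stats import ttest_rel, ttest_1samp

x1 = np.array([115, 112, 107, 119, 115, 138, 126, 105, 104, 115])
x2 = np.array([128, 115, 106, 128, 122, 145, 132, 109, 102, 117])

# paired t-test vs. one-sample t-test on the differences
t_rel, p_rel = ttest_rel(x2, x1)
t_1s, p_1s = ttest_1samp(x2 - x1, 0)

# the two formulations give identical statistics and p-values
assert np.isclose(t_rel, t_1s) and np.isclose(p_rel, p_1s)
```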
(x1 - x2).mean()
-4.8
So (x1 - x2).mean() is -4.8: the average blood pressure increased by 4.8 units from sample x1 to sample x2.
len(x1)
10
Even though our sample size is only 10, if the underlying random variable for the blood pressure of this population of women that the sample is taken from is normally distributed, or if our sample size is large enough that the CLT holds, then we can use this t test.
I'm assuming that Dr. Rosner is assuming that the underlying random variable is normally distributed, since our sample size is only 10 here...
From page 282 of Fundamentals of Biostatistics, 8th Edition, by Bernard Rosner:
95% CI = $(\hat{d} − t_{n−1,1−α/2} s_d/\sqrt n, \hat{d} + t_{n−1,1−α/2} s_d/\sqrt n)$
$s_{d}=\sqrt{\sum_{i=1}^{n} (d_{i}-\hat{d})^{2} /(n-1)}$
def std_paired(a, b):
    '''Sample standard deviation of the paired differences b - a.'''
    assert(len(a) == len(b))
    n = len(a)
    d = b - a
    var = np.sum((d - d.mean())**2) / (n - 1)
    std = np.sqrt(var)
    return std
# calculate the sample standard deviation of the paired data
std_paired(x1, x2)
4.565571644870382
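As a quick cross-check (my own addition), s_d is just the sample standard deviation of the difference vector, so `np.std` with `ddof=1` should agree with `std_paired`:

```python
import numpy as np

x1 = np.array([115, 112, 107, 119, 115, 138, 126, 105, 104, 115])
x2 = np.array([128, 115, 106, 128, 122, 145, 132, 109, 102, 117])

# s_d is the sample standard deviation (ddof=1) of the differences
d = x2 - x1
print(np.std(d, ddof=1))  # ~4.5656, matching std_paired(x1, x2)
```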
def ci_paired(a, b, alpha=0.95):
    '''Calculates the confidence interval corresponding to confidence level {alpha} for two related (i.e. "paired")
    vectors a and b.
    Parameters:
    a -- {np.array}
    b -- {np.array}
    alpha -- {float} confidence level, e.g. 0.95 for a 95% CI
    Returns:
    mean_diff -- {float}
    ci -- {tuple}
    '''
    # make sure input vectors are the same length
    assert(len(a) == len(b))
    n = len(a)
    # calculate mean difference
    mean_diff = (b - a).mean()
    # calculate standard deviation of paired samples
    s_d = std_paired(a, b)
    # generate a t distributed random variable
    dof = n - 1 # calculate degrees of freedom
    rv = scipy.stats.t(dof) # instantiate a Student's t continuous random variable
    # get the t statistic for the degrees of freedom and specified confidence level
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # calculate ci
    print(mean_diff, t, s_d, np.sqrt(n))
    ci = (mean_diff - t * (s_d / np.sqrt(n)), mean_diff + t * (s_d / np.sqrt(n)))
    return mean_diff, ci
ci_paired(x1, x2, alpha=0.95)
4.8 2.2621571627409915 4.565571644870382 3.1622776601683795
(4.8, (1.5339867942207275, 8.066013205779273))
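As a sanity check on `ci_paired` (again my own addition, not from the text), scipy can build the same interval directly: `scipy.stats.t.interval` with the mean difference as `loc` and the standard error s_d/√n as `scale`:

```python
import numpy as np
from scipy import stats

x1 = np.array([115, 112, 107, 119, 115, 138, 126, 105, 104, 115])
x2 = np.array([128, 115, 106, 128, 122, 145, 132, 109, 102, 117])
d = x2 - x1
n = len(d)

# stats.sem computes the standard error s_d / sqrt(n) (ddof=1 by default)
lo, hi = stats.t.interval(0.95, df=n - 1, loc=d.mean(), scale=stats.sem(d))
print(lo, hi)  # matches ci_paired's (1.534, 8.066)
```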
From Section 8.4 of Fundamentals of Biostatistics by Bernard Rosner, 8th Edition
Hypertension Suppose a sample of eight 35- to 39-year-old nonpregnant, premenopausal OC users who work in a company and have a mean systolic blood pressure (SBP) of 132.86 mm Hg and sample standard deviation of 15.34 mm Hg are identified. A sample of 21 nonpregnant, premenopausal, non-OC users in the same age group are similarly identified who have mean SBP of 127.44 mm Hg and sample standard deviation of 18.23 mm Hg. What can be said about the underlying mean difference in blood pressure between the two groups?
Assume SBP is normally distributed in the first group with mean $\mu_1$ and variance $\sigma_1^2$ and in the second group with mean $\mu_2$ and variance $\sigma_2^2$. We want to test the hypothesis $H_0: \mu_1 = \mu_2$ vs. $H_1: \mu_1 \neq \mu_2$. Assume in this section that the underlying variances in the two groups are the same (that is, $\sigma_1^2 = \sigma_2^2 = \sigma^2$). The means and variances in the two samples are denoted by $\bar{x}_1$, $\bar{x}_2$, $s_1^2$, $s_2^2$, respectively.
from scipy.stats import ttest_ind, ttest_ind_from_stats
# sample data
mean1 = 132.86
std1 = 15.34
nobs1 = 8
mean2 = 127.44
std2 = 18.23
nobs2 = 21
print(mean2 - mean1)
-5.420000000000016
statistic, pvalue = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
print(statistic, pvalue)
0.7443175718105018 0.46311371461667283
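Worth noting (my own aside): `ttest_ind_from_stats` assumes equal variances by default, which matches Rosner's assumption in this section; passing `equal_var=False` gives Welch's t-test instead, which drops that assumption. A quick comparison on the same summary statistics:

```python
from scipy.stats import ttest_ind_from_stats

# pooled (equal-variance) test, as in the text
t_pooled, p_pooled = ttest_ind_from_stats(132.86, 15.34, 8, 127.44, 18.23, 21)

# Welch's test, which does not assume equal variances
t_welch, p_welch = ttest_ind_from_stats(132.86, 15.34, 8, 127.44, 18.23, 21,
                                        equal_var=False)
print(p_pooled, p_welch)  # both well above 0.05 for this example
```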
Even though the mean differences between the two samples in the two examples are similar, we get very different p-values - the paired t-test example rejects H0, whereas the independent t-test example fails to reject H0.
What this illustrates for me is that the paired t-test is much more sensitive to differences than the independent t-test. In other words, you need a much larger difference between two independent samples to detect an effect than you would if the samples were paired. This makes intuitive sense because for paired data, we've (supposedly) accounted for many confounding variables, so we can attribute much more of the variance in the measurement of interest between the two (paired) samples to the intervention.
Estimating the population variance by combining the sample variances - the pooled variance is a weighted average (not a simple sum) of the individual variances, weighted by each sample's degrees of freedom:
pooled estimate of the variance (from page 287 of Fundamentals of Biostatistics):
$$s^{2}=\frac{\left(n_{1}-1\right) s_{1}^{2}+\left(n_{2}-1\right) s_{2}^{2}}{n_{1}+n_{2}-2}$$
take the square root to get the combined standard deviation:
$$s = \sqrt{ \frac{\left(n_{1}-1\right) s_{1}^{2}+\left(n_{2}-1\right) s_{2}^{2}}{n_{1}+n_{2}-2}}$$
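Plugging the summary statistics from the example above into this formula (a quick sketch of my own, not from the text):

```python
import numpy as np

# summary statistics from Rosner's Example 8.4
n1, s1 = 8, 15.34    # OC users
n2, s2 = 21, 18.23   # non-OC users

# pooled variance: weighted average of the sample variances
pooled_var = ((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2)
s = np.sqrt(pooled_var)
print(s)  # ~17.53
```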
As usual, the CI is calculated by taking the estimate (in this case the difference in means from the two samples) and adding/subtracting the t statistic multiplied by the standard error of the difference. (for a full derivation, see page 287 from the text).
95% CI = $(\hat{x}_1 - \hat{x}_2 − t_{n_1+n_2-2,1−α/2} \cdot s\sqrt{1/n_1 + 1/n_2},\ \hat{x}_1 - \hat{x}_2 + t_{n_1+n_2-2,1−α/2} \cdot s\sqrt{1/n_1 + 1/n_2})$
def ci_ind(a, b, alpha=0.95):
    '''Calculates the confidence interval corresponding to confidence level {alpha} for two independent
    samples a and b.
    Parameters:
    a -- {np.array}
    b -- {np.array}
    alpha -- {float} confidence level, e.g. 0.95 for a 95% CI
    Returns:
    mean_diff -- {float}
    ci -- {tuple}
    '''
    # independent samples need not be the same length
    n1 = len(a)
    n2 = len(b)
    # sample standard deviations (ddof=1 divides by n - 1)
    s1 = np.std(a, ddof=1)
    s2 = np.std(b, ddof=1)
    # pooled variance: weighted average of the sample variances
    var = ((n1 - 1)*s1**2 + (n2 - 1)*s2**2) / (n1 + n2 - 2)
    std = np.sqrt(var)
    # calculate mean difference
    diff = b.mean() - a.mean()
    # generate a t distributed random variable
    dof = n1 + n2 - 2 # calculate degrees of freedom
    rv = scipy.stats.t(dof) # instantiate a Student's t continuous random variable
    t = np.abs(rv.ppf((1 - alpha) / 2))
    # calculate ci; the standard error is s * sqrt(1/n1 + 1/n2), so we multiply by the root
    half_width = t * std * np.sqrt((1 / n1) + (1 / n2))
    ci = (diff - half_width, diff + half_width)
    return diff, ci
ci_ind(x1, x2)
(4.8, (-6.34, 15.94))
As expected, our CI is much wider than the paired one and in this case includes 0, reaffirming our non-significant result (based on the p-value from our independent t-test).
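For completeness, the same pooled-variance CI can be computed directly from the summary statistics of the Rosner Section 8.4 example (my own sketch; the endpoints below come from this calculation, not quoted from the text):

```python
import numpy as np
from scipy import stats

# summary statistics from Rosner's Example 8.4
n1, mean1, s1 = 8, 132.86, 15.34    # OC users
n2, mean2, s2 = 21, 127.44, 18.23   # non-OC users

# pooled standard deviation
s = np.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
diff = mean1 - mean2

# 95% CI: diff +/- t * s * sqrt(1/n1 + 1/n2)
t = stats.t.ppf(0.975, n1 + n2 - 2)
half = t * s * np.sqrt(1 / n1 + 1 / n2)
print(diff - half, diff + half)  # roughly (-9.52, 20.36); the interval contains 0
```

The interval containing 0 agrees with the non-significant p-value from `ttest_ind_from_stats` above.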