import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
os.makedirs('ipynb_export', exist_ok=True)
RNG = np.random.default_rng(seed=0xC0FFEE)
def sim_var_estimate(p, n):
x = RNG.binomial(1, p, size=n)
p_est = x.mean()
return p_est * (1 - p_est)
def many_var_estimates(p, n):
return np.array([sim_var_estimate(p, n) for i in range(2048)])
p = 0.5; n = 32
mle = many_var_estimates(p, n)
plt.hist(mle, range=(0,0.3), bins=200)
plt.title("Max Likelihood Estimator")
plt.savefig('ipynb_export/mle_p_half.png')
unbiased = n/(n-1) * mle
plt.hist(unbiased, range=(0,0.3), bins=200)
plt.title("Sample Variance")
plt.savefig('ipynb_export/unbiased_p_half.png')
def report(estimates, truth):
var = estimates.var()
ave_error = np.mean(estimates - truth)
median_error = np.median(estimates - truth)
pie = (estimates > 0.25).mean()
return dict(ave_error=ave_error,
median_error=median_error,
std_dev=np.sqrt(var),
sqrt_mse=np.sqrt(var + ave_error**2),
pie=pie)
def sweep_probs(step, n):
ret = pd.DataFrame()
for p in np.arange(0.0, 0.5+step, step):
mle = many_var_estimates(p, n)
unbiased = n/(n-1) * mle
truth = p * (1-p)
row = dict(prob=p)
for key, val in report(mle, truth).items():
row['mle.' + key] = val
for key, val in report(unbiased, truth).items():
row['unbiased.' + key] = val
ret = ret.append(row, ignore_index=True)
return ret
PERF = dict()
for n in [5, 20, 100]:
PERF[n] = sweep_probs(1/32, n)
fig, ax1 = plt.subplots()
ax1.set_xlabel("True p of Bernoulli distribution")
ax1.set_ylabel('Probability of Impossible Estimand (PIE)')
for N, df in PERF.items():
ax1.plot('prob', 'unbiased.pie', data=df, label="n={}".format(N))
plt.legend()
plt.title('PIE of sample variance')
plt.savefig('ipynb_export/unbiased_pie.png')
def plot_perf(df, fname=None, pie=True, title=None):
fig, ax1 = plt.subplots()
ax1.set_xlabel("True p of Bernoulli distribution")
ax1.set_ylabel('units of estimand')
ax1.plot('prob', 'mle.ave_error', data=df, linestyle='dashed', color='lightblue')
ax1.plot('prob', 'mle.sqrt_mse', data=df, linestyle='dashed', color='red')
ax1.plot('prob', 'unbiased.ave_error', data=df, color='lightblue')
ax1.plot('prob', 'unbiased.sqrt_mse', data=df, color='red')
leg1 = plt.legend(bbox_to_anchor=(1.14,1), loc="upper left")
xtras = (leg1,)
if pie:
ax2 = ax1.twinx()
ax2.set_ylabel('Probability of Impossible Estimand (PIE)')
ax2.plot('prob', 'unbiased.pie', data=df, color='black')
leg2 = plt.legend(bbox_to_anchor=(1.14,0.5), loc="center left")
xtras += (leg2,)
if title:
t = plt.title(title)
if fname:
plt.savefig(fname, bbox_extra_artists=xtras, bbox_inches='tight')
plot_perf(PERF[5], 'ipynb_export/perf5.png', pie=False, title="Unbiased vs biased at n=5")
plot_perf(PERF[20], 'ipynb_export/perf20.png', title="Unbiased vs biased at n=20")
plot_perf(PERF[100])
p = 0.5; n = 5
true_var = p * (1-p)
mle = many_var_estimates(p, n)
unbiased = n/(n-1) * mle
report(mle, true_var)
{'ave_error': -0.04968750000000001, 'median_error': -0.010000000000000009, 'std_dev': 0.06219849149095177, 'sqrt_mse': 0.07960841664045329, 'pie': 0.0}
report(unbiased, true_var)
{'ave_error': 0.00039062499999999335, 'median_error': 0.04999999999999999, 'std_dev': 0.07774811436368971, 'sqrt_mse': 0.07774909565390455, 'pie': 0.6220703125}