Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
A Practical Introduction to Data Science 2025
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Dokmanic-courses
A Practical Introduction to Data Science 2025
Commits
9f99473e
Commit
9f99473e
authored
2 months ago
by
Fabian Kruse
Browse files
Options
Downloads
Patches
Plain Diff
add lecture08 notebook
parent
224980b5
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
lecture/lecture08/8_confidence-intervals.ipynb
+265
-0
265 additions, 0 deletions
lecture/lecture08/8_confidence-intervals.ipynb
with
265 additions
and
0 deletions
lecture/lecture08/8_confidence-intervals.ipynb
0 → 100644
+
265
−
0
View file @
9f99473e
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "3735b0ad",
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import norm\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "205c2aa6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Estimated proportion: 0.656\n",
"Probability that we are within 0.01 of the true estimate: 0.095\n",
"Confidence interval: [0.492, 0.821]\n"
]
}
],
"source": [
"p = 0.48\n",
"N = 32\n",
"\n",
"X = np.random.binomial(1, p, N)\n",
"\n",
"p_hat = X.mean() # Our estimate\n",
"se_hat = np.sqrt(p_hat * (1 - p_hat) / N) # The standard deviation of our estimate, aka standard error\n",
"std = X.std() # The standard deviation of the samples\n",
"\n",
"print(\"Estimated proportion: %.3f\" % p_hat)\n",
"print(\"Probability that we are within 0.01 of the true estimate: %.3f\" % (norm.cdf(0.01/se_hat) - norm.cdf(-0.01/se_hat)))\n",
"\n",
"print(\"Confidence interval: [%.3f, %.3f]\" % (p_hat - 1.96*se_hat, p_hat + 1.96*se_hat))"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "88dc1054",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9285"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n_mc = 10000\n",
"n_inside = 0\n",
"\n",
"for i_mc in range(n_mc):\n",
" X = np.random.binomial(1, p, N)\n",
"\n",
" p_hat = X.mean() # Our estimate\n",
" se_hat = np.sqrt(p_hat * (1 - p_hat) / N) # The standard deviation of our estimate, aka standard error\n",
"\n",
" p_is_inside = (p_hat - 1.96*se_hat <= p) & (p_hat + 1.96*se_hat >= p)\n",
" n_inside += p_is_inside\n",
" \n",
"n_inside / n_mc"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "1e9337d0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9297"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"\n",
"n_boots = 10000\n",
"\n",
"n_mc = 10000\n",
"n_inside = 0\n",
"\n",
"for i_mc in range(n_mc):\n",
" X = np.random.binomial(1, p, N)\n",
"\n",
" boot_means = np.reshape(random.choices(X, k=N*n_boots), (N, n_boots)).mean(axis=0)\n",
"\n",
" # equal-tailed 95% percentile bootstrap interval: 2.5th and 97.5th percentiles of the bootstrap means\n",
"\n",
" # idx_sorted_boot_means = np.argsort(boot_means)\n",
" # print(boot_means[:20])\n",
" # a = boot_means[idx_sorted_boot_means[int((0.025 * n_boots))]]\n",
" # b = boot_means[idx_sorted_boot_means[int((0.975 * n_boots))]]\n",
" a, b = np.percentile(boot_means, [2.5, 97.5]) \n",
"\n",
" # print(a, b)\n",
"\n",
" p_is_inside = (a <= p) & (b >= p)\n",
" n_inside += p_is_inside\n",
"\n",
"n_inside / n_mc"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "23097d5b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5000,)"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"boot_means.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "087e1a25",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.10100000000000008"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pm"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "de6a3e44",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.046\n"
]
},
{
"data": {
"text/plain": [
"(0.47, 0.53)"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"p = 0.49\n",
"N = 10000\n",
"sd_null = 0.5\n",
"p_null = 0.5\n",
"\n",
"t = np.sqrt(N) * np.abs(p_null - p) / sd_null\n",
"print(\"%5.3f\" % (1 - (norm.cdf(t) - norm.cdf(-t))))\n",
"\n",
"fig, ax = plt.subplots()\n",
"\n",
"sampdist = lambda u : 1/np.sqrt(2*np.pi*sd_null**2/N) * np.exp(-(u - p_null)**2 / (2 * sd_null**2/N))\n",
"u_list = np.linspace(0, 1, 10000)\n",
"u_extreme_l = np.linspace(0, p_null - np.abs(p_null - p), 10000)\n",
"u_extreme_r = np.flip(1 - u_extreme_l)\n",
"\n",
"ax.plot(u_list, sampdist(u_list), 'k', linewidth=2)\n",
"for pval in [p, 1 - p, p_null]:\n",
" ax.vlines(pval, 0, sampdist(pval), 'k', linewidth=0.75)\n",
"ax.hlines(0, 0, 1, 'k')\n",
"ax.fill_between(u_extreme_l, sampdist(u_extreme_l), step=\"pre\", alpha=0.4)\n",
"ax.fill_between(u_extreme_r, sampdist(u_extreme_r), step=\"pre\", alpha=0.4)\n",
"ax.set_xlim(0.47, 0.53)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d170999f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"vscode": {
"interpreter": {
"hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:3735b0ad tags:
```
python
from
scipy.stats
import
norm
import
numpy
as
np
import
seaborn
as
sns
import
matplotlib.pyplot
as
plt
```
%% Cell type:code id:205c2aa6 tags:
```
python
p
=
0.48
N
=
32
X
=
np
.
random
.
binomial
(
1
,
p
,
N
)
p_hat
=
X
.
mean
()
# Our estimate
se_hat
=
np
.
sqrt
(
p_hat
*
(
1
-
p_hat
)
/
N
)
# The standard deviation of our estimate, aka standard error
std
=
X
.
std
()
# The standard deviation of the samples
print
(
"
Estimated proportion: %.3f
"
%
p_hat
)
print
(
"
Probability that we are within 0.01 of the true estimate: %.3f
"
%
(
norm
.
cdf
(
0.01
/
se_hat
)
-
norm
.
cdf
(
-
0.01
/
se_hat
)))
print
(
"
Confidence interval: [%.3f, %.3f]
"
%
(
p_hat
-
1.96
*
se_hat
,
p_hat
+
1.96
*
se_hat
))
```
%% Output
Estimated proportion: 0.656
Probability that we are within 0.01 of the true estimate: 0.095
Confidence interval: [0.492, 0.821]
%% Cell type:code id:88dc1054 tags:
```
python
n_mc
=
10000
n_inside
=
0
for
i_mc
in
range
(
n_mc
):
X
=
np
.
random
.
binomial
(
1
,
p
,
N
)
p_hat
=
X
.
mean
()
# Our estimate
se_hat
=
np
.
sqrt
(
p_hat
*
(
1
-
p_hat
)
/
N
)
# The standard deviation of our estimate, aka standard error
p_is_inside
=
(
p_hat
-
1.96
*
se_hat
<=
p
)
&
(
p_hat
+
1.96
*
se_hat
>=
p
)
n_inside
+=
p_is_inside
n_inside
/
n_mc
```
%% Output
0.9285
%% Cell type:code id:1e9337d0 tags:
```
python
import
random
n_boots
=
10000
n_mc
=
10000
n_inside
=
0
for
i_mc
in
range
(
n_mc
):
X
=
np
.
random
.
binomial
(
1
,
p
,
N
)
boot_means
=
np
.
reshape
(
random
.
choices
(
X
,
k
=
N
*
n_boots
),
(
N
,
n_boots
)).
mean
(
axis
=
0
)
# equal-tailed 95% percentile bootstrap interval: 2.5th and 97.5th percentiles of the bootstrap means
# idx_sorted_boot_means = np.argsort(boot_means)
# print(boot_means[:20])
# a = boot_means[idx_sorted_boot_means[int((0.025 * n_boots))]]
# b = boot_means[idx_sorted_boot_means[int((0.975 * n_boots))]]
a
,
b
=
np
.
percentile
(
boot_means
,
[
2.5
,
97.5
])
# print(a, b)
p_is_inside
=
(
a
<=
p
)
&
(
b
>=
p
)
n_inside
+=
p_is_inside
n_inside
/
n_mc
```
%% Output
0.9297
%% Cell type:code id:23097d5b tags:
```
python
boot_means
.
shape
```
%% Output
(5000,)
%% Cell type:code id:087e1a25 tags:
```
python
pm
```
%% Output
0.10100000000000008
%% Cell type:code id:de6a3e44 tags:
```
python
p
=
0.49
N
=
10000
sd_null
=
0.5
p_null
=
0.5
t
=
np
.
sqrt
(
N
)
*
np
.
abs
(
p_null
-
p
)
/
sd_null
print
(
"
%5.3f
"
%
(
1
-
(
norm
.
cdf
(
t
)
-
norm
.
cdf
(
-
t
))))
fig
,
ax
=
plt
.
subplots
()
sampdist
=
lambda
u
:
1
/
np
.
sqrt
(
2
*
np
.
pi
*
sd_null
**
2
/
N
)
*
np
.
exp
(
-
(
u
-
p_null
)
**
2
/
(
2
*
sd_null
**
2
/
N
))
u_list
=
np
.
linspace
(
0
,
1
,
10000
)
u_extreme_l
=
np
.
linspace
(
0
,
p_null
-
np
.
abs
(
p_null
-
p
),
10000
)
u_extreme_r
=
np
.
flip
(
1
-
u_extreme_l
)
ax
.
plot
(
u_list
,
sampdist
(
u_list
),
'
k
'
,
linewidth
=
2
)
for
pval
in
[
p
,
1
-
p
,
p_null
]:
ax
.
vlines
(
pval
,
0
,
sampdist
(
pval
),
'
k
'
,
linewidth
=
0.75
)
ax
.
hlines
(
0
,
0
,
1
,
'
k
'
)
ax
.
fill_between
(
u_extreme_l
,
sampdist
(
u_extreme_l
),
step
=
"
pre
"
,
alpha
=
0.4
)
ax
.
fill_between
(
u_extreme_r
,
sampdist
(
u_extreme_r
),
step
=
"
pre
"
,
alpha
=
0.4
)
ax
.
set_xlim
(
0.47
,
0.53
)
```
%% Output
0.046
(0.47, 0.53)
%% Cell type:code id:d170999f tags:
```
python
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment