In [1]:
%matplotlib inline
# Load the "autoreload" extension so edited modules are picked up without a kernel restart
%load_ext autoreload
# Mode 2: always reload modules
%autoreload 2
# Black formatter for classic Jupyter notebooks (disabled; the JupyterLab variant below is used)
#%load_ext nb_black
# Black formatter for JupyterLab
%load_ext lab_black

# Run the shared environment script; per the recorded output below it prints
# the host name, the Python version and the interpreter path.
%run ../../../src/notebook_env.py


---------------------------------
Working on the host: Joachims-MacBook-Pro.local

---------------------------------
Python version: 3.10.2 | packaged by conda-forge | (main, Feb  1 2022, 19:30:18) [Clang 11.1.0 ]

---------------------------------
Python interpreter: /opt/miniconda3/envs/srh/bin/python


# Test auf Normalverteilung
Generieren Sie 100.000 (gleichwahrscheinliche) Würfe eines Würfels und berechnen Sie Mittelwert und Standardabweichung der Würfelsumme. Wählen Sie aus den Würfelwürfen 200 Stichproben mit unterschiedlichen Stichprobenumfängen. Ab welchem Stichprobenumfang (`3, 5, 7, 10, 15, 20, 30, 50`) können wir davon ausgehen, dass die Stichprobenverteilung des Mittelwertes normalverteilt ist? Nutzen Sie zur Validierung der Hypothese den Shapiro-Wilk-Test.

**Hilfsfunktionen**

In [2]:
import numpy as np
from scipy import stats


def test_for_normal_distribution(x, verbose=True):
    """Function to test if a sample is normally distributed.
    Therefore the Shapiro-Wilk test is employed. If the p-value is <0.05 we recject the null hypothesis and hence
    conclude that the data is not normally distrubuted for reference see
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html"""
    shapiro_test = stats.shapiro(x)
    pvalue = shapiro_test.pvalue
    if verbose:
        print(f"p-value: {pvalue}")
        if pvalue < 0.05:
            print(
                f"The null hypothesis is rejected, the data is NOT normally distributed."
            )
        else:
            print(
                f"Given the data the null hypothesis cannot be rejected, the data is likely normally distributed."
            )
    return pvalue


def dice_roll(nrolls: int, nsides: int = 6, seed=None) -> list:
    """Simulate rolling a fair die ``nrolls`` times.

    params:
       nrolls: number of rolls/dices
       nsides: number of sides; each roll is an integer in [1, nsides]
       seed: optional seed; when given, NumPy's *global* legacy random state
             is re-seeded, so subsequent ``np.random.*`` calls are affected too
    returns:
       list with ``nrolls`` Python ints (empty list if ``nrolls`` <= 0)
    """
    if seed is not None:
        # Seeds the module-level generator (side effect outside this function).
        np.random.seed(seed)

    if nrolls <= 0:
        # Same result as iterating over an empty range: nothing is drawn.
        return []

    # One vectorized draw instead of ``nrolls`` Python-level randint calls;
    # the upper bound of randint is exclusive, hence ``nsides + 1``.
    return np.random.randint(1, nsides + 1, size=nrolls).tolist()

-------------------------------------------------------

In [3]:
# Frage 1 ...

In [5]:
# Experiment: simulate N rolls of a six-sided die. Passing a seed makes
# dice_roll seed NumPy's global generator, so the sampling below is
# reproducible as well.
N = 100000
seed = 42
experiment = dice_roll(N, seed=seed)

# Convert the roll list to an array once; np.random.choice would otherwise
# have to convert the 100,000-element list on every single call.
experiment_values = np.asarray(experiment)

# Validation: for each sample size draw N_SAMPLES samples (with replacement),
# collect their means and test the distribution of those means for normality.
N_SAMPLES = 200
SAMPLE_SIZES = [3, 5, 7, 10, 15, 20, 30, 50]
for n in SAMPLE_SIZES:
    sample_means = []
    for _ in range(N_SAMPLES):
        sample = np.random.choice(experiment_values, n, replace=True)
        sample_means.append(np.mean(sample))
    print(f"\nSample size: {n}")
    pvalue = test_for_normal_distribution(sample_means)


Sample size: 3
p-value: 0.0007678926340304315
The null hypothesis is rejected, the data is NOT normally distributed.

Sample size: 5
p-value: 0.015407245606184006
The null hypothesis is rejected, the data is NOT normally distributed.

Sample size: 7
p-value: 0.027912858873605728
The null hypothesis is rejected, the data is NOT normally distributed.

Sample size: 10
p-value: 0.2314864993095398
Given the data the null hypothesis cannot be rejected, the data is likely normally distributed.

Sample size: 15
p-value: 0.4181775152683258
Given the data the null hypothesis cannot be rejected, the data is likely normally distributed.

Sample size: 20
p-value: 0.34083324670791626
Given the data the null hypothesis cannot be rejected, the data is likely normally distributed.

Sample size: 30
p-value: 0.5110695362091064
Given the data the null hypothesis cannot be rejected, the data is likely normally distributed.

Sample size: 50
p-value: 0.18506282567977905
Given the data the null hypothesis ca