Sleep Health Statistical Analysis

Sleep Health Statistical Analysis

  1. Do male and female participants differ in stress level?

  2. Do participants with and without sleep disorders differ in sleep duration?

  3. Do insomnia and sleep apnea participants differ in physical activity level?

Dataset source: Sleep Health and Lifestyle Dataset

import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

# Load the dataset; the relative path is resolved against the working directory.
df = pd.read_csv("../data/sleep.csv")

print("Dataset shape:", df.shape)
print("\nColumns:")
print(df.columns)

print("\nFirst 5 rows:")
print(df.head())

# cleaning / preparation

# Missing values in "Sleep Disorder" are treated as "no disorder recorded".
disorder = df["Sleep Disorder"]
df["Sleep Disorder Filled"] = disorder.fillna("None")

# Binary sleep disorder flag:
# "No"  = the "Sleep Disorder" value is missing
# "Yes" = any non-missing "Sleep Disorder" value
df["Has Sleep Disorder"] = np.where(disorder.notna(), "Yes", "No")

# Show the group sizes for both derived columns.
for derived_col in ("Sleep Disorder Filled", "Has Sleep Disorder"):
    print(f"\n{derived_col} counts:")
    print(df[derived_col].value_counts())

# helper function for Mann-Whitney U test

def run_mann_whitney(data, group_col, group_1, group_2, outcome_col, alpha=0.05):
    """
    Run a two-sided Mann-Whitney U test between two independent groups.

    Rows whose ``group_col`` equals ``group_1`` form the first sample and rows
    whose ``group_col`` equals ``group_2`` form the second; missing outcome
    values are dropped from each sample before testing. The samples are
    passed to SciPy as-is, so no extra validation is done for groups that
    match no rows.

    Parameters
    ----------
    data : DataFrame
        Dataset.
    group_col : str
        Categorical column with group names.
    group_1 : str
        First group name.
    group_2 : str
        Second group name.
    outcome_col : str
        Numeric or ordinal outcome column.
    alpha : float, optional
        Significance threshold compared against the p-value to fill the
        ``"Significant"`` entry. Defaults to 0.05.

    Returns
    -------
    dict
        Summary results: the research question text, the grouping and
        outcome column names, each group's name, sample size, median and
        mean, the U statistic, the p-value, and whether ``p-value < alpha``.
    """

    in_group_1 = data[group_col] == group_1
    in_group_2 = data[group_col] == group_2

    x = data.loc[in_group_1, outcome_col].dropna()
    y = data.loc[in_group_2, outcome_col].dropna()

    # NaNs were already removed above, so nan_policy="omit" is only a
    # safeguard here and does not change the samples.
    result = mannwhitneyu(
        x=x,
        y=y,
        alternative="two-sided",
        method="asymptotic",
        nan_policy="omit"
    )

    return {
        "Research Question": f"Does {outcome_col} differ between {group_1} and {group_2}?",
        "Grouping Column": group_col,
        "Outcome": outcome_col,
        "Group 1": group_1,
        "N1": len(x),
        "Median 1": x.median(),
        "Mean 1": x.mean(),
        "Group 2": group_2,
        "N2": len(y),
        "Median 2": y.median(),
        "Mean 2": y.mean(),
        "U Statistic": result.statistic,
        "p-value": result.pvalue,
        # The threshold is a parameter so callers can use a stricter or
        # looser significance level without editing this helper.
        "Significant": result.pvalue < alpha
    }

def _report_scenario(title, summary):
    """Print a banner containing *title*, then every key/value pair of *summary*."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)

    for label, entry in summary.items():
        print(f"{label}: {entry}")


# Scenario 1
# Question: Do male and female participants differ in stress level?

scenario_1 = run_mann_whitney(
    df,
    group_col="Gender",
    group_1="Male",
    group_2="Female",
    outcome_col="Stress Level",
)
_report_scenario("SCENARIO 1: Gender vs Stress Level", scenario_1)


# Scenario 2
# Question: Do participants with and without sleep disorders differ in
# sleep duration?

scenario_2 = run_mann_whitney(
    df,
    group_col="Has Sleep Disorder",
    group_1="No",
    group_2="Yes",
    outcome_col="Sleep Duration",
)
_report_scenario("SCENARIO 2: Sleep Disorder Status vs Sleep Duration", scenario_2)


# Scenario 3
# Question: Do insomnia and sleep apnea participants differ in physical
# activity level?

scenario_3 = run_mann_whitney(
    df,
    group_col="Sleep Disorder Filled",
    group_1="Insomnia",
    group_2="Sleep Apnea",
    outcome_col="Physical Activity Level",
)
_report_scenario(
    "SCENARIO 3: Insomnia vs Sleep Apnea on Physical Activity Level",
    scenario_3,
)


# combine all results into one table (one row per scenario)

results_df = pd.DataFrame([scenario_1, scenario_2, scenario_3])

print("\n" + "=" * 80)
print("FINAL RESULTS TABLE")
print("=" * 80)

print(results_df)


# Work on a copy so the raw, full-precision results stay available.
results_clean = results_df.copy()

numeric_cols = [
    "Median 1", "Mean 1",
    "Median 2", "Mean 2",
    "U Statistic", "p-value"
]

# results

# Round the descriptive statistics and U for readability. Previously this
# step assigned the columns to themselves, so the "clean" table was identical
# to the raw one. The p-value is deliberately left unrounded: the values are
# tiny, so rounding would collapse them to 0.
rounded_cols = [col for col in numeric_cols if col != "p-value"]
results_clean[rounded_cols] = results_clean[rounded_cols].round(3)

print("\n" + "=" * 80)
print("CLEAN FINAL RESULTS TABLE")
print("=" * 80)

print(results_clean)


# interpretation: restate each scenario's key numbers and give a verdict

section_rule = "=" * 80
print("\n" + section_rule)
print("INTERPRETATIONS")
print(section_rule)

# Significance threshold used for the verdicts below.
alpha = 0.05

for i, row in results_clean.iterrows():
    # The results table has one row per scenario, indexed from 0.
    summary_lines = [
        f"\nScenario {i + 1}:",
        row["Research Question"],
        f"{row['Group 1']} median = {row['Median 1']}",
        f"{row['Group 2']} median = {row['Median 2']}",
        f"U = {row['U Statistic']}",
        f"p-value = {row['p-value']}",
    ]
    print("\n".join(summary_lines))

    verdict = (
        "Conclusion: Significant difference between the two groups."
        if row["p-value"] < alpha
        else "Conclusion: No significant difference between the two groups."
    )
    print(verdict)
Dataset shape: (374, 13)

Columns:
Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')

First 5 rows:
   Person ID Gender  Age            Occupation  Sleep Duration  \
0          1   Male   27     Software Engineer             6.1   
1          2   Male   28                Doctor             6.2   
2          3   Male   28                Doctor             6.2   
3          4   Male   28  Sales Representative             5.9   
4          5   Male   28  Sales Representative             5.9   

   Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
0                 6                       42             6   Overweight   
1                 6                       60             8       Normal   
2                 6                       60             8       Normal   
3                 4                       30             8        Obese   
4                 4                       30             8        Obese   

  Blood Pressure  Heart Rate  Daily Steps Sleep Disorder  
0         126/83          77         4200            NaN  
1         125/80          75        10000            NaN  
2         125/80          75        10000            NaN  
3         140/90          85         3000    Sleep Apnea  
4         140/90          85         3000    Sleep Apnea  

Sleep Disorder Filled counts:
Sleep Disorder Filled
None           219
Sleep Apnea     78
Insomnia        77
Name: count, dtype: int64

Has Sleep Disorder counts:
Has Sleep Disorder
No     219
Yes    155
Name: count, dtype: int64

================================================================================
SCENARIO 1: Gender vs Stress Level
================================================================================
Research Question: Does Stress Level differ between Male and Female?
Grouping Column: Gender
Outcome: Stress Level
Group 1: Male
N1: 189
Median 1: 6.0
Mean 1: 6.079365079365079
Group 2: Female
N2: 185
Median 2: 4.0
Mean 2: 4.675675675675675
U Statistic: 25931.0
p-value: 2.2929416314698734e-16
Significant: True

================================================================================
SCENARIO 2: Sleep Disorder Status vs Sleep Duration
================================================================================
Research Question: Does Sleep Duration differ between No and Yes?
Grouping Column: Has Sleep Disorder
Outcome: Sleep Duration
Group 1: No
N1: 219
Median 1: 7.4
Mean 1: 7.358447488584476
Group 2: Yes
N2: 155
Median 2: 6.5
Mean 2: 6.812258064516129
U Statistic: 23039.0
p-value: 3.637710023820107e-09
Significant: True

================================================================================
SCENARIO 3: Insomnia vs Sleep Apnea on Physical Activity Level
================================================================================
Research Question: Does Physical Activity Level differ between Insomnia and Sleep Apnea?
Grouping Column: Sleep Disorder Filled
Outcome: Physical Activity Level
Group 1: Insomnia
N1: 77
Median 1: 45.0
Mean 1: 46.81818181818182
Group 2: Sleep Apnea
N2: 78
Median 2: 75.0
Mean 2: 74.7948717948718
U Statistic: 910.5
p-value: 5.398884481874879e-15
Significant: True

================================================================================
FINAL RESULTS TABLE
================================================================================
                                   Research Question        Grouping Column  \
0  Does Stress Level differ between Male and Female?                 Gender   
1     Does Sleep Duration differ between No and Yes?     Has Sleep Disorder   
2  Does Physical Activity Level differ between In...  Sleep Disorder Filled   

                   Outcome   Group 1   N1  Median 1     Mean 1      Group 2  \
0             Stress Level      Male  189       6.0   6.079365       Female   
1           Sleep Duration        No  219       7.4   7.358447          Yes   
2  Physical Activity Level  Insomnia   77      45.0  46.818182  Sleep Apnea   

    N2  Median 2     Mean 2  U Statistic       p-value  Significant  
0  185       4.0   4.675676      25931.0  2.292942e-16         True  
1  155       6.5   6.812258      23039.0  3.637710e-09         True  
2   78      75.0  74.794872        910.5  5.398884e-15         True  

================================================================================
CLEAN FINAL RESULTS TABLE
================================================================================
                                   Research Question        Grouping Column  \
0  Does Stress Level differ between Male and Female?                 Gender   
1     Does Sleep Duration differ between No and Yes?     Has Sleep Disorder   
2  Does Physical Activity Level differ between In...  Sleep Disorder Filled   

                   Outcome   Group 1   N1  Median 1     Mean 1      Group 2  \
0             Stress Level      Male  189       6.0   6.079365       Female   
1           Sleep Duration        No  219       7.4   7.358447          Yes   
2  Physical Activity Level  Insomnia   77      45.0  46.818182  Sleep Apnea   

    N2  Median 2     Mean 2  U Statistic       p-value  Significant  
0  185       4.0   4.675676      25931.0  2.292942e-16         True  
1  155       6.5   6.812258      23039.0  3.637710e-09         True  
2   78      75.0  74.794872        910.5  5.398884e-15         True  

================================================================================
INTERPRETATIONS
================================================================================

Scenario 1:
Does Stress Level differ between Male and Female?
Male median = 6.0
Female median = 4.0
U = 25931.0
p-value = 2.2929416314698734e-16
Conclusion: Significant difference between the two groups.

Scenario 2:
Does Sleep Duration differ between No and Yes?
No median = 7.4
Yes median = 6.5
U = 23039.0
p-value = 3.637710023820107e-09
Conclusion: Significant difference between the two groups.

Scenario 3:
Does Physical Activity Level differ between Insomnia and Sleep Apnea?
Insomnia median = 45.0
Sleep Apnea median = 75.0
U = 910.5
p-value = 5.398884481874879e-15
Conclusion: Significant difference between the two groups.