sampling/stratified_sample.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

# Import packages
import pandas as pd
import math

# Load data
df = pd.read_csv("FILENAME_GOES_HERE.csv")

# ALTERNATIVE: If you use Excel, use this instead. Supports xls, xlsx, xlsm,
# xlsb, odf, ods and odt file extensions.
# df = pd.read_excel("FILENAME_GOES_HERE.xlsx")

# Print totals prior to sampling
print("Dataframe size (rows, columns):", df.shape)

# User-defined parameters
SAMPLE_SIZE = 25
STRATIFY_COLUMN = "Category"  # <- Change this to your column name

# Define stratum proportions (as fractions)
# Example: if you have categories A, B, and C
stratum_proportions = {"A": 0.4, "B": 0.4, "C": 0.2}

# Validate proportions sum to 1
if not math.isclose(sum(stratum_proportions.values()), 1.0):
    raise ValueError("Stratum proportions must sum to 1.")

# Check that all strata exist in the data
missing_strata = set(stratum_proportions.keys()) - set(df[STRATIFY_COLUMN].unique())
if missing_strata:
    raise ValueError(
        f"Strata {missing_strata} not found in column '{STRATIFY_COLUMN}'."
    )

# Perform stratified sampling
samples = []
for stratum, proportion in stratum_proportions.items():
    stratum_df = df[df[STRATIFY_COLUMN] == stratum]
    n_samples = math.floor(SAMPLE_SIZE * proportion)
    if n_samples > len(stratum_df):
        raise ValueError(
            f"Not enough data in stratum '{stratum}' to sample {n_samples} rows."
        )
    stratum_sample = stratum_df.sample(n=n_samples, random_state=42)
    samples.append(stratum_sample)

# Combine all stratum samples into one DataFrame
final_sample = pd.concat(samples).reset_index()

# If needed, randomly sample extra rows to fill any rounding gap
current_sample_size = len(final_sample)
if current_sample_size < SAMPLE_SIZE:
    remaining = SAMPLE_SIZE - current_sample_size
    remaining_sample = df.sample(n=remaining, random_state=42)
    final_sample = pd.concat([final_sample, remaining_sample])

# Print sample results
print("Final sample size:", final_sample.shape[0])
print("Sample breakdown by stratum:\n", final_sample[STRATIFY_COLUMN].value_counts())
print("\nSample:\n", final_sample)

# Optionally, save the sample to a new CSV
# final_sample.to_csv("sample_output.csv", index=False)