diff options
author | Christian Cleberg <hello@cleberg.net> | 2025-05-29 11:41:39 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-05-29 11:41:39 -0500 |
commit | 6226695a2072c2e269ece38254389e871c26e01f (patch) | |
tree | b0564444885e1aa42910d3e287084be939b7f733 | |
parent | ae0b864a92cefc33593f5817fe4a6afe75e395d1 (diff) | |
download | audit-tools-6226695a2072c2e269ece38254389e871c26e01f.tar.gz audit-tools-6226695a2072c2e269ece38254389e871c26e01f.tar.bz2 audit-tools-6226695a2072c2e269ece38254389e871c26e01f.zip |
feat: add stratified sampling script (#12)
* feat: add stratified sampling script
* Commit from GitHub Actions (Ruff)
---------
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | sampling/stratified_sample.py | 62 |
2 files changed, 64 insertions, 1 deletions
# sampling/stratified_sample.py
"""Draw a reproducible stratified random sample from a tabular data file.

Edit the user-defined parameters below, then run this file as a script.
The sampling logic lives in ``stratified_sample`` so it can also be imported
and reused without touching the file system.
"""

# Import packages
import math

import pandas as pd

# User-defined parameters
INPUT_FILE = "FILENAME_GOES_HERE.csv"
SAMPLE_SIZE = 25
STRATIFY_COLUMN = "Category"  # <- Change this to your column name
RANDOM_STATE = 42  # Fixed seed so the same sample can be re-drawn later

# Define stratum proportions (as fractions)
# Example: if you have categories A, B, and C
stratum_proportions = {"A": 0.4, "B": 0.4, "C": 0.2}


def stratified_sample(
    df: pd.DataFrame,
    stratify_column: str,
    proportions: dict,
    sample_size: int,
    random_state: int = 42,
) -> pd.DataFrame:
    """Return ``sample_size`` distinct rows of ``df``, split across strata.

    Each stratum receives ``floor(sample_size * proportion)`` rows. If
    flooring leaves the total short of ``sample_size``, the gap is filled
    with randomly chosen rows that have not been drawn yet and that belong
    to one of the configured strata.

    Args:
        df: Population to sample from. Row labels are expected to be unique
            (as produced by ``pd.read_csv``); they are used to exclude
            already-drawn rows from the filler step.
        stratify_column: Name of the column holding the stratum labels.
        proportions: Mapping of stratum label -> fraction of the sample.
            The fractions must be non-negative and sum to 1.
        sample_size: Total number of rows to return.
        random_state: Seed passed to every ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The original row label of
        every sampled row is kept in a leading ``index`` column.

    Raises:
        KeyError: If ``stratify_column`` is not a column of ``df``.
        ValueError: If the proportions are invalid, a configured stratum is
            absent from the data, or there are not enough rows to sample.
    """
    if sample_size < 0:
        raise ValueError("Sample size must not be negative.")
    if not math.isclose(sum(proportions.values()), 1.0):
        raise ValueError("Stratum proportions must sum to 1.")
    if any(proportion < 0 for proportion in proportions.values()):
        raise ValueError("Stratum proportions must not be negative.")
    if stratify_column not in df.columns:
        raise KeyError(f"Column '{stratify_column}' not found in the data.")

    # Check that all strata exist in the data
    missing_strata = set(proportions) - set(df[stratify_column].unique())
    if missing_strata:
        raise ValueError(
            f"Strata {missing_strata} not found in column '{stratify_column}'."
        )

    # Perform stratified sampling
    samples = []
    for stratum, proportion in proportions.items():
        stratum_df = df[df[stratify_column] == stratum]
        # Round before flooring so binary float error does not steal a row
        # (e.g. 0.29 * 100 evaluates to 28.999999999999996).
        n_samples = math.floor(round(sample_size * proportion, 9))
        if n_samples > len(stratum_df):
            raise ValueError(
                f"Not enough data in stratum '{stratum}' to sample {n_samples} rows."
            )
        samples.append(stratum_df.sample(n=n_samples, random_state=random_state))

    # Combine all stratum samples; original row labels are kept for now so
    # the filler step can tell which rows were already drawn.
    final_sample = pd.concat(samples)

    # If needed, randomly sample extra rows to fill any rounding gap
    remaining = sample_size - len(final_sample)
    if remaining > 0:
        in_configured_strata = df[stratify_column].isin(list(proportions))
        already_drawn = df.index.isin(final_sample.index)
        leftover = df[in_configured_strata & ~already_drawn]
        if remaining > len(leftover):
            raise ValueError(
                f"Not enough unsampled data to fill the remaining {remaining} rows."
            )
        filler = leftover.sample(n=remaining, random_state=random_state)
        final_sample = pd.concat([final_sample, filler])

    # Reset only once everything is combined, so every row (filler included)
    # carries its original row label in the "index" column.
    return final_sample.reset_index()


def main() -> None:
    """Load the input file, draw the sample and print a summary."""
    # Load data
    df = pd.read_csv(INPUT_FILE)

    # ALTERNATIVE: If you use Excel, use this instead. Supports xls, xlsx, xlsm,
    # xlsb, odf, ods and odt file extensions.
    # df = pd.read_excel("FILENAME_GOES_HERE.xlsx")

    # Print totals prior to sampling
    print("Dataframe size (rows, columns):", df.shape)

    final_sample = stratified_sample(
        df,
        STRATIFY_COLUMN,
        stratum_proportions,
        SAMPLE_SIZE,
        random_state=RANDOM_STATE,
    )

    # Print sample results
    print("Final sample size:", final_sample.shape[0])
    print(
        "Sample breakdown by stratum:\n",
        final_sample[STRATIFY_COLUMN].value_counts(),
    )
    print("\nSample:\n", final_sample)

    # Optionally, save the sample to a new CSV
    # final_sample.to_csv("sample_output.csv", index=False)


if __name__ == "__main__":
    main()