aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christian Cleberg <hello@cleberg.net> 2025-05-29 11:41:39 -0500
committer GitHub <noreply@github.com> 2025-05-29 11:41:39 -0500
commit 6226695a2072c2e269ece38254389e871c26e01f (patch)
tree b0564444885e1aa42910d3e287084be939b7f733
parent ae0b864a92cefc33593f5817fe4a6afe75e395d1 (diff)
download audit-tools-6226695a2072c2e269ece38254389e871c26e01f.tar.gz
audit-tools-6226695a2072c2e269ece38254389e871c26e01f.tar.bz2
audit-tools-6226695a2072c2e269ece38254389e871c26e01f.zip
feat: add stratified sampling script (#12)
* feat: add stratified sampling script * Commit from GitHub Actions (Ruff) --------- Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
-rw-r--r-- .gitignore 3
-rw-r--r-- sampling/stratified_sample.py 62
2 files changed, 64 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index 07ed393..840093e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
-venv/
+.venv
+venv
readme.html
diff --git a/sampling/stratified_sample.py b/sampling/stratified_sample.py
new file mode 100644
index 0000000..1ca94f9
--- /dev/null
+++ b/sampling/stratified_sample.py
@@ -0,0 +1,62 @@
+# Import packages
+import pandas as pd
+import math
+
+# Load the population to sample from (replace the placeholder file name)
+df = pd.read_csv("FILENAME_GOES_HERE.csv")
+
+# ALTERNATIVE: If you use Excel, use this instead. Supports xls, xlsx, xlsm,
+# xlsb, odf, ods and odt file extensions.
+# df = pd.read_excel("FILENAME_GOES_HERE.xlsx")
+
+# Print totals prior to sampling
+print("Dataframe size (rows, columns):", df.shape)
+
+# User-defined parameters: total rows wanted and the column that defines strata
+SAMPLE_SIZE = 25
+STRATIFY_COLUMN = "Category"  # <- Change this to your column name
+
+# Define stratum proportions (as fractions of SAMPLE_SIZE, keyed by stratum value)
+# Example: if you have categories A, B, and C
+stratum_proportions = {"A": 0.4, "B": 0.4, "C": 0.2}
+
+# Validate proportions sum to 1 (isclose tolerates floating-point rounding)
+if not math.isclose(sum(stratum_proportions.values()), 1.0):
+    raise ValueError("Stratum proportions must sum to 1.")
+
+# Check that all strata exist in the data
+missing_strata = set(stratum_proportions.keys()) - set(df[STRATIFY_COLUMN].unique())
+if missing_strata:
+    raise ValueError(
+        f"Strata {missing_strata} not found in column '{STRATIFY_COLUMN}'."
+    )
+
+# Draw floor(SAMPLE_SIZE * proportion) rows per stratum; fixed seed = repeatable
+samples = []
+for stratum, proportion in stratum_proportions.items():
+    stratum_df = df[df[STRATIFY_COLUMN] == stratum]
+    n_samples = math.floor(SAMPLE_SIZE * proportion)
+    if n_samples > len(stratum_df):
+        raise ValueError(
+            f"Not enough data in stratum '{stratum}' to sample {n_samples} rows."
+        )
+    stratum_sample = stratum_df.sample(n=n_samples, random_state=42)
+    samples.append(stratum_sample)
+
+# Combine all stratum samples; keep the source row labels as the index for now
+final_sample = pd.concat(samples)
+
+# Fill any rounding gap left by floor() using only rows that are not yet selected
+remaining = SAMPLE_SIZE - len(final_sample)
+if remaining > 0:
+    extra = df.drop(final_sample.index).sample(n=remaining, random_state=42)
+    final_sample = pd.concat([final_sample, extra])
+final_sample = final_sample.reset_index()  # source row labels -> "index" column
+
+# Print sample results
+print("Final sample size:", final_sample.shape[0])
+print("Sample breakdown by stratum:\n", final_sample[STRATIFY_COLUMN].value_counts())
+print("\nSample:\n", final_sample)
+
+# Optionally, save the sample to a new CSV
+# final_sample.to_csv("sample_output.csv", index=False)