diff options
-rw-r--r-- | .gitattributes | 2 | ||||
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | README.md | 12 | ||||
-rw-r--r-- | notebooks/db_exploration.ipynb | 187 | ||||
-rw-r--r-- | notebooks/raw_data_exploration.ipynb | 99 | ||||
-rw-r--r-- | raw_data/Incidents_2015.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2016.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2017.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2018.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2019.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2020.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2021.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2022.csv | 3 | ||||
-rw-r--r-- | raw_data/Incidents_2023.csv | 3 | ||||
-rw-r--r-- | raw_data/ingress.db | 3 | ||||
-rw-r--r-- | scripts/dashboard.py | 76 | ||||
-rw-r--r-- | scripts/load.py | 73 |
17 files changed, 480 insertions, 0 deletions
diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..33e4788 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.csv filter=lfs diff=lfs merge=lfs -text +*.db filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..72d36f9 --- /dev/null +++ b/README.md @@ -0,0 +1,12 @@ +# Omaha Incidents + +Data from the Omaha police department, used to analyze and visualize statistics. + +## TODO + +- [x] Import script +- [ ] Remove duplicate instances of headers being inserted into the database as + records +- [ ] Analysis script +- [~] Visualization script +- [ ] Build API to connect to database? diff --git a/notebooks/db_exploration.ipynb b/notebooks/db_exploration.ipynb new file mode 100644 index 0000000..68ac55e --- /dev/null +++ b/notebooks/db_exploration.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Omaha Incidents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Exploration\n", + "\n", + "Let\"s explore the data a little bit to see what kind of analysis and visualizations we want to implement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install ipykernel\n", + "!pip3 install --upgrade pandas plotly dash \"nbformat>=4.2.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import sqlite3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "connection = sqlite3.connect(\"../raw_data/ingress.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cursor = connection.cursor()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test query to see if the data loaded\n", + "select_all = \"SELECT * FROM incidents;\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql_query(select_all, connection)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test plotting by sorting & plotting top 5 crime categories\n", + "s = df.value_counts(subset=[\"description\"])\n", + "t = s.nlargest(5)\n", + "t.head()\n", + "t.plot(kind=\"bar\", title=\"Top 5 Incident Categories\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "import plotly.graph_objects as go" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df = df[(df['date'] > '01/01/2023') & (df['date'] < '12/31/2023')]\n", + "filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.scatter_mapbox(\n", + " filtered_df,\n", + " lat=\"lat\",\n", + " lon=\"lon\",\n", + " color=\"description\",\n", + " hover_name=\"description\",\n", + " hover_data=[\"date\", \"time\"],\n", + " title=\"Incident Count by Coordinates\",\n", + " center={\"lat\": 41.257160, \"lon\": -95.995102},\n", + " zoom=10\n", + ")\n", + "\n", + "# fig.update_layout(showlegend=False)\n", + "fig.update_layout(mapbox_style=\"open-street-map\")\n", + "fig.update_layout(margin={\"r\": 0, \"t\": 0, \"l\": 0, \"b\": 0})\n", + "fig.update_layout(mapbox_bounds={\"west\": -180, \"east\": -50, \"south\": 20, \"north\": 90})\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.io as pio\n", + "pio.write_html(fig, file=\"test.html\", auto_open=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# clean up and close it out\n", + "connection.commit()\n", + "connection.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/raw_data_exploration.ipynb b/notebooks/raw_data_exploration.ipynb new file mode 100644 index 0000000..46c3a43 --- /dev/null +++ b/notebooks/raw_data_exploration.ipynb @@ -0,0 +1,99 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Omaha Incidents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "You must download the data from the URL below first.\n", + "\n", + "https://police.cityofomaha.org/crime-information/incident-data-download" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Exploration\n", + "\n", + "Let's explore the data a little bit to see what kind of analysis and visualizations we want to implement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import data\n", + "df = pd.read_csv(\"../raw_data/Incidents_2015.csv\")\n", + "\n", + "# test to see what the dataframe looks like\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install \"matplotlib\"\n", + "import numpy\n", + "import matplotlib\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test plotting by sorting & plotting top 5 crime categories\n", + "s = df.value_counts(subset=[\"Statute/Ordinance Description\"])\n", + "t = s.nlargest(5)\n", + "t.head()\n", + "t.plot(kind=\"bar\", title=\"Top 5 Incident Categories\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/raw_data/Incidents_2015.csv b/raw_data/Incidents_2015.csv new file mode 100644 index 0000000..b11b4d3 --- /dev/null +++ b/raw_data/Incidents_2015.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46504a13c49c3133e4ebfb7329cb6ac04b4571c513782700ca00da2679ba11af +size 3252033 diff --git a/raw_data/Incidents_2016.csv b/raw_data/Incidents_2016.csv new file mode 100644 index 0000000..9b1fba0 --- /dev/null +++ b/raw_data/Incidents_2016.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8faab205c00118f2157b239f71fa184f0d79adba781259931ba81407140fcf92 +size 6418868 diff --git a/raw_data/Incidents_2017.csv b/raw_data/Incidents_2017.csv new file mode 100644 index 0000000..9c05a93 --- /dev/null +++ b/raw_data/Incidents_2017.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b0cff58ec5a0ab7b5ff8bdcd6177f6a8548b4b05715e380c032b1cde5a98b8 +size 6750996 diff --git a/raw_data/Incidents_2018.csv b/raw_data/Incidents_2018.csv new file mode 100644 index 0000000..2fc1b22 --- /dev/null +++ b/raw_data/Incidents_2018.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a6f3bbdd3832db4d23856620b006a9fc7788df88ce3c107d25729ffafd15af7 +size 6077309 diff --git a/raw_data/Incidents_2019.csv b/raw_data/Incidents_2019.csv new file mode 100644 index 0000000..ea6de20 --- /dev/null +++ b/raw_data/Incidents_2019.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:841b4d15ae718ce4dda8382d25140df4452055b603c879c38b57c10c560da11d +size 5885305 diff --git a/raw_data/Incidents_2020.csv b/raw_data/Incidents_2020.csv new file mode 100644 index 0000000..9c65927 --- /dev/null +++ b/raw_data/Incidents_2020.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d49127d47b098f925f168ff9191205b6f1779e8d93914f7c1595c980429276b +size 5345985 diff --git a/raw_data/Incidents_2021.csv b/raw_data/Incidents_2021.csv new file mode 100644 index 0000000..d15abff --- /dev/null +++ b/raw_data/Incidents_2021.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4742625748b87d0da6e747512343c0476d6dfb8d8be01f6391036ece57cb60 +size 7013108 diff --git a/raw_data/Incidents_2022.csv b/raw_data/Incidents_2022.csv new file mode 100644 index 0000000..8ccc37c --- /dev/null +++ b/raw_data/Incidents_2022.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d511a12df510be0a7aaf0d757ce4240209f539559ffa71c55c9db8da7b7fc0d4 +size 7347606 diff --git a/raw_data/Incidents_2023.csv b/raw_data/Incidents_2023.csv new file mode 100644 index 0000000..47e44ca --- /dev/null +++ b/raw_data/Incidents_2023.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3d38c8415a5fb53b3e5b0cdf5c5d5af8b67f2e2a5da0d0d86df5d7c68da193 +size 7140489 diff --git a/raw_data/ingress.db b/raw_data/ingress.db new file mode 100644 index 0000000..9ce6d2a --- /dev/null +++ b/raw_data/ingress.db @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5293a40fb466e45ee6b69c85756f3aec3e8704094324616048321decdaae87 +size 43061248 diff --git a/scripts/dashboard.py b/scripts/dashboard.py new file mode 100644 index 0000000..23e72c4 --- /dev/null +++ b/scripts/dashboard.py @@ -0,0 +1,76 @@ +from dash import Dash, html, dcc, callback, Output, Input +import plotly.express as px +import pandas as pd +import sqlite3 + +# Connect to database and query all incidents +connection = sqlite3.connect("../raw_data/ingress.db") +cursor = connection.cursor() +query = "SELECT * FROM incidents;" +df = pd.read_sql_query(query, connection).sort_values(by="description") + +# Create custom YEAR column to use in dropdown +df['year'] = df['date'].str[-4:] + +# Configure HTML layout +app = Dash(__name__) +app.layout = html.Div(children = [ + html.Div([ + html.H1(children="Omaha Police Invidents", style={"textAlign":"center"}), + dcc.Dropdown(df.sort_values("description").description.unique(), "INJURY", id="bar-dropdown"), + dcc.Dropdown(df.sort_values("year").year.unique(), "2023", id="bar-year-dropdown"), + dcc.Graph(id="bar-graph") + ]), + html.Div([ + html.H2(children="Map Coordinates", style={"textAlign":"center"}), + dcc.Dropdown(df.sort_values("description").description.unique(), "INJURY", id="map-dropdown"), + dcc.Dropdown(df.sort_values("year").year.unique(), "2023", id="map-year-dropdown"), + dcc.Graph(id="map-graph") + ]) +]) + +# Create bar graph +@callback( + Output("bar-graph", "figure"), + Input("bar-dropdown", "value"), + Input("bar-year-dropdown", "value") +) +def update_bar_graph(description, year): + dff = df[df.year == year] + dff = dff.value_counts(subset=["description"]) + dff = dff.reset_index() + dff = dff[dff.description == description] + return px.bar(dff, x="description", y="count") + +# Create map +@callback( + Output("map-graph", "figure"), + Input("map-dropdown", "value"), + Input("map-year-dropdown", "value") +) +def update_map(description, year): + dff = df[df.year == year] + dff = dff.reset_index() + dff = dff[dff.description == description] + + fig = px.scatter_mapbox( + dff, + lat="lat", + lon="lon", + color="description", + hover_name="description", + hover_data=["date", "time"], + title="Incident Count by Coordinates", + center={"lat": 41.257160, "lon": -95.995102}, + zoom=10 + ) + + fig.update_layout(showlegend=False) + fig.update_layout(mapbox_style="open-street-map") + fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}) + fig.update_layout(mapbox_bounds={"west": -180, "east": -50, "south": 20, "north": 90}) + + return fig + +if __name__ == "__main__": + app.run(debug=True) diff --git a/scripts/load.py b/scripts/load.py new file mode 100644 index 0000000..ea362cb --- /dev/null +++ b/scripts/load.py @@ -0,0 +1,73 @@ +# Import required modules +import csv +import sqlite3 +import os + +# Create the database file +connection = sqlite3.connect('../raw_data/ingress.db') + +# Creating a cursor object to execute SQL queries +cursor = connection.cursor() + +# Table Definition +# rb = RB Number +# date = Reported Date +# time = Reported Time +# description = Statute/Ordinance Description +# location = Occurred Location +# district = Occurred District +# lat = Occurred Block LAT +# lon = Occurred Block LON +create_table = '''CREATE TABLE incidents( + id INTEGER PRIMARY KEY AUTOINCREMENT, + rb TEXT NOT NULL, + date TEXT NOT NULL, + time TEXT NOT NULL, + description TEXT NOT NULL, + location TEXT NOT NULL, + district TEXT NOT NULL, + lat REAL NOT NULL, + lon REAL NOT NULL); + ''' + +# Create the table +cursor.execute(create_table) + +# Point to the data directory +directory = os.fsencode("../raw_data/") + +# Loop through all raw data files +for file in os.listdir(directory): + filename = os.fsdecode(file) + if filename.endswith(".csv"): + # Opening the file + file = open("../raw_data/" + filename) + + # Reading the contents of the file + contents = csv.reader(file) + + # SQL query to insert data into the + # table + insert_records = "INSERT INTO incidents (rb, date, time, description, location, district, lat, lon) VALUES(?, ?, ?, ?, ?, ?, ?, ?)" + + # Importing the contents of the file + # into our table + cursor.executemany(insert_records, contents) + print("Inserted data from: ", filename) + continue + else: + continue + +# Test query to see if the data loaded +select_all = "SELECT * FROM incidents" +rows = cursor.execute(select_all).fetchall() + +# Output to the console screen +for r in rows: + print(r) + +# Commit the changes +connection.commit() + +# Close the database connection +connection.close() |