-rw-r--r--  notebooks/db_exploration.ipynb  138
1 file changed, 119 insertions, 19 deletions
diff --git a/notebooks/db_exploration.ipynb b/notebooks/db_exploration.ipynb
index 68ac55e..f6787d9 100644
--- a/notebooks/db_exploration.ipynb
+++ b/notebooks/db_exploration.ipynb
@@ -17,13 +17,23 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Set up environment\n",
+ "\n",
+ "Start by installating and importing the necessary packages. "
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "!pip3 install ipykernel\n",
- "!pip3 install --upgrade pandas plotly dash \"nbformat>=4.2.0\""
+ "# Install packages, if needed\n",
+ "# !pip3 install ipykernel\n",
+ "# !pip3 install --upgrade pandas plotly dash \"nbformat>=4.2.0\""
]
},
{
@@ -32,8 +42,22 @@
"metadata": {},
"outputs": [],
"source": [
+ "# Import packages\n",
"import pandas as pd\n",
- "import sqlite3"
+ "import numpy as np\n",
+ "import sqlite3\n",
+ "import plotly.express as px\n",
+ "import plotly.graph_objects as go\n",
+ "import plotly.io as pio"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load Data\n",
+ "\n",
+ "To load the data, we need to connect to the SQLite3 database file and query it for the data we want."
]
},
{
@@ -42,7 +66,9 @@
"metadata": {},
"outputs": [],
"source": [
- "connection = sqlite3.connect(\"../raw_data/ingress.db\")"
+ "# Connect to the database\n",
+ "connection = sqlite3.connect(\"../raw_data/ingress.db\")\n",
+ "cursor = connection.cursor()"
]
},
{
@@ -51,7 +77,9 @@
"metadata": {},
"outputs": [],
"source": [
- "cursor = connection.cursor()"
+ "# If exists, delete extra header rows\n",
+ "# delete_headers = \"DELETE FROM incidents WHERE rb = 'RB Number'\"\n",
+ "# cursor.execute(delete_headers)"
]
},
{
@@ -60,8 +88,19 @@
"metadata": {},
"outputs": [],
"source": [
- "# Test query to see if the data loaded\n",
- "select_all = \"SELECT * FROM incidents;\""
+ "# Grab all data\n",
+ "select_all = \"SELECT * FROM incidents\"\n",
+ "df = pd.read_sql_query(select_all, connection)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Cleaning\n",
+ "\n",
+ "We will clean up the data before we use: inserting NaN, converting types, etc."
]
},
{
@@ -70,7 +109,8 @@
"metadata": {},
"outputs": [],
"source": [
- "df = pd.read_sql_query(select_all, connection)\n",
+ "# Replace empty cells in [lat, lon] with NaN\n",
+ "df = df.replace(r'^\\s*$', np.nan, regex=True)\n",
"df.head()"
]
},
@@ -80,21 +120,64 @@
"metadata": {},
"outputs": [],
"source": [
+ "# Convert date col to datetime format\n",
+ "df[\"date\"] = pd.to_datetime(df[\"date\"])\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Plotting\n",
+ "\n",
+ "Let's test a plot that will show us the top categories of incidents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"# test plotting by sorting & plotting top 5 crime categories\n",
- "s = df.value_counts(subset=[\"description\"])\n",
+ "s = dff.value_counts(subset=[\"description\"])\n",
"t = s.nlargest(5)\n",
"t.head()\n",
"t.plot(kind=\"bar\", title=\"Top 5 Incident Categories\")"
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Filtering\n",
+ "\n",
+ "To reduce the workload in this rest of this notebook, I am filtering just for one description and a range of dates.\n",
+ "\n",
+ "If you are doing a lot of analysis, I recommend modifying the query at the beginning to only the pull the data you need instead of filtering after querying."
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "import plotly.express as px\n",
- "import plotly.graph_objects as go"
+ "# Create a smaller dataframe based on a selected date and description\n",
+ "start_date = \"2023-01-01\"\n",
+ "end_date = \"2023-12-31\"\n",
+ "description = \"INJURY\"\n",
+ "\n",
+ "dff = df[(df['date'] > start_date) & (df['date'] < end_date)]\n",
+ "dff = dff.reset_index()\n",
+ "dff = dff[dff.description == description]\n",
+ "\n",
+ "dff_grouped = dff.groupby(by=\"date\").count()\n",
+ "dff_grouped = dff_grouped.reset_index()\n",
+ "\n",
+ "print(dff.head())\n",
+ "print(dff_grouped.head())"
]
},
{
@@ -103,7 +186,7 @@
"metadata": {},
"outputs": [],
"source": [
- "s.head(10)"
+ "dff.size"
]
},
{
@@ -112,8 +195,16 @@
"metadata": {},
"outputs": [],
"source": [
- "filtered_df = df[(df['date'] > '01/01/2023') & (df['date'] < '12/31/2023')]\n",
- "filtered_df.head()"
+ "dff.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Mapping\n",
+ "\n",
+ "Let's create a geo map of the crime data."
]
},
{
@@ -123,7 +214,7 @@
"outputs": [],
"source": [
"fig = px.scatter_mapbox(\n",
- " filtered_df,\n",
+ " dff,\n",
" lat=\"lat\",\n",
" lon=\"lon\",\n",
" color=\"description\",\n",
@@ -138,7 +229,7 @@
"fig.update_layout(mapbox_style=\"open-street-map\")\n",
"fig.update_layout(margin={\"r\": 0, \"t\": 0, \"l\": 0, \"b\": 0})\n",
"fig.update_layout(mapbox_bounds={\"west\": -180, \"east\": -50, \"south\": 20, \"north\": 90})\n",
- "# fig.show()"
+ "fig.show()"
]
},
{
@@ -147,8 +238,17 @@
"metadata": {},
"outputs": [],
"source": [
- "import plotly.io as pio\n",
- "pio.write_html(fig, file=\"test.html\", auto_open=False)"
+ "# Optionally, save the figure to an HTML file\n",
+ "# pio.write_html(fig, file=\"test.html\", auto_open=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Wrapping Up\n",
+ "\n",
+ "To finish, remember to close your database connections and save any data you need."
]
},
{
@@ -157,7 +257,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# clean up and close it out\n",
+ "# clean up and close out the database\n",
"connection.commit()\n",
"connection.close()"
]