In [1]:
# Optionally, install needed libraries
!conda install -c conda-forge geocoder --yes
!pip install geopy
!pip install folium

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
# Import the necessary libraries
import json
import math
import os
from getpass import getpass

import folium
import pandas as pd
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [3]:
# Coordinates of Lincoln, used as the centre of the base map shown below
latitude, longitude = 40.806862, -96.681679
lincoln_center = [latitude, longitude]
map_LNK = folium.Map(location=lincoln_center, zoom_start=12)

map_LNK

## Important: The Foursquare API returns at most 100 venues per request from this endpoint.
The data is therefore pulled back in 3 groups (offsets 0, 100 and 200) to get all 232 results.

In [4]:
# Foursquare API credentials.
# Secrets must not be stored in the notebook itself: they are read from the
# environment and, when a variable is unset or empty, prompted for without echo.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client id: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180604'  # sent as the `v` query parameter of every request

In [5]:
# Set up the URL to fetch the first 100 results
LIMIT = 100      # number of venues requested per call
radius = 10000   # search radius passed to the API
url_template = (
    'https://api.foursquare.com/v2/venues/explore'
    '?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
)
url = url_template.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude, longitude, radius, LIMIT)

# Display the request URL with the credentials masked, so that the secret is not
# written into the saved cell output.
url_template.format('***', '***', VERSION, latitude, longitude, radius, LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=***&client_secret=***&v=20180604&ll=40.806862,-96.681679&radius=10000&limit=100'

In [6]:
# Fetch the first 100 results.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error response here rather than as a KeyError when the payload
# is read in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [7]:
# Read the total number of results reported by the API, i.e. how many need fetching
response_body = results['response']
totalResults = response_body['totalResults']
totalResults

232

In [8]:
# Set up the URL to fetch the second 100 results (101-200)
LIMIT = 100      # number of venues requested per call
offset = 100     # skip the first 100 results
radius = 10000   # search radius passed to the API
url_template = (
    'https://api.foursquare.com/v2/venues/explore'
    '?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'
)
url2 = url_template.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude, longitude, radius, LIMIT, offset)

# Display the request URL with the credentials masked, so that the secret is not
# written into the saved cell output.
url_template.format('***', '***', VERSION, latitude, longitude, radius, LIMIT, offset)

'https://api.foursquare.com/v2/venues/explore?&client_id=***&client_secret=***&v=20180604&ll=40.806862,-96.681679&radius=10000&limit=100&offset=100'

In [9]:
# Fetch the second 100 results (101-200).
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error response here rather than as a KeyError when the payload
# is read in a later cell.
response2 = requests.get(url2, timeout=30)
response2.raise_for_status()
results2 = response2.json()

In [10]:
# Set up the URL to fetch the final results (201 - 232)
LIMIT = 100      # number of venues requested per call
offset = 200     # skip the first 200 results
radius = 10000   # search radius passed to the API
url_template = (
    'https://api.foursquare.com/v2/venues/explore'
    '?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&offset={}'
)
url3 = url_template.format(CLIENT_ID, CLIENT_SECRET, VERSION, latitude, longitude, radius, LIMIT, offset)

# Display the request URL with the credentials masked, so that the secret is not
# written into the saved cell output.
url_template.format('***', '***', VERSION, latitude, longitude, radius, LIMIT, offset)

'https://api.foursquare.com/v2/venues/explore?&client_id=***&client_secret=***&v=20180604&ll=40.806862,-96.681679&radius=10000&limit=100&offset=200'

In [11]:
# Fetch the final results (201 - 232).
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error response here rather than as a KeyError when the payload
# is read in a later cell.
response3 = requests.get(url3, timeout=30)
response3.raise_for_status()
results3 = response3.json()

In [12]:
# This function will extract the category of the venue from the API dictionary
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must expose the list of categories under the key 'categories' or,
        if that key is missing, under 'venue.categories'. Each entry of the
        list is indexed with the key 'name'.

    Returns
    -------
    The 'name' of the first entry of the category list, or None when the
    list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present in `row`.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate instead of being swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [13]:
# Get the first 100 venues
venues = results['response']['groups'][0]['items']
# flatten JSON; pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
nearby_venues = pd.json_normalize(venues)

# filter columns (.copy() so the assignment below works on an independent frame)
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component of each column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Sunken Gardens,Garden,40.802319,-96.683149
1,Lincoln Children's Zoo,Zoo,40.800463,-96.680036
2,Honest Abe's Burgers & Freedom,Burger Joint,40.814234,-96.701008
3,Yia Yia's,Pizza Place,40.813599,-96.700540
4,Antelope Park,Park,40.796821,-96.674826
...,...,...,...,...
95,Jack's Bar & Grill,Bar,40.813895,-96.709815
96,Trade A Tape Comic Book Center,Comic Shop,40.812996,-96.708490
97,Mazatlan,Mexican Restaurant,40.815196,-96.625641
98,Rosie's,Bar,40.765226,-96.700645


In [14]:
# Get the second 100 venues
venues2 = results2['response']['groups'][0]['items']
# flatten JSON; pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
nearby_venues2 = pd.json_normalize(venues2)

# filter columns, using this cell's own column list
# (.copy() so the assignment below works on an independent frame)
filtered_columns2 = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues2 = nearby_venues2.loc[:, filtered_columns2].copy()

# filter the category for each row
nearby_venues2['venue.categories'] = nearby_venues2.apply(get_category_type, axis=1)

# clean columns, derived from this frame's own column names
nearby_venues2.columns = [col.split(".")[-1] for col in nearby_venues2.columns]

# Combine with the first page. pd.concat replaces DataFrame.append (removed in pandas 2).
# Only the first len(venues) rows of nearby_venues (the first page) are kept, so that
# re-running this cell does not add the second page a second time.
nearby_venues = pd.concat([nearby_venues.iloc[:len(venues)], nearby_venues2])
nearby_venues

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Sunken Gardens,Garden,40.802319,-96.683149
1,Lincoln Children's Zoo,Zoo,40.800463,-96.680036
2,Honest Abe's Burgers & Freedom,Burger Joint,40.814234,-96.701008
3,Yia Yia's,Pizza Place,40.813599,-96.700540
4,Antelope Park,Park,40.796821,-96.674826
...,...,...,...,...
95,Engine House Cafe,American Restaurant,40.857195,-96.637721
96,Marcus Edgewood Cinema,Movie Theater,40.760190,-96.642499
97,Victoria's Secret,Lingerie Store,40.742087,-96.679395
98,Pancho Villa Mexican Grill,Mexican Restaurant,40.860718,-96.640711


In [15]:
# Get the rest of the venues
venues3 = results3['response']['groups'][0]['items']
# flatten JSON; pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
nearby_venues3 = pd.json_normalize(venues3)

# filter columns, using this cell's own column list
# (.copy() so the assignment below works on an independent frame)
filtered_columns3 = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues3 = nearby_venues3.loc[:, filtered_columns3].copy()

# filter the category for each row
nearby_venues3['venue.categories'] = nearby_venues3.apply(get_category_type, axis=1)

# clean columns
nearby_venues3.columns = [col.split(".")[-1] for col in nearby_venues3.columns]

# Combine with the first two pages. pd.concat replaces DataFrame.append (removed in pandas 2).
# Only the rows of the first two pages are kept from nearby_venues, so that re-running
# this cell does not add the last page a second time.
rows_already_loaded = len(venues) + len(venues2)
nearby_venues = pd.concat([nearby_venues.iloc[:rows_already_loaded], nearby_venues3])
nearby_venues = nearby_venues.reset_index(drop=True)
nearby_venues

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Sunken Gardens,Garden,40.802319,-96.683149
1,Lincoln Children's Zoo,Zoo,40.800463,-96.680036
2,Honest Abe's Burgers & Freedom,Burger Joint,40.814234,-96.701008
3,Yia Yia's,Pizza Place,40.813599,-96.700540
4,Antelope Park,Park,40.796821,-96.674826
...,...,...,...,...
227,PepperJax Grill,American Restaurant,40.738810,-96.680150
228,ALDI,Grocery Store,40.723974,-96.681705
229,Crete Carrier/Shaffer Trucking,Building,40.816108,-96.795131
230,Hampton Inn & Suites,Hotel,40.887348,-96.678617


In [16]:
# Draw one blue circle per venue on the base map; the popup shows "name (category)"
marker_columns = nearby_venues[['lat', 'lng', 'name', 'categories']]
for lat, lng, name, categories in marker_columns.itertuples(index=False, name=None):
    popup = folium.Popup('{} ({})'.format(name, categories), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_LNK)

map_LNK

In [17]:
# This function will return the sum of squares found in the data
def calculate_wcss(data, min_clusters=2, max_clusters=20, random_state=0):
    """Fit k-means for a range of cluster counts and collect each model's inertia.

    Parameters
    ----------
    data : array-like accepted by ``KMeans.fit``
        The samples to cluster.
    min_clusters, max_clusters : int, optional
        Inclusive range of cluster counts to try. The defaults (2 to 20)
        reproduce the original fixed range and yield 19 values.
    random_state : int or None, optional
        Seed passed to ``KMeans`` so that repeated runs give the same values.

    Returns
    -------
    list
        ``kmeans.inertia_`` for each cluster count, in increasing order of
        cluster count.
    """
    wcss = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        # n_init is given explicitly so the number of restarts does not depend on the
        # default of the installed scikit-learn version.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)

    return wcss

In [18]:
# Keep only the numeric coordinate columns so we can use k-means clustering.
# The columns are selected explicitly (instead of dropping the 'str' columns) so that
# any other column present in nearby_venues cannot end up in the clustering input.
cluster_df = nearby_venues[['lat', 'lng']].copy()
cluster_df

Unnamed: 0,lat,lng
0,40.802319,-96.683149
1,40.800463,-96.680036
2,40.814234,-96.701008
3,40.813599,-96.700540
4,40.796821,-96.674826
...,...,...
227,40.738810,-96.680150
228,40.723974,-96.681705
229,40.816108,-96.795131
230,40.887348,-96.678617


In [19]:
# Within-cluster sum of squares for every cluster count tried by calculate_wcss
sum_of_squares = calculate_wcss(data=cluster_df)
sum_of_squares

[0.34804817245236686,
 0.2123621655263781,
 0.15941308819059363,
 0.1124422749740121,
 0.08793974648902303,
 0.07787546934076488,
 0.06760201626926661,
 0.058603531149823695,
 0.05307694673631565,
 0.046405418408475035,
 0.04248508032750019,
 0.03737141937875458,
 0.03418960068046657,
 0.030142252086911396,
 0.027788439492930205,
 0.025640780223738995,
 0.023750332291644125,
 0.022590054827965667,
 0.019589286338306075]

In [20]:
# This function will return the optimal number of clusters
def optimal_number_of_clusters(wcss, min_clusters=2):
    """Pick the cluster count at the "elbow" of a within-cluster sum-of-squares curve.

    The values are treated as points ``(k, wcss[i])`` with
    ``k = min_clusters + i``. A straight line is drawn from the first point to
    the last one, and the ``k`` whose point lies farthest from that line is
    returned (the first such ``k`` on ties).

    Parameters
    ----------
    wcss : sequence of numbers
        One value per cluster count, in increasing order of cluster count.
    min_clusters : int, optional
        Cluster count that ``wcss[0]`` corresponds to (default 2).

    Returns
    -------
    int
        The selected number of clusters.

    Raises
    ------
    ValueError
        If ``wcss`` is empty.
    """
    if len(wcss) == 0:
        raise ValueError("wcss must contain at least one value")

    # End points of the reference line. The last x is derived from the length of the
    # input rather than fixed at 20, so curves of any length are handled correctly.
    x1, y1 = min_clusters, wcss[0]
    x2, y2 = min_clusters + len(wcss) - 1, wcss[-1]

    # Length of the reference line; it is the same for every point, so compute it once.
    denominator = math.hypot(y2 - y1, x2 - x1)
    if denominator == 0:
        # Single value: there is no line to measure against and only one candidate.
        return min_clusters

    # Perpendicular distance of every point to the reference line.
    distances = [
        abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / denominator
        for x0, y0 in enumerate(wcss, start=min_clusters)
    ]

    return distances.index(max(distances)) + min_clusters

In [21]:
# Derive the optimal number of clusters from the sum-of-squares values
n = optimal_number_of_clusters(wcss=sum_of_squares)
n

6

In [22]:
# use the optimal number found above as the number of clusters
kclusters = n

# build the k-means model, then fit it on the clustering frame
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(cluster_df)

# show the label assigned to each row and the fitted cluster centers
print("Labels:", "\n", kmeans.labels_, "\n")
print("Cluster centers:", "\n", kmeans.cluster_centers_)

Labels: 
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 2 2 2 2 0 2 2 2 2 2 2
 0 2 2 2 2 2 2 2 0 0 2 3 0 0 2 0 2 2 2 3 2 2 2 0 2 2 2 4 0 0 4 2 2 0 3 0 0
 2 3 5 2 3 3 0 3 5 0 2 0 2 0 2 0 2 0 0 2 0 2 2 0 1 2 2 5 2 1 0 0 3 3 5 3 3
 0 0 0 5 4 1 0 0 3 3 3 3 1 1 0 4 5 1 3 0 0 3 3 4 4 3 1 3 3 5 3 4 4 0 3 3 3
 1 0 2 1 1 3 1 1 3 5 1 1 3 4 4 0 3 0 0 1 5 3 3 3 3 0 1 3 3 1 1 3 4 3 3 1 1
 4 1 5 3 1 3 1 4 3 1 0 3 1 0 3 1 3 1 1 3 4 1 3 1 1 1 4 1 1 1 1 1 1 1 3 1 4
 4 4 3 4 5 1 1 5 4 4] 

Cluster centers: 
 [[ 40.820588   -96.63913461]
 [ 40.74178794 -96.68440311]
 [ 40.81084834 -96.70181345]
 [ 40.76406174 -96.64015094]
 [ 40.86489106 -96.69252935]
 [ 40.80880071 -96.75189142]]


In [23]:
# add clustering labels to dataframe
# DataFrame.insert raises a ValueError when the column already exists, so any labels
# left over from a previous run of this cell are removed first.
nearby_venues = nearby_venues.drop(columns=['Cluster Labels'], errors='ignore')
nearby_venues.insert(0, 'Cluster Labels', kmeans.labels_)
nearby_venues

Unnamed: 0,Cluster Labels,name,categories,lat,lng
0,2,Sunken Gardens,Garden,40.802319,-96.683149
1,2,Lincoln Children's Zoo,Zoo,40.800463,-96.680036
2,2,Honest Abe's Burgers & Freedom,Burger Joint,40.814234,-96.701008
3,2,Yia Yia's,Pizza Place,40.813599,-96.700540
4,2,Antelope Park,Park,40.796821,-96.674826
...,...,...,...,...,...
227,1,PepperJax Grill,American Restaurant,40.738810,-96.680150
228,1,ALDI,Grocery Store,40.723974,-96.681705
229,5,Crete Carrier/Shaffer Trucking,Building,40.816108,-96.795131
230,4,Hampton Inn & Suites,Hotel,40.887348,-96.678617


In [36]:
# create map with clusters
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)
colors = ['#0F9D58', '#DB4437', '#4285F4', '#800080', '#ce12c0', '#171717']

# add markers to the map
for lat, lng, name, categories, cluster in zip(nearby_venues['lat'], nearby_venues['lng'], nearby_venues['name'], nearby_venues['categories'], nearby_venues['Cluster Labels']):
    label = '[{}] {} ({})'.format(cluster, name, categories)
    label = folium.Popup(label, parse_html=True)
    # Wrap around the palette so that a cluster count larger than the number of
    # colors reuses colors instead of raising an IndexError.
    marker_color = colors[int(cluster) % len(colors)]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [37]:
# Show how many venues are in each cluster
color_names = ['Dark Green', 'Red', 'Blue', 'Purple', 'Pink', 'Black']

# Count the rows per label directly from the dataframe, so the loop covers however
# many clusters are present instead of a fixed range of six.
cluster_sizes = nearby_venues['Cluster Labels'].value_counts().sort_index()
for x, venue_count in cluster_sizes.items():
    # Cycle through the names if there are more clusters than names.
    print("Color of Cluster", x, ":", color_names[x % len(color_names)])
    print("Venues found in Cluster", x, ":", venue_count)
    print("---")

Color of Cluster 0 : Dark Green
Venues found in Cluster 0 : 42
---
Color of Cluster 1 : Red
Venues found in Cluster 1 : 42
---
Color of Cluster 2 : Blue
Venues found in Cluster 2 : 67
---
Color of Cluster 3 : Purple
Venues found in Cluster 3 : 48
---
Color of Cluster 4 : Pink
Venues found in Cluster 4 : 21
---
Color of Cluster 5 : Black
Venues found in Cluster 5 : 12
---


In [26]:
# Reminder of what the dataframe looks like: one row per venue, with its cluster label,
# name, category and coordinates
nearby_venues

Unnamed: 0,Cluster Labels,name,categories,lat,lng
0,2,Sunken Gardens,Garden,40.802319,-96.683149
1,2,Lincoln Children's Zoo,Zoo,40.800463,-96.680036
2,2,Honest Abe's Burgers & Freedom,Burger Joint,40.814234,-96.701008
3,2,Yia Yia's,Pizza Place,40.813599,-96.700540
4,2,Antelope Park,Park,40.796821,-96.674826
...,...,...,...,...,...
227,1,PepperJax Grill,American Restaurant,40.738810,-96.680150
228,1,ALDI,Grocery Store,40.723974,-96.681705
229,5,Crete Carrier/Shaffer Trucking,Building,40.816108,-96.795131
230,4,Hampton Inn & Suites,Hotel,40.887348,-96.678617


In [27]:
# Count the venues per category inside every cluster, largest count first
temp_df = nearby_venues.drop(columns=['name', 'lat', 'lng'])


def _category_counts(cluster_label):
    """Return per-category row counts for one cluster, sorted in descending order."""
    members = temp_df.loc[temp_df['Cluster Labels'] == cluster_label]
    counts = members.groupby(['categories']).count()
    return counts.sort_values(by='Cluster Labels', ascending=False)


cluster0_grouped = _category_counts(0)
cluster1_grouped = _category_counts(1)
cluster2_grouped = _category_counts(2)
cluster3_grouped = _category_counts(3)
cluster4_grouped = _category_counts(4)
cluster5_grouped = _category_counts(5)

# For each cluster, print only the categories that occur more than once
grouped_by_cluster = [
    cluster0_grouped,
    cluster1_grouped,
    cluster2_grouped,
    cluster3_grouped,
    cluster4_grouped,
    cluster5_grouped,
]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    for cluster_label, grouped in enumerate(grouped_by_cluster):
        repeated = grouped.loc[grouped['Cluster Labels'] > 1]
        print("\n\n", "Cluster {}:".format(cluster_label), "\n", repeated)



 Cluster 0: 
                      Cluster Labels
categories                         
Mexican Restaurant                6
Chinese Restaurant                4
Grocery Store                     4
American Restaurant               2
Bakery                            2
Taco Place                        2


 Cluster 1: 
                       Cluster Labels
categories                          
Grocery Store                      3
Sandwich Place                     3
Burger Joint                       2
Pharmacy                           2
Mexican Restaurant                 2
Fast Food Restaurant               2
Coffee Shop                        2
Chinese Restaurant                 2
American Restaurant                2


 Cluster 2: 
                     Cluster Labels
categories                        
Coffee Shop                      5
Brewery                          4
Bar                              3
Park                             3
Italian Restaurant               2
Café        