# Initial imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
Week 8
Analyzing and Visualizing Large Datasets
- Oct 25, 2023
- Section 401
This week’s agenda: working with big data
By example:

- Open Street Map data
- Census data
- NYC taxi cab trips
# Ignore numpy warnings
"ignore"); np.seterr(
Use intake to load the dataset catalog for this week:
import intake
datasets = intake.open_catalog("./datasets.yml")
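Intake catalogs are iterable, so you can peek at the available entries (a quick sanity check, not in the original notes; the entry names depend on what datasets.yml defines):

# List the entries defined in the catalog
print(list(datasets))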
Import datashader and related modules:
# Datashader imports
import datashader as ds
import datashader.transfer_functions as tf
# Color-related imports
from datashader.colors import Greys9, viridis, inferno
from colorcet import fire
Load the Census data as a dask DataFrame:
# Load the data
# REMEMBER: this will take some time to download the first time
census_ddf = datasets.census.to_dask()
census_ddf
|                | easting | northing | race              |
|----------------|---------|----------|-------------------|
| npartitions=36 |         |          |                   |
|                | float32 | float32  | category[unknown] |
| ...            | ...     | ...      | ...               |
census_ddf.head()
|   | easting     | northing   | race |
|---|-------------|------------|------|
| 0 | -12418767.0 | 3697425.00 | h    |
| 1 | -12418512.0 | 3697143.50 | h    |
| 2 | -12418245.0 | 3697584.50 | h    |
| 3 | -12417703.0 | 3697636.75 | w    |
| 4 | -12418120.0 | 3697129.25 | h    |
Set up canvas parameters for the USA image:

from datashader.utils import lnglat_to_meters

# Sensible lat/lng coordinates for U.S. cities
# NOTE: these are in lat/lng so EPSG=4326
USA = [(-124.72, -66.95), (23.55, 50.06)]

# Get USA xlim and ylim in meters (EPSG=3857)
USA_xlim_meters, USA_ylim_meters = [list(r) for r in lnglat_to_meters(USA[0], USA[1])]

# Define a default plot width & height
plot_width = 900
plot_height = int(plot_width * 7.0 / 12)
Use a custom color scheme to map racial demographics:
= {"w": "aqua", "b": "lime", "a": "red", "h": "fuchsia", "o": "yellow"} color_key
Can we learn more than just population density and race?
We can use xarray to slice the array of aggregated pixel values to examine specific aspects of the data.
Question #1: Where do African Americans live?
Use the sel() function of the xarray array.
# Step 1: Setup canvas
cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height)

# Step 2: Aggregate and count race category
aggc = cvs.points(census_ddf, "easting", "northing", agg=ds.count_cat("race"))
aggc
<xarray.DataArray (northing: 525, easting: 900, race: 5)>
array([[[0, 0, 0, 0, 0],
        ...,
        [0, 0, 0, 0, 0]]], dtype=uint32)
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
  * race      (race) <U1 'a' 'b' 'h' 'o' 'w'
Attributes:
    x_range: (-13884029.0, -7453303.5)
    y_range: (2818291.5, 6335972.0)
# NEW: Select only African Americans (where "race" column is equal to "b")
agg_b = aggc.sel(race="b")

agg_b
<xarray.DataArray (northing: 525, easting: 900)>
array([[0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
    race      <U1 'b'
Attributes:
    x_range: (-13884029.0, -7453303.5)
    y_range: (2818291.5, 6335972.0)
# Step 3: Shade and set background
img = tf.shade(agg_b, cmap=fire, how="eq_hist")
img = tf.set_background(img, "black")

img
Question #2: How to identify diverse areas?
Goal: Select pixels where each race has a non-zero count.
=["w", "b", "a", "h"]) > 0 aggc.sel(race
<xarray.DataArray (northing: 525, easting: 900, race: 4)>
array([[[False, False, False, False],
        ...,
        [False, False, False, False]]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
  * race      (race) <U1 'w' 'b' 'a' 'h'
(aggc.sel(race=['w', 'b', 'a', 'h']) > 0).all(dim='race')
<xarray.DataArray (northing: 525, easting: 900)>
array([[False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
# Do a "logical and" operation across the "race" dimension
# Pixels will be "True" if the pixel has a positive count for each race
diverse_selection = (aggc.sel(race=['w', 'b', 'a', 'h']) > 0).all(dim='race')

diverse_selection
<xarray.DataArray (northing: 525, easting: 900)>
array([[False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
# Select the pixel values where our diverse selection criterion is True
agg2 = aggc.where(diverse_selection).fillna(0)

# and shade using our color key
img = tf.shade(agg2, color_key=color_key)
img = tf.set_background(img, "black")

img
Question #3: Where is the African American population greater than the White population?
# Select where the "b" race dimension is greater than the "w" race dimension
selection = aggc.sel(race='b') > aggc.sel(race='w')

selection
<xarray.DataArray (northing: 525, easting: 900)>
array([[False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False]])
Coordinates:
  * easting   (easting) float64 -1.388e+07 -1.387e+07 ... -7.464e+06 -7.457e+06
  * northing  (northing) float64 2.822e+06 2.828e+06 ... 6.326e+06 6.333e+06
# Select based on the selection criteria
agg3 = aggc.where(selection).fillna(0)

img = tf.shade(agg3, color_key=color_key)
img = tf.set_background(img, "black")

img
Now let’s make it interactive!
Let’s use hvplot
# Initialize hvplot and dask
import hvplot.pandas
import hvplot.dask # NEW: dask works with hvplot too!
import holoviews as hv
import geoviews as gv
To speed up interactive calculations, you can “persist” a dask DataFrame in memory (load the data fully into memory). You should have at least 16 GB of memory to avoid memory errors, though!
If not persisted, the data will be loaded on demand to avoid memory issues, which will make the interactive plots slightly slower.
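If you’re unsure whether the data will fit, one option is to estimate its in-memory footprint before persisting. A minimal sketch, assuming the dask DataFrame API mirrors pandas here:

# Estimate the dataset's total in-memory size (in GB) before calling .persist()
# (.compute() triggers the actual calculation across partitions)
nbytes = census_ddf.memory_usage(deep=True).sum().compute()
print(f"{nbytes / 1e9:.1f} GB")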
# UNCOMMENT THIS LINE IF YOU HAVE AT LEAST 16 GB OF MEMORY
census_ddf = census_ddf.persist()
census_ddf
|                | easting | northing | race              |
|----------------|---------|----------|-------------------|
| npartitions=36 |         |          |                   |
|                | float32 | float32  | category[unknown] |
| ...            | ...     | ...      | ...               |
census_ddf.hvplot
<hvplot.plotting.core.hvPlotTabular at 0x295bcf3a0>
# Plot the points
points = census_ddf.hvplot.points(
    x="easting",
    y="northing",
    datashade=True,  # NEW: tell hvplot to use datashader!
    aggregator=ds.count(),  # NEW: how to aggregate
    cmap=fire,
    geo=True,
    crs=3857,  # Input data is in 3857, so we need to tell hvplot
    frame_width=plot_width,
    frame_height=plot_height,
    xlim=USA_xlim_meters,  # NEW: Specify the xbounds in meters (EPSG=3857)
    ylim=USA_ylim_meters,  # NEW: Specify the ybounds in meters (EPSG=3857)
)
# Put a tile source behind it
bg = gv.tile_sources.CartoDark

bg * points
Note: interactive features (panning, zooming, etc.) can be slow, but the map will eventually re-load!
We can visualize color-coded race interactively as well
Similar syntax to previous examples…
# Points with categorical colormap
race_map = census_ddf.hvplot.points(
    x="easting",
    y="northing",
    datashade=True,
    c="race",  # NEW: color pixels by "race" column
    aggregator=ds.count_cat("race"),  # NEW: specify the aggregator
    cmap=color_key,  # NEW: use our custom color map dictionary
    crs=3857,
    geo=True,
    frame_width=plot_width,
    frame_height=plot_height,
    xlim=USA_xlim_meters,
    ylim=USA_ylim_meters,
)

bg = gv.tile_sources.CartoDark

bg * race_map
Use case: exploring gerrymandering
We can easily overlay Congressional districts on our map…
# Load congressional districts and convert to EPSG=3857
districts = gpd.read_file('./data/cb_2015_us_cd114_5m').to_crs(epsg=3857)
# Plot the district map
districts_map = districts.hvplot.polygons(
    geo=True,
    crs=3857,
    line_color="white",
    fill_alpha=0,
    frame_width=plot_width,
    frame_height=plot_height,
    xlim=USA_xlim_meters,
    ylim=USA_ylim_meters
)

bg * districts_map
# Combine the background, race map, and districts into a single map
img = bg * race_map * districts_map

img
Example 3: NYC taxi data
12 million taxi trips from 2015…
# Load from our intake catalog
# Remember: this will take some time to download the first time!
taxi_ddf = datasets.nyc_taxi_wide.to_dask()
taxi_ddf
|               | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_x | pickup_y | dropoff_x | dropoff_y | fare_amount | tip_amount | dropoff_hour | pickup_hour |
|---------------|----------------------|-----------------------|-----------------|---------------|----------|----------|-----------|-----------|-------------|------------|--------------|-------------|
| npartitions=1 |                      |                       |                 |               |          |          |           |           |             |            |              |             |
|               | datetime64[ns]       | datetime64[ns]        | uint8           | float32       | float32  | float32  | float32   | float32   | float32     | float32    | uint8        | uint8       |
| ...           | ...                  | ...                   | ...             | ...           | ...      | ...      | ...       | ...       | ...         | ...        | ...          | ...         |
print(f"{len(taxi_ddf)} Rows")
print(f"Columns: {list(taxi_ddf.columns)}")
11842094 Rows
Columns: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'fare_amount', 'tip_amount', 'dropoff_hour', 'pickup_hour']
taxi_ddf.head()
|   | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_x   | pickup_y  | dropoff_x  | dropoff_y | fare_amount | tip_amount | dropoff_hour | pickup_hour |
|---|----------------------|-----------------------|-----------------|---------------|------------|-----------|------------|-----------|-------------|------------|--------------|-------------|
| 0 | 2015-01-15 19:05:39  | 2015-01-15 19:23:42   | 1               | 1.59          | -8236963.0 | 4975552.5 | -8234835.5 | 4975627.0 | 12.0        | 3.25       | 19           | 19          |
| 1 | 2015-01-10 20:33:38  | 2015-01-10 20:53:28   | 1               | 3.30          | -8237826.0 | 4971752.5 | -8237020.5 | 4976875.0 | 14.5        | 2.00       | 20           | 20          |
| 2 | 2015-01-10 20:33:38  | 2015-01-10 20:43:41   | 1               | 1.80          | -8233561.5 | 4983296.5 | -8232279.0 | 4986477.0 | 9.5         | 0.00       | 20           | 20          |
| 3 | 2015-01-10 20:33:39  | 2015-01-10 20:35:31   | 1               | 0.50          | -8238654.0 | 4970221.0 | -8238124.0 | 4971127.0 | 3.5         | 0.00       | 20           | 20          |
| 4 | 2015-01-10 20:33:39  | 2015-01-10 20:52:58   | 1               | 3.00          | -8234433.5 | 4977363.0 | -8238107.5 | 4974457.0 | 15.0        | 0.00       | 20           | 20          |
# Trim to the columns we need
taxi_ddf = taxi_ddf[
    [
        "passenger_count",
        "pickup_x",
        "pickup_y",
        "dropoff_x",
        "dropoff_y",
        "dropoff_hour",
        "pickup_hour",
    ]
]
Exploring the taxi pick ups…
pickups_map = taxi_ddf.hvplot.points(
    x="pickup_x",
    y="pickup_y",
    cmap=fire,
    datashade=True,
    frame_width=800,
    frame_height=600,
    geo=True,
    crs=3857
)

gv.tile_sources.CartoDark * pickups_map
Group by the hour column to add a slider widget:
pickups_map = taxi_ddf.hvplot.points(
    x="pickup_x",
    y="pickup_y",
    groupby="pickup_hour",
    cmap=fire,
    datashade=True,
    frame_width=800,
    frame_height=600,
    geo=True,
    crs=3857
)

gv.tile_sources.CartoDark * pickups_map
Comparing pickups and dropoffs
- Pixels with more pickups: shaded red
- Pixels with more dropoffs: shaded blue
# Bounds in meters for NYC (EPSG=3857)
NYC = [(-8242000, -8210000), (4965000, 4990000)]

# Set a plot width and height
plot_width = int(750)
plot_height = int(plot_width // 1.2)

w = plot_width
h = plot_height

x_range = NYC[0]
y_range = NYC[1]
# Step 1: Create the canvas
cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)

# Step 2: Aggregate the pick ups
picks = cvs.points(taxi_ddf, "pickup_x", "pickup_y", ds.count("passenger_count"))

# Step 2: Aggregate the drop offs
drops = cvs.points(taxi_ddf, "dropoff_x", "dropoff_y", ds.count("passenger_count"))

# Rename to the same coordinate names
drops = drops.rename({"dropoff_x": "x", "dropoff_y": "y"})
picks = picks.rename({"pickup_x": "x", "pickup_y": "y"})
picks
<xarray.DataArray (y: 625, x: 750)>
array([[0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)
Coordinates:
  * x        (x) float64 -8.242e+06 -8.242e+06 ... -8.21e+06 -8.21e+06
  * y        (y) float64 4.965e+06 4.965e+06 4.965e+06 ... 4.99e+06 4.99e+06
Attributes:
    x_range: (-8242000, -8210000)
    y_range: (4965000, 4990000)
drops
<xarray.DataArray (y: 625, x: 750)>
array([[0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint32)
Coordinates:
  * x        (x) float64 -8.242e+06 -8.242e+06 ... -8.21e+06 -8.21e+06
  * y        (y) float64 4.965e+06 4.965e+06 4.965e+06 ... 4.99e+06 4.99e+06
Attributes:
    x_range: (-8242000, -8210000)
    y_range: (4965000, 4990000)
def create_merged_taxi_image(
    x_range, y_range, w=plot_width, h=plot_height, how="eq_hist"
):
    """
    Create a merged taxi image, showing areas with:

    - More pickups than dropoffs in red
    - More dropoffs than pickups in blue
    """
    # Step 1: Create the canvas
    cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)

    # Step 2: Aggregate the pick ups
    picks = cvs.points(taxi_ddf, "pickup_x", "pickup_y", ds.count("passenger_count"))

    # Step 2: Aggregate the drop offs
    drops = cvs.points(taxi_ddf, "dropoff_x", "dropoff_y", ds.count("passenger_count"))

    # Rename to the same coordinate names
    drops = drops.rename({"dropoff_x": "x", "dropoff_y": "y"})
    picks = picks.rename({"pickup_x": "x", "pickup_y": "y"})

    # Step 3: Shade pixels where there are more drop offs than pick ups
    # These are colored blue
    more_drops = tf.shade(
        drops.where(drops > picks), cmap=["darkblue", "cornflowerblue"], how=how
    )

    # Step 3: Shade pixels where there are more pick ups than drop offs
    # These are colored red
    more_picks = tf.shade(
        picks.where(picks > drops), cmap=["darkred", "orangered"], how=how
    )

    # Step 4: Combine the images together!
    img = tf.stack(more_picks, more_drops)

    return tf.set_background(img, "black")

create_merged_taxi_image(NYC[0], NYC[1])
Takeaway: pickups occur more often on major roads, and dropoffs on smaller roads
Let’s generate a time-lapse GIF of drop-offs over time
Powerful tool for visualizing trends over time
Define some functions…
Important: We can convert our datashaded images to the Python Imaging Library (PIL) format to visualize them with matplotlib.
def create_taxi_image(df, x_range, y_range, w=plot_width, h=plot_height, cmap=fire):
    """Create an image of taxi dropoffs, returning a Python Imaging Library (PIL) image."""
    # Step 1: Create the canvas
    cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)

    # Step 2: Aggregate the dropoff positions, counting number of passengers
    agg = cvs.points(df, 'dropoff_x', 'dropoff_y', ds.count('passenger_count'))

    # Step 3: Shade
    img = tf.shade(agg, cmap=cmap, how='eq_hist')

    # Set the background
    img = tf.set_background(img, "black")

    # NEW: return a PIL image
    return img.to_pil()
def convert_to_12hour(hr24):
    """Convert from 24 hr to 12 hr."""
    from datetime import datetime
    d = datetime.strptime(str(hr24), "%H")
    return d.strftime("%I %p")
def plot_dropoffs_by_hour(fig, data_all_hours, hour, x_range, y_range):
    """Plot the dropoffs for a particular hour."""
    # Trim to the specific hour
    df_this_hour = data_all_hours.loc[data_all_hours["dropoff_hour"] == hour]

    # Create the datashaded image for this hour
    img = create_taxi_image(df_this_hour, x_range, y_range)

    # Plot the image on a matplotlib axes using imshow()
    plt.clf()
    ax = fig.gca()
    ax.imshow(img, extent=[x_range[0], x_range[1], y_range[0], y_range[1]])

    # Format the axis and figure
    ax.set_aspect("equal")
    ax.set_axis_off()
    fig.subplots_adjust(left=0, right=1, top=1, bottom=0)

    # Optional: Add a text label for the hour
    ax.text(
        0.05,
        0.9,
        convert_to_12hour(hour),
        color="white",
        fontsize=40,
        ha="left",
        transform=ax.transAxes,
    )

    # Draw the figure and return the image
    # This converts our matplotlib Figure into a format readable by imageio
    fig.canvas.draw()
    image = np.frombuffer(fig.canvas.tostring_rgb(), dtype="uint8")
    image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,))

    return image
Strategy:

- Create a datashaded image for each hour of taxi dropoffs, returned as a PIL image object
- Use matplotlib’s imshow() to plot each datashaded image to a matplotlib Figure
- Return each matplotlib Figure in a format readable by the imageio library
- Combine all of the hourly images into a GIF using the imageio library
import imageio
# Create a figure
fig, ax = plt.subplots(figsize=(10, 10), facecolor="black")

# Create an image for each hour
imgs = []
for hour in range(24):
    # Plot the datashaded image for this specific hour
    print(hour)
    img = plot_dropoffs_by_hour(fig, taxi_ddf, hour, x_range=NYC[0], y_range=NYC[1])
    imgs.append(img)

# Combine the images for each hour into a single GIF
imageio.mimsave("dropoffs.gif", imgs, duration=1000);
0
1
2
...
23
Interesting aside: Beyond hvplot
Analyzing hourly and weekly trends for taxis using holoviews
- We’ll load taxi data from 2016 that includes the number of pickups per hour.
- Visualize weekly and hourly trends using a radial heatmap
df = pd.read_csv('./data/nyc_taxi_2016_by_hour.csv.gz', parse_dates=['Pickup_date'])
df.head()
|   | Pickup_date         | Pickup_Count |
|---|---------------------|--------------|
| 0 | 2016-01-01 00:00:00 | 19865        |
| 1 | 2016-01-01 01:00:00 | 24376        |
| 2 | 2016-01-01 02:00:00 | 24177        |
| 3 | 2016-01-01 03:00:00 | 21538        |
| 4 | 2016-01-01 04:00:00 | 15665        |
Let’s plot a radial heatmap
The binning dimensions of the heat map will be:
- Day of Week and Hour of Day
- Week of Year
A radial heatmap can be read similarly to tree rings:

- The center of the heatmap represents the first week of the year, while the outer edge is the last week of the year
- Rotating clockwise along a specific ring tells you the day/hour

The binning columns used below are derived from Pickup_date; see the sketch that follows.
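Note that the raw CSV only contains Pickup_date and Pickup_Count, while the HeatMap call below expects "Day & Hour", "Week of Year", and "Date" columns. The notes don’t show that step; a minimal sketch of how such columns could be derived (the exact definitions here are assumptions, not the original code):

# Hypothetical derivation of the binning columns used by the HeatMap
# (this step is omitted in the original notes)
df["Date"] = df["Pickup_date"].dt.date.astype(str)
df["Week of Year"] = df["Pickup_date"].dt.isocalendar().week
# Combine day-of-week and hour-of-day into a single angular dimension
df["Day & Hour"] = df["Pickup_date"].dt.dayofweek * 24 + df["Pickup_date"].dt.hour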
# Create a Holoviews HeatMap object
cols = ["Day & Hour", "Week of Year", "Pickup_Count", "Date"]
heatmap = hv.HeatMap(
    df[cols], kdims=["Day & Hour", "Week of Year"], vdims=["Pickup_Count", "Date"]
)
heatmap.opts(
    radial=True,
    height=600,
    width=600,
    yticks=None,
    xmarks=7,
    ymarks=3,
    start_angle=np.pi * 19 / 14,
    xticks=(
        "Friday",
        "Saturday",
        "Sunday",
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
    ),
    tools=["hover"],
    cmap="fire"
)
Trends
- Taxi pickup counts are high between 7-9am and 5-10pm on weekdays, which matches business hours as expected. In contrast, on weekends, there is not much going on until 11am.
- Friday and Saturday nights clearly stand out with the highest pickup densities, as expected.
- Public holidays can be easily identified. For example, taxi pickup counts are comparatively low around Christmas and Thanksgiving.
- Weather phenomena also influence taxi service. There is a very dark stripe at the beginning of the year, starting on Saturday the 23rd and lasting until Sunday the 24th; this coincides with one of the biggest blizzards in the history of NYC.
Useful reference: the Holoviews example gallery
This radial heatmap example, and many more examples beyond hvplot, are available in the HoloViews example gallery.
Exercise: Datashading Philly parking violations data
Download the data
- A (large) CSV of parking violation data is available for download at: https://musa550.s3.amazonaws.com/parking_violations.csv
- Open the above URL in your browser to download the data
- The data is from Open Data Philly: https://www.opendataphilly.org/dataset/parking-violations
- Input data is in EPSG=4326
- Remember: You will need to convert latitude/longitude to Web Mercator (epsg=3857) to work with datashader (see the conversion sketch below)
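For reference, lnglat_to_meters implements the standard spherical Web Mercator conversion. A minimal sketch of the math (illustrative only, not datashader’s actual code):

import numpy as np

def lnglat_to_web_mercator(lon, lat):
    """Sketch of the lng/lat -> EPSG:3857 conversion (illustrative only)."""
    origin_shift = np.pi * 6378137  # half the Web Mercator sphere's circumference
    x = np.asarray(lon) * origin_shift / 180.0
    y = np.log(np.tan((90.0 + np.asarray(lat)) * np.pi / 360.0)) * origin_shift / np.pi
    return x, y

# e.g. lon = -75.28 maps to x of about -8380131, matching the x_range computed in Step 4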
Step 1: Use dask to load the data

- The dask.dataframe module includes a read_csv() function just like pandas
- You’ll want to specify the assume_missing=True keyword for that function: that will let dask know that some columns are allowed to have missing values
import dask.dataframe as dd
# I downloaded the data and moved it to the "data/" folder
df = dd.read_csv("data/parking_violations.csv", assume_missing=True)
df
|               | lon     | lat     |
|---------------|---------|---------|
| npartitions=4 |         |         |
|               | float64 | float64 |
| ...           | ...     | ...     |
len(df)
9412858
df.head()
|   | lon        | lat       |
|---|------------|-----------|
| 0 | -75.158937 | 39.956252 |
| 1 | -75.154730 | 39.955233 |
| 2 | -75.172386 | 40.034175 |
| 3 | NaN        | NaN       |
| 4 | -75.157291 | 39.952661 |
Step 2: Remove any rows with missing geometries
Remove rows that have NaN for either the lat or lon columns (hint: use the dropna() function!)

df = df.dropna()
df
|               | lon     | lat     |
|---------------|---------|---------|
| npartitions=4 |         |         |
|               | float64 | float64 |
| ...           | ...     | ...     |
len(df)
8659655
Step 3: Convert lat/lng to Web Mercator coordinates (x, y)
Add two new columns, x and y, that represent the coordinates in the EPSG=3857 CRS.
Hint: Use datashader’s lnglat_to_meters() function.
from datashader.utils import lnglat_to_meters
# Do the conversion
x, y = lnglat_to_meters(df['lon'], df['lat'])

# Add as columns
df['x'] = x
df['y'] = y
df.head()
|   | lon        | lat       | x             | y            |
|---|------------|-----------|---------------|--------------|
| 0 | -75.158937 | 39.956252 | -8.366655e+06 | 4.859587e+06 |
| 1 | -75.154730 | 39.955233 | -8.366186e+06 | 4.859439e+06 |
| 2 | -75.172386 | 40.034175 | -8.368152e+06 | 4.870910e+06 |
| 4 | -75.157291 | 39.952661 | -8.366471e+06 | 4.859066e+06 |
| 5 | -75.162902 | 39.959713 | -8.367096e+06 | 4.860090e+06 |
df
|               | lon     | lat     | x       | y       |
|---------------|---------|---------|---------|---------|
| npartitions=4 |         |         |         |         |
|               | float64 | float64 | float64 | float64 |
| ...           | ...     | ...     | ...     | ...     |
Step 4: Get the x/y range for Philadelphia for our canvas
- Convert the lat/lng bounding box into Web Mercator EPSG=3857
- Use the lnglat_to_meters() function to do the conversion
- You should have two variables x_range and y_range that give you the corresponding x and y bounds
# Use lat/lng bounds for Philly
# This will exclude any points that fall outside this region
PhillyBounds = [(-75.28, -74.96), (39.86, 40.14)]

PhillyBoundsLng = PhillyBounds[0]
PhillyBoundsLat = PhillyBounds[1]

# Convert to EPSG=3857
x_range, y_range = lnglat_to_meters(PhillyBoundsLng, PhillyBoundsLat)
x_range
array([-8380131.26691764, -8344509.02986379])
# Optional: convert to lists as opposed to arrays
x_range = list(x_range)
y_range = list(y_range)
x_range
[-8380131.266917636, -8344509.029863787]
Step 5: Datashade the dataset
Create a matplotlib figure with the datashaded image of the parking violation dataset.
# STEP 1: Create the canvas
cvs = ds.Canvas(plot_width=600, plot_height=600, x_range=x_range, y_range=y_range)

# STEP 2: Aggregate the points
agg = cvs.points(df, "x", "y", agg=ds.count())

# STEP 3: Shade the aggregated pixels
img = tf.shade(agg, cmap=fire, how="eq_hist")

# Optional: Set the background of the image
img = tf.set_background(img, "black")
# Show!
img
type(img)
datashader.transfer_functions.Image
Let’s add the city limits.
You’ll need to convert your datashaded image to PIL format and use the imshow() function.

# Load
city_limits = gpd.read_file("./data/City_Limits.geojson")

# Same CRS!
city_limits = city_limits.to_crs(epsg=3857)
# Show with matplotlib
fig, ax = plt.subplots(figsize=(10, 10))

# NEW: plot the datashaded image, converted to PIL format
ax.imshow(img.to_pil(), extent=[x_range[0], x_range[1], y_range[0], y_range[1]])

# Format
ax.set_aspect("equal")
ax.set_axis_off()

# Add the city limits on top!
city_limits.plot(ax=ax, facecolor="none", edgecolor="white", linewidth=2)
<Axes: >
Step 6: Make an interactive map
Use hvplot to make an interactive version of your datashaded image.
violations = df.hvplot.points(
    x="x",
    y="y",
    datashade=True,
    geo=True,
    crs=3857,
    frame_width=600,
    frame_height=600,
    cmap=fire,
    xlim=x_range,
    ylim=y_range,
)

gv.tile_sources.CartoDark * violations
That’s it!
- Next week: dashboards and presenting your results on the web!
- See you on Monday!