Passenger flow is closely related with crime. The more passenger flow in a station, the more criminals there will be. Therefore, we conduct EDA to:
The color of each circle is the line of the subway and the size is the total number of passengers in 2021.
df =
passenger_df %>%
drop_na(entry_diff_imputed, exit_diff_imputed) %>%
group_by(station, service, linename, sublocality, postal_code, lat, long) %>%
summarise(total_entry = sum(entry_diff_imputed),
total_exit = sum(exit_diff_imputed)) %>%
mutate(passenger_flow = total_entry + total_exit,
# set passenger_flow to int
passenger_flow = as.integer(passenger_flow))
# df %>%
# leaflet() %>%
# addTiles() %>%
# addCircleMarkers(~long, ~lat,radius= df$passenger_flow/100000000, weight= 0.9)
qpal <- colorQuantile("YlOrRd", df$passenger_flow, n = 4)
pal <-
colorFactor(palette = c("blue", "azure4", "orange",'green','green','brown','yellow','red','forestgreen','purple'),
levels = c('8 Avenue(ACE)',
'Shuttle(S)',
'6 Avenue(BDFM)',
'Brooklyn-Queens Crosstown(G)',
'Brooklyn-Queens(G)',
'14 St-Canarsie(L)',
'Broadway(NQRW)',
'7 Avenue(123)',
'Lexington Av(456)',
'Flushing(7)'))
df %>%
mutate(service = ifelse(service == 'Brooklyn-Queens Crosstown(G)', 'Brooklyn-Queens(G)', service)) %>%
mutate(passenger_flow2 = 10*log(passenger_flow)) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircles(lng = ~long, lat = ~lat, weight = 1, stroke = FALSE,
radius = ~sqrt(passenger_flow)/20, popup = ~station, color = ~pal(service), opacity = 0.75, fillOpacity = 0.75) %>%
addLegend("topright", pal = pal, values = ~service,
title = "Subway Service", opacity = 0.75) %>%
setView(-73.8399986, 40.746739, zoom = 11)
There are patterns between station locations and total passenger flow. Big stations are mostly located in lower and middle Manhattan, and there are some sub center stations in other areas, such as 9th Street station in Brooklyn and FLUSHING-MAIN station in Queen.
df %>%
ungroup %>%
filter('sublocality' != 'None') %>%
drop_na() %>%
group_by(sublocality) %>%
summarise(passenger_flow = sum(passenger_flow)) %>%
mutate(sublocality = as.factor(sublocality)) %>%
arrange(-passenger_flow) %>%
filter(passenger_flow < 500000 |passenger_flow > 60000000) %>%
knitr::kable()
sublocality | passenger_flow |
---|---|
Manhattan | 890710230 |
Brooklyn | 288469501 |
Queens | 174401767 |
Bronx | 98407980 |
Staten Island | 297340 |
Manhattan has the most subway passengers in 2021 and Staten Island has the least subway passenger_flow. Additionally, sublocality only has 5 levels, which is too few for a machine learning model.
# cache zip boundaries that are download via tigris package
options(tigris_use_cache = TRUE)
# get zip boundaries that start with 282
char_zips = zctas(cb = TRUE)
char_zips =
char_zips %>%
rename(postal_code = GEOID10)
summary_df<-
df %>%
mutate(postal_code) %>%
group_by(postal_code) %>%
summarise(passenger_flow = sum(passenger_flow),
station_cnt = n_distinct(station, linename))
summary_df<-geo_join(char_zips,
summary_df,
by_sp = "postal_code",
by_df = "postal_code",
how = "left") %>%
filter(passenger_flow>=0)
pal <- colorNumeric(
palette = "Greens",
domain = summary_df$passenger_flow,
na.color = "white")
labels <-
paste0(
"Zip Code: ",
summary_df$postal_code, "<br/>",
"Flow of Passengers: ",
summary_df$passenger_flow) %>%
lapply(htmltools::HTML)
# summary_df2 =
# char_zips %>%
# select(postal_code) %>%
# left_join(summary_df, by = 'postal_code')
summary_df %>%
mutate(postal_code_int = as.integer(postal_code)) %>%
filter(postal_code_int >= 10000 & postal_code_int < 14900) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(passenger_flow),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(weight = 2,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels) %>%
addLegend(pal = pal,
values = ~passenger_flow,
opacity = 0.7,
title = htmltools::HTML("Total Passengers 2021"),
position = "bottomright") %>%
setView(-73.8399986, 40.746739, zoom = 10)
labels <-
paste0(
"Zip Code: ",
summary_df$postal_code, "<br/>",
"Stations count: ",
summary_df$station_cnt) %>%
lapply(htmltools::HTML)
pal <- colorNumeric(
palette = "Purples",
domain = summary_df$station_cnt,
na.color = "white")
summary_df %>%
mutate(postal_code_int = as.integer(postal_code)) %>%
filter(postal_code_int >= 10000 & postal_code_int < 14900) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(fillColor = ~pal(station_cnt),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(weight = 2,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels) %>%
addLegend(pal = pal,
values = ~station_cnt,
opacity = 0.7,
title = htmltools::HTML("Total Stations 2021"),
position = "bottomright") %>%
setView(-73.8399986, 40.746739, zoom = 10)
The zipcode does not demonstrate the exact relationship between location and passenger flow. For instance, some zipcodes such as 10002 and 10011 in lower Manhattan should have more passengers, however few stations are built there. Therefore, the key cause to this confusion is that subway stations are not built based on zipcode.
We set the number of clusters to be 8 and use Kmeans to cluster latitudes and longitudes. The color of each circle is the Kmeans cluster they belong and the size is the total number of passengers in 2021.
# conduct kmeans
df_sub =
df %>%
ungroup() %>%
select(long, lat) %>%
drop_na()
k2 = kmeans(df_sub, centers = 8, nstart = 25)
# EDA with Kmeans results
df$cluster = k2$cluster
df =
df %>%
mutate(cluster = case_when(
cluster == 1 ~ 'Queen',
cluster == 2 ~ 'Upper Manhattan',
cluster == 3 ~ 'Queen-Brooklyn',
cluster == 4 ~ 'Middle Manhattan',
cluster == 5 ~ 'Bronx',
cluster == 6 ~ 'Brooklyn',
cluster == 7 ~ 'Lower Manhattan',
cluster == 8 ~ 'Rockaway Beach',
))
pal = colorFactor(
brewer.pal(n = 10, name = "Set1"),
df$cluster,
levels = NULL,
ordered = FALSE,
na.color = "#808080",
alpha = FALSE,
reverse = FALSE
)
df %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircles(lng = ~long, lat = ~lat, weight = 1, stroke = FALSE,
radius = ~sqrt(passenger_flow)/20, popup = ~station, color = ~pal(cluster), opacity = 1, fillOpacity = 1) %>%
addLegend("topright", pal = pal, values = ~cluster,
title = "Kmeans Cluster", opacity = 1) %>%
setView(-73.8399986, 40.746739, zoom = 11)
# # Output cluster centers
# as_tibble(k2$centers) %>%
# mutate(cluster = seq(1,8,1)) %>%
# mutate(cluster = case_when(
# cluster == 1 ~ 'Queen',
# cluster == 2 ~ 'Upper Manhattan',
# cluster == 3 ~ 'Queen-Brooklyn',
# cluster == 4 ~ 'Middle Manhattan',
# cluster == 5 ~ 'Bronx',
# cluster == 6 ~ 'Brooklyn',
# cluster == 7 ~ 'Lower Manhattan',
# cluster == 8 ~ 'Rockaway Beach',
# )) %>% write.csv('data/Kmeans_centers.csv')
Kmeans algorithm cluster Manhattan into three parts: lower Manhattan, middle Manhattan and upper Manhattan. Brooklyn and Queens shares 3 cluster. Also, there is a cluster for Bronx. We think the Kmeans result is easier to interpret than that of zipcode or sublocality and it can partly represent the relationship between passenger flow and location. Therefore, we use Kmeans result as the location variable in our model.