diff --git a/NAMESPACE b/NAMESPACE index 4175ba9e..2bf2ca6c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(filter_feed_by_trips) export(filter_stop_times) export(filter_stops) export(get_feedlist) +export(get_route_frequency) export(get_route_geometry) export(get_stop_frequency) export(get_trip_geometry) diff --git a/R/frequencies.R b/R/frequencies.R index fffda5cd..7c598395 100644 --- a/R/frequencies.R +++ b/R/frequencies.R @@ -1,18 +1,24 @@ #' Get Stop Frequency #' -#' Note that some GTFS feeds contain a frequency data frame already. +#' Calculate the number of departures and mean headways for all stops within a +#' given timespan and for given service_ids. +#' +#' @note Some GTFS feeds contain a frequency data frame already. #' Consider using this instead, as it will be more accurate than what -#' tidytransit calculates. +#' tidytransit calculates. #' #' @param gtfs_obj a list of gtfs dataframes as read by [read_gtfs()]. -#' @param start_hour (optional) an integer indicating the start hour (default 6) -#' @param end_hour (optional) an integer indicating the end hour (default 22) -#' @param service_ids (optional) a set of service_ids from the calendar dataframe -#' identifying a particular service id. If not provided the service_id -#' with the most departures is used -#' @param by_route default TRUE, if FALSE then calculate headway for any line coming through the stop in the same direction on the same schedule. +#' @param start_time analysis start time, can be given as "HH:MM:SS", +#' hms object or numeric value in seconds. +#' @param end_time analysis perdiod end time, can be given as "HH:MM:SS", +#' hms object or numeric value in seconds. +#' @param service_ids A set of service_ids from the calendar dataframe +#' identifying a particular service id. If not provided, the service_id +#' with the most departures is used. +#' @param by_route Default TRUE, if FALSE then calculate headway for any line coming +#' through the stop in the same direction on the same schedule. #' @return dataframe of stops with the number of departures and the headway -#' (departures divided by timespan) as columns. +#' (departures divided by timespan) in seconds as columns #' #' @importFrom dplyr %>% #' @importFrom rlang .data !! quo enquo @@ -24,11 +30,16 @@ #' x <- order(stop_frequency$mean_headway) #' head(stop_frequency[x,]) get_stop_frequency <- function(gtfs_obj, - start_hour = 6, - end_hour = 22, + start_time = "06:00:00", + end_time = "22:00:00", service_ids = NULL, - by_route = FALSE) { + by_route = TRUE) { n_deps <- direction_id <- NULL + + if(is.character(start_time)) start_time <- hhmmss_to_seconds(start_time) + if(is.character(end_time)) end_time <- hhmmss_to_seconds(end_time) + + # get service id with most departures if(is.null(service_ids)) { dep_per_trip = gtfs_obj$stop_times %>% dplyr::group_by(trip_id) %>% dplyr::count(name = "n_deps") %>% @@ -39,27 +50,82 @@ get_stop_frequency <- function(gtfs_obj, dplyr::arrange(dplyr::desc(n_deps)) service_ids = dep_per_service_id$service_id[1] } + + # filter stop_times to service_ids and start/end_time trips = gtfs_obj$trips %>% filter(service_id %in% service_ids) - # TODO change times to hms or strings stop_times = gtfs_obj$stop_times %>% filter(trip_id %in% trips$trip_id) %>% - filter(departure_time >= start_hour*3600 & arrival_time <= end_hour*3600) %>% + filter(departure_time >= start_time & arrival_time <= end_time) %>% left_join(trips[c("trip_id", "route_id", "direction_id", "service_id")], "trip_id") - # find number of departure per stop_id, route_id, service_id + # find number of departure per stop_id (route_id, direction_id, service_id) if(by_route) { - freq = stop_times %>% - dplyr::group_by(stop_id, route_id, direction_id, service_id) %>% - dplyr::count(name = "n_departures") %>% dplyr::ungroup() + freq = stop_times %>% + dplyr::group_by(stop_id, route_id, direction_id, service_id) %>% + dplyr::count(name = "n_departures") %>% dplyr::ungroup() } else { freq = stop_times %>% dplyr::group_by(stop_id, service_id) %>% dplyr::count(name = "n_departures") %>% dplyr::ungroup() } + # calculate average headway - duration = (end_hour-start_hour)*3600 + duration = as.numeric(end_time-start_time) freq$mean_headway <- round(duration / freq$n_departures) - + freq } + +#' Get Route Frequency +#' +#' Calculate the number of departures and mean headways for routes within a given timespan +#' and for given service_ids. +#' +#' @note Some GTFS feeds contain a frequency data frame already. +#' Consider using this instead, as it will be more accurate than what +#' tidytransit calculates. +#' +#' @param gtfs_obj a list of gtfs dataframes as read by the trread package. +#' @param start_time analysis start time, can be given as "HH:MM:SS", +#' hms object or numeric value in seconds. +#' @param end_time analysis perdiod end time, can be given as "HH:MM:SS", +#' hms object or numeric value in seconds. +#' @param service_ids A set of service_ids from the calendar dataframe +#' identifying a particular service id. If not provided, the service_id +#' with the most departures is used. +#' @return a dataframe of routes with variables or headway/frequency in seconds for a route +#' within a given time frame +#' @export +#' @examples +#' data(gtfs_duke) +#' routes_frequency <- get_route_frequency(gtfs_duke) +#' x <- order(routes_frequency$median_headways) +#' head(routes_frequency[x,]) +get_route_frequency <- function(gtfs_obj, + start_time = "06:00:00", + end_time = "22:00:00", + service_ids = NULL) { + total_departures <- median_headways <- mean_headways <- NULL + n_departures <- mean_headway <- st_dev_headways <- stop_count <- NULL + if(feed_contains(gtfs_obj, "frequencies") && nrow(gtfs_obj$frequencies) > 0) { + message("A pre-calculated frequencies dataframe exists for this feed already, + consider using that.") + } + departures_per_stop = get_stop_frequency(gtfs_obj, start_time, end_time, + service_ids, by_route = TRUE) + + if(dim(departures_per_stop)[[1]] != 0) { + routes_frequency = departures_per_stop %>% + group_by(route_id) %>% + summarise(total_departures = sum(n_departures), + median_headways = round(median(mean_headway)), + mean_headways = round(mean(mean_headway)), + st_dev_headways = round(sd(mean_headway), 2), + stop_count = dplyr::n()) + } else { + warning("Failed to calculate frequency, try passing a service_id from calendar_df.") + } + + return(routes_frequency) +} diff --git a/R/joins.R b/R/joins.R index 046f81f7..d56690aa 100644 --- a/R/joins.R +++ b/R/joins.R @@ -154,7 +154,7 @@ filter_feed_by_stops = function(gtfs_obj, stop_ids = NULL, stop_names = NULL) { #' \code{\link{filter_feed_by_trips}}, \code{\link{filter_feed_by_date}} #' @export filter_feed_by_date = function(gtfs_obj, extract_date, - min_departure_time = "00:00:00", max_arrival_time = "48:00:00") { + min_departure_time, max_arrival_time) { st = filter_stop_times(gtfs_obj, extract_date, min_departure_time, max_arrival_time) st <- dplyr::as_tibble(st) attributes(st)$stops <- NULL diff --git a/R/raptor.R b/R/raptor.R index 320b0052..7666e5bd 100644 --- a/R/raptor.R +++ b/R/raptor.R @@ -440,10 +440,10 @@ travel_times = function(filtered_stop_times, #' #' @param gtfs_obj a gtfs feed #' @param extract_date date to extract trips from this day (Date or "YYYY-MM-DD" string) -#' @param min_departure_time The earliest departure time. Can be given as "HH:MM:SS", +#' @param min_departure_time (optional) The earliest departure time. Can be given as "HH:MM:SS", #' hms object or numeric value in seconds. -#' @param max_arrival_time The latest arrival time. Can be given as "HH:MM:SS", -#' hms object or numeric value in seconds +#' @param max_arrival_time (optional) The latest arrival time. Can be given as "HH:MM:SS", +#' hms object or numeric value in seconds. #' #' @return Filtered `stop_times` data.table for [travel_times()] and [raptor()]. #' @@ -462,10 +462,14 @@ filter_stop_times = function(gtfs_obj, if(is.character(extract_date)) { extract_date <- as.Date(extract_date) } - if(is.character(min_departure_time)) { + if(missing(min_departure_time)) { + min_departure_time <- 0 + } else if(is.character(min_departure_time)) { min_departure_time <- hhmmss_to_seconds(min_departure_time) } - if(is.character(max_arrival_time)) { + if(missing(max_arrival_time)) { + max_arrival_time <- max(gtfs_obj$stop_times$arrival_time)+1 + } else if(is.character(max_arrival_time)) { max_arrival_time <- hhmmss_to_seconds(max_arrival_time) } min_departure_time <- as.numeric(min_departure_time) diff --git a/README.md b/README.md index 1a1628f1..d5dd2f90 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ status](https://github.com/r-transit/tidytransit/workflows/R-CMD-check/badge.svg # tidytransit Use tidytransit to map transit stops and routes, calculate travel times and transit -frequencies, and validate transit feeds. tidytransit reads the +frequencies, and validate transit feeds. Tidytransit reads the [General Transit Feed Specification](http://gtfs.org/) into [tidyverse](https://tibble.tidyverse.org/) and [simple features](https://en.wikipedia.org/wiki/Simple_Features) data frames. @@ -23,8 +23,9 @@ Tidytransit can be used to: Have a look at the following vignettes to see how tidytransit can be used to analyse a feed: - [the tutorial](http://tidytransit.r-transit.org/articles/introduction.html) -- [introduction to service patterns](http://tidytransit.r-transit.org/articles/servicepatterns.html) -- [introduction to time tables](http://tidytransit.r-transit.org/articles/timetable.html) +- [calendar and service patterns](http://tidytransit.r-transit.org/articles/servicepatterns.html) +- [create time tables for stops](http://tidytransit.r-transit.org/articles/timetable.html) +- [frequency and headway calculations](http://tidytransit.r-transit.org/articles/frequency.html) ## Installation diff --git a/man/filter_feed_by_date.Rd b/man/filter_feed_by_date.Rd index fea767b6..27afc7fb 100644 --- a/man/filter_feed_by_date.Rd +++ b/man/filter_feed_by_date.Rd @@ -7,8 +7,8 @@ filter_feed_by_date( gtfs_obj, extract_date, - min_departure_time = "00:00:00", - max_arrival_time = "48:00:00" + min_departure_time, + max_arrival_time ) } \arguments{ @@ -16,11 +16,11 @@ filter_feed_by_date( \item{extract_date}{date to extract trips from this day (Date or "YYYY-MM-DD" string)} -\item{min_departure_time}{The earliest departure time. Can be given as "HH:MM:SS", +\item{min_departure_time}{(optional) The earliest departure time. Can be given as "HH:MM:SS", hms object or numeric value in seconds.} -\item{max_arrival_time}{The latest arrival time. Can be given as "HH:MM:SS", -hms object or numeric value in seconds} +\item{max_arrival_time}{(optional) The latest arrival time. Can be given as "HH:MM:SS", +hms object or numeric value in seconds.} } \value{ tidygtfs object with filtered tables diff --git a/man/filter_stop_times.Rd b/man/filter_stop_times.Rd index ff3ec3e7..c5332b23 100644 --- a/man/filter_stop_times.Rd +++ b/man/filter_stop_times.Rd @@ -11,11 +11,11 @@ filter_stop_times(gtfs_obj, extract_date, min_departure_time, max_arrival_time) \item{extract_date}{date to extract trips from this day (Date or "YYYY-MM-DD" string)} -\item{min_departure_time}{The earliest departure time. Can be given as "HH:MM:SS", +\item{min_departure_time}{(optional) The earliest departure time. Can be given as "HH:MM:SS", hms object or numeric value in seconds.} -\item{max_arrival_time}{The latest arrival time. Can be given as "HH:MM:SS", -hms object or numeric value in seconds} +\item{max_arrival_time}{(optional) The latest arrival time. Can be given as "HH:MM:SS", +hms object or numeric value in seconds.} } \value{ Filtered \code{stop_times} data.table for \code{\link[=travel_times]{travel_times()}} and \code{\link[=raptor]{raptor()}}. diff --git a/man/get_route_frequency.Rd b/man/get_route_frequency.Rd new file mode 100644 index 00000000..676282f5 --- /dev/null +++ b/man/get_route_frequency.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/frequencies.R +\name{get_route_frequency} +\alias{get_route_frequency} +\title{Get Route Frequency} +\usage{ +get_route_frequency( + gtfs_obj, + start_time = "06:00:00", + end_time = "22:00:00", + service_ids = NULL +) +} +\arguments{ +\item{gtfs_obj}{a list of gtfs dataframes as read by the trread package.} + +\item{start_time}{analysis start time, can be given as "HH:MM:SS", +hms object or numeric value in seconds.} + +\item{end_time}{analysis perdiod end time, can be given as "HH:MM:SS", +hms object or numeric value in seconds.} + +\item{service_ids}{A set of service_ids from the calendar dataframe +identifying a particular service id. If not provided, the service_id +with the most departures is used.} +} +\value{ +a dataframe of routes with variables or headway/frequency in seconds for a route +within a given time frame +} +\description{ +Calculate the number of departures and mean headways for routes within a given timespan +and for given service_ids. +} +\note{ +Some GTFS feeds contain a frequency data frame already. +Consider using this instead, as it will be more accurate than what +tidytransit calculates. +} +\examples{ +data(gtfs_duke) +routes_frequency <- get_route_frequency(gtfs_duke) +x <- order(routes_frequency$median_headways) +head(routes_frequency[x,]) +} diff --git a/man/get_stop_frequency.Rd b/man/get_stop_frequency.Rd index a8919750..830e0bbb 100644 --- a/man/get_stop_frequency.Rd +++ b/man/get_stop_frequency.Rd @@ -6,31 +6,38 @@ \usage{ get_stop_frequency( gtfs_obj, - start_hour = 6, - end_hour = 22, + start_time = "06:00:00", + end_time = "22:00:00", service_ids = NULL, - by_route = FALSE + by_route = TRUE ) } \arguments{ \item{gtfs_obj}{a list of gtfs dataframes as read by \code{\link[=read_gtfs]{read_gtfs()}}.} -\item{start_hour}{(optional) an integer indicating the start hour (default 6)} +\item{start_time}{analysis start time, can be given as "HH:MM:SS", +hms object or numeric value in seconds.} -\item{end_hour}{(optional) an integer indicating the end hour (default 22)} +\item{end_time}{analysis perdiod end time, can be given as "HH:MM:SS", +hms object or numeric value in seconds.} -\item{service_ids}{(optional) a set of service_ids from the calendar dataframe -identifying a particular service id. If not provided the service_id -with the most departures is used} +\item{service_ids}{A set of service_ids from the calendar dataframe +identifying a particular service id. If not provided, the service_id +with the most departures is used.} -\item{by_route}{default TRUE, if FALSE then calculate headway for any line coming through the stop in the same direction on the same schedule.} +\item{by_route}{Default TRUE, if FALSE then calculate headway for any line coming +through the stop in the same direction on the same schedule.} } \value{ dataframe of stops with the number of departures and the headway -(departures divided by timespan) as columns. +(departures divided by timespan) in seconds as columns } \description{ -Note that some GTFS feeds contain a frequency data frame already. +Calculate the number of departures and mean headways for all stops within a +given timespan and for given service_ids. +} +\note{ +Some GTFS feeds contain a frequency data frame already. Consider using this instead, as it will be more accurate than what tidytransit calculates. } diff --git a/tests/testthat/test-headways.R b/tests/testthat/test-headways.R index 71b54dbd..e845a079 100644 --- a/tests/testthat/test-headways.R +++ b/tests/testthat/test-headways.R @@ -2,8 +2,8 @@ context("Frequencies are calculated correctly") # TODO rewrite with synthesized sample data test_that("Stop frequencies (headways) for included data are as expected", { - expect_equal(nrow(get_stop_frequency(gtfs_duke)), 47) - expect_equal(nrow(get_stop_frequency(gtfs_duke, start_hour = 10, end_hour = 11)), 41) + expect_equal(nrow(get_stop_frequency(gtfs_duke, by_route = FALSE)), 47) + expect_equal(nrow(get_stop_frequency(gtfs_duke, start_time = 10*3600, end_time = 11*3600, by_route = FALSE)), 41) stops_frequency <- get_stop_frequency(gtfs_duke, service_ids = "c_853_b_19828_d_64") ex_address <- stops_frequency$mean_headway[stops_frequency$stop_id==778058] @@ -16,3 +16,15 @@ test_that("Stop frequencies (headways) for included data are as expected", { colnames(stops_frequency_by_route), c("stop_id", "route_id", "direction_id", "service_id", "n_departures", "mean_headway")) }) + +test_that("Route frequencies (headways)", { + # TODO rewrite with synthesized sample data + routes_frequency <- get_route_frequency(gtfs_duke) + expect_equal(routes_frequency[routes_frequency$route_id == 1679, ]$median_headways, 24*60) +}) + +test_that("Route frequencies (headways) w/ service id", { + # TODO rewrite with synthesized sample data + routes_frequency <- get_route_frequency(gtfs_duke, service_id = "c_883_b_21967_d_31") + expect_equal(routes_frequency[routes_frequency$route_id == 1680, ]$median_headways, (53+1/3)*60) +}) diff --git a/tests/testthat/test-raptor.R b/tests/testthat/test-raptor.R index 16f0abeb..85038bc5 100644 --- a/tests/testthat/test-raptor.R +++ b/tests/testthat/test-raptor.R @@ -345,3 +345,9 @@ test_that("raptor with filtered feed", { stop_name = "One", time_range = 3600) expect_equal(tt1, tt2) }) + +test_that("filter feed without min/max time", { + st.1 = filter_stop_times(g, "2018-10-01") + st.2 = filter_stop_times(g, "2018-10-01", "00:00:00", 999*3600) + expect_true(all(st.1 == st.2)) +}) diff --git a/vignettes/frequency.Rmd b/vignettes/frequency.Rmd new file mode 100644 index 00000000..ad703c99 --- /dev/null +++ b/vignettes/frequency.Rmd @@ -0,0 +1,391 @@ +--- +title: "Transit (GTFS) Service & Headway Mapping with R" +author: "Tom Buckley" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{tidytransit-headways} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +library(dplyr) +library(tidytransit) +library(ggplot2) +library(sf) +options(dplyr.summarise.inform=F) +``` + +## Introduction + +The focus of this vignette is on how to use R to make graphics about where and how often +transit service operates based on schedule data published in the [General Transit Feed Specification](http://gtfs.org/). +We'll focus on the New York City Metropolitan Transit Agency's Subway schedule for this +vignette, but you can easily apply it to thousands of other GTFS data sources. See the +[tidytransit introductory vignette](http://tidytransit.r-transit.org/articles/introduction.html#finding-more-gtfs-feeds) +for instructions on finding data for other cities and operators. + +## Setup + +You'll need to have tidytransit installed. Please see the [install instructions](http://tidytransit.r-transit.org/articles/introduction.html#installation-dependencies) for more details. + +## Outline + +So, to review, we're going to: + +1) Import Transit Data (GTFS) +2) Identify Weekday Schedules of Service +3) Calculate Headways +4) Map Headways By Route +5) Map Departures by Stop and Route + +### 1) Import Transit Data (GTFS) + +We'll start by importing a snapshot of the NYC MTA's subway schedule, which is included +with the package when [installed](http://tidytransit.r-transit.org/index.html#installation). + +```{r} +local_gtfs_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit") +gtfs <- read_gtfs(local_gtfs_path) +``` + +But note that you can also just uncomment the line below and import the data from the NYC +MTA's URL directly. + +```{r} +# gtfs <- read_gtfs("http://web.mta.info/developers/data/nyct/subway/google_transit.zip") +``` + +### 2) Identify Weekday Schedules of Service + +GTFS feeds typically contain a schedule of all the schedules of service for a given system. +Selecting a schedule of service in NYC allows us to focus on, for example, non-holiday +weekday service, in the Fall of 2019. In some feeds, service selection can be more or less +complicated than NYC. In any case, you'll want to read the [service patterns](http://tidytransit.r-transit.org/articles/servicepatterns.html) +vignette included in this package in order to see how you can select the right service for +your needs. + +We use one of the functions described in that vignette to create a table on the gtfs feed +that lets us filter by weekday/weekend service. + +```{r} +gtfs <- set_servicepattern(gtfs) +``` + +After setting the service patterns, we can summarise each service by the number of trips and +stops. We'll also summarise the total distance covered by all trips in the service, and then +check that against the total distance covered by the average route. First, we need to +calculate the distance of each part of the route shapes. To do this (and for creating maps +later on) we convert stops and shapes to [simple features](https://r-spatial.github.io/sf/articles/sf1.html) +with `gtfs_as_sf`. + +```{r} +gtfs <- gtfs_as_sf(gtfs) +gtfs$shapes$length <- st_length(gtfs$shapes) + +shape_lengths <- gtfs$shapes %>% + as.data.frame() %>% + select(shape_id, length, -geometry) +``` + +Now we're ready to roll the statistics up to services. + +```{r} +service_pattern_summary <- gtfs$trips %>% + left_join(gtfs$.$servicepatterns, by="service_id") %>% + left_join(shape_lengths, by="shape_id") %>% + left_join(gtfs$stop_times, by="trip_id") %>% + group_by(servicepattern_id) %>% + summarise( + trips = n(), + routes = n_distinct(route_id), + total_distance_per_day_km = sum(as.numeric(length), na.rm=TRUE)/1e3, + route_avg_distance_km = (sum(as.numeric(length), na.rm=TRUE)/1e3)/(trips*routes), + stops=(n_distinct(stop_id)/2)) +``` + +We can also add the number of days that each service is in operation. + +```{r} +service_pattern_summary <- gtfs$.$dates_servicepatterns %>% + group_by(servicepattern_id) %>% + summarise(days_in_service = n()) %>% + left_join(service_pattern_summary, by="servicepattern_id") +``` + +And then we'll print the summary. + +```{r} +knitr::kable(service_pattern_summary) +``` + +It seems that if we want to summarise the most common patterns of service in the NYC Metro +system, we should use the `s_e25d6ca` service pattern, as it has the most days in service, +the most trips, stops, and routes. + +We'll use that pattern below to pull out the service_ids that we need to use to identify +trips in the GTFS feed for which we want to summarise service. + +```{r} +service_ids <- gtfs$.$servicepattern %>% + filter(servicepattern_id == 's_e25d6ca') %>% + pull(service_id) + +head(service_ids) %>% + knitr::kable() +``` + +So, what are these service_id codes? How they are put together varies from operator to +operator. The important thing is that the service_ids are also a field on the `trips` +table, which describes all the trips taken in the system. + +Lets see how many trips fall under each of these service_ids on the trips table, and how +they relate to routes. + +```{r} +gtfs$trips %>% + filter(service_id %in% service_ids) %>% + group_by(service_id, route_id) %>% + summarise(count = n()) %>% + head() %>% + knitr::kable() +``` + +Given the one-to-one relationship between service_ids and routes, we might conclude that +the NYC Subway GTFS creates service_ids for each route that a trip runs on. Some GTFS feeds +are simpler: a single service_id might relate to 'all vehicle trips running every weekdays'. +Service patterns get us around complications like this by describing service in terms of +exhaustive calendar dates, regardless of whether an operator may break out each route as +a different service. + +### 3) Calculate Headways + +So, now that we've used service patterns to identify the set of service_ids that refer to +all weekday trips, we can summarize service between 6 am and 10 am for the NYC Subway +system on weekdays. + +```{r} +am_stop_freq <- get_stop_frequency(gtfs, start_time = 6*3600, end_time = 10*3600, + service_ids = service_ids, by_route = TRUE) +``` + +```{r} +knitr::kable(head(am_stop_freq)) +``` + +This table includes columns for the id for a given stop, the route_id, our selected +service_ids, and the number of departures and the average headway for a given direction +from 6 am to 10 am on weekdays. + +The `get_stop_frequency` function simply counts the number of departures within the time +frame to get departures per stop. Then, to get headways, it divides the number of seconds +(between start_time and end_time) by the number of departures, and rounds to the nearest +integer. + +Lets have a look at the headways for the 1 train, which runs from the Bronx down to the +Bottom of Manhattan. + +First, we filter the `am_stop_freq` data frame to just stops going in one direction on the 1 +train, and then we join to the original `stops` table, which includes a more descriptive +stop_name. + +```{r} +one_line_stops <- am_stop_freq %>% + filter(route_id == 1 & direction_id == 0) %>% + left_join(gtfs$stops, by ="stop_id") %>% + mutate(mean_headway_minutes = mean_headway/60) +``` + +As we can see, some stops seem to have higher headways than others, even when the train is +running in the same direction. This may be counterintuitive, because we might expect the +train to run through every stop the same amount of times for a given direction. + +Lets inspect the stops at which headways are higher. + +```{r} +one_line_stops %>% + arrange(desc(mean_headway)) %>% + select(stop_name, n_departures, mean_headway) %>% + head() %>% + knitr::kable() +``` + +And those at which headways are lower: + +```{r} +one_line_stops %>% + arrange(desc(mean_headway)) %>% + select(stop_name, n_departures, mean_headway) %>% + tail() %>% + knitr::kable() +``` + +Here we can see that the 242-Van Cortland Stop, the last stop up North, in the Bronx, has +noticeably higher headways (8 mins) at this time of day than the South Ferry Stop, +which is at the south end of Manhattan. + +Lets also plot the headways at these stops on a map to see how they are distributed across +the city. First, we join the stops sf object to the 1 line's calculated stop headways. + +```{r} +one_line_stops_sf <- gtfs$stops %>% + right_join(one_line_stops, by="stop_id") +``` + +And then use ggplot's `geom_sf` to plot the headways. + +```{r, fig.width=9} +one_line_stops_sf %>% + ggplot() + + geom_sf(aes(color = mean_headway_minutes)) + + theme_bw() +``` + +On the map too, we can see that there is some variation in stop headways. During certain +times of the day, the 1 train skips stops north of a certain stop in manhattan, presumably +in order to turn around and provide shorter headways to stops south of that stop. + +Finally, we can easily summarise what the headways are like along the entire route now, by +using R's default summary function for the vector of headways. + +```{r} +summary(one_line_stops$mean_headway) +``` + +This is the same method that tidytransit uses to summarise headways along all routes in the +system when we use the `get_route_frequency` function, which we'll try next. + +### 4) Map Headways By Route + +Now we'll use the `get_route_frequency` function to summarise transit service by route, for +the same time period. + +```{r} +am_route_freq <- get_route_frequency(gtfs, service_ids = service_ids, + start_time = 6*3600, end_time = 10*3600) +head(am_route_freq) %>% + knitr::kable() +``` + +Since, under the hood, this table is a summary of stop frequencies along each route, it +includes the same variables as a summary of the headways at each stop along the route, as +well as a sum of all departures. Again, its important to note that this summary is based on +the trips that happened within the time frame we specify. + +As with the stops, we can easily join this table to simple features and then plot it on a +map. Note that here too we pass in the select service_ids from above, as the route run +by a vehicle also depends on the selected service. + +```{r} +# get_route_geometry needs a gtfs object that includes shapes as simple feature data frames +routes_sf <- get_route_geometry(gtfs, service_ids = service_ids) +``` + +Then we join the geometries to the calculated frequencies: + +```{r} +routes_sf <- routes_sf %>% + inner_join(am_route_freq, by = 'route_id') +``` + +And finally, lets plot the routes with median headways of less than 10 minutes in the morning. + +```{r, fig.width=6, fig.height=10, warn=FALSE} +# convert to an appropriate coordinate reference system +routes_sf_crs <- sf::st_transform(routes_sf, 26919) +routes_sf_crs %>% + filter(median_headways < 10*60) %>% + ggplot() + + geom_sf(aes(colour=as.factor(median_headways))) + + labs(color = "Headways") + + geom_sf_text(aes(label=route_id)) + + theme_bw() +``` + +Its clear that a number of the route lines overlap. + +### 5) Map Departures by Stop and Route + +Still, we'd like to represent where and how frequently the subway runs in NYC in the +morning. How can we do so given that, graphically, the route lines overlap? + +One method might be change the units we are representing graphically. Thus far, we have +used stops and routes as units. But GTFS data also come with a `shapes` table, which, in +theory, should allow us to say what the frequency of vehicles passing through any given +shape is, using similar methods. This kind of method is beyond the scope of this vignette. + +Alternatively, regular ggplot users might expect the ggplot `dodge` function to allow us to +move around these lines but, by design, thats not possible with `geom_sf`. One can see why: +unlike a bar chart, the representations of route lines in geographic space on a map have a +specific meaning. + +So we'll use a cartographic trick, scaling each line according to total departures and +close to a number around .001 [decimal degrees](https://en.wikipedia.org/wiki/Decimal_degrees) +which is a about the length of a street, which will fit on the map well. One might call +this a cartogram. + +```{r} +routes_sf_buffer <- st_buffer(routes_sf, + dist = routes_sf$total_departures/1e6) +``` + +Next, when we render the map, we'll make sure to make the borders around each route +transparent, and set the opacity for the fill of all the polygons high again. + +```{r, fig.width=6, fig.height=10} +routes_sf_buffer %>% + ggplot() + + geom_sf(colour = alpha("white", 0), fill = alpha("red",0.2)) + + theme_bw() +``` + +Now we have a rough representation of the question we set out to answer: where and how +frequently does transit service run in the AM in the NYC Subway. Note that in this graphic, +the intensity of the red tells you how many overlapping trains run through the line and the +thickness of the lines represents how many run along each line. + +We can combine this with stops to get a sense of how central stops relate to routes. + +```{r} +gtfs$stops %>% + inner_join(am_stop_freq, by = "stop_id") %>% + filter(n_departures > 50) %>% + select(stop_id, stop_name, n_departures, mean_headway) %>% + arrange(n_departures) %>% + head() %>% + knitr::kable() +``` + +First, we'll leverage the common `stop_name` variable to group and count departures, in +both directions, for all stops, filtering to out a number of smaller stops for more +graphical clarity. + +```{r} +am_stop_name_departures <- left_join(gtfs$stops, am_stop_freq, by="stop_id") + +am_stop_name_departures <- am_stop_name_departures %>% + group_by(stop_name) %>% + transmute(total_departures = sum(n_departures, na.rm=TRUE)) + +am_stop_name_departures <- am_stop_name_departures %>% + filter(total_departures > 100) +``` + +Finally, we can plot both the route line counts and the stop departure counts on one map: + +```{r, fig.width=6, fig.height=10} +ggplot() + + geom_sf(data = routes_sf_buffer, + colour = alpha("white",0), fill = alpha("red",0.3)) + + geom_sf(data = am_stop_name_departures, + aes(size = total_departures), shape=1) + + labs(size = "Departures (Hundreds)") + + theme_bw() + + theme(legend.position="none") + + ggtitle("NYC MTA - Relative Departures by Route and Stop (AM)") +```