From d0359b2d8e1fb2a0d3908695b02b2260fa49ca1f Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Sun, 22 Sep 2024 22:41:33 -0300 Subject: [PATCH] New support function `latest_airfares_date()` --- NEWS.md | 1 + R/latest_flights_date.R | 25 ++++ tests_rafa/precodapassagem.R | 214 +++++++++++++++++++++++++++++++++++ 3 files changed, 240 insertions(+) create mode 100644 tests_rafa/precodapassagem.R diff --git a/NEWS.md b/NEWS.md index 5d550b6..e7597ee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ * Minor changes: * Internally check of the consistency of date inputs. The date input must be consistent in either a 6-digit format `yyyymm` OR a 4-digit format `yyyy`. + * New support function `latest_airfares_date()` # flightsbr dev v0.5.0 diff --git a/R/latest_flights_date.R b/R/latest_flights_date.R index 662d835..900d173 100644 --- a/R/latest_flights_date.R +++ b/R/latest_flights_date.R @@ -18,3 +18,28 @@ latest_flights_date <- function(){ return(latest_date) } + + + +#' Check the date of the latest airfare data available + +#' @param dom Logical. Defaults to `TRUE` download airfares of domestic +#' flights. If `FALSE`, the function downloads airfares of +#' international flights. +#' @return A numeric date in the format `yyyymm`. +#' @export +#' @family support function +#' @examples \dontrun{ if (interactive()) { +#' +#' latest_date <- latest_airfares_date() +#' +#'}} +latest_airfares_date <- function(dom=TRUE){ + + # get all dates available + all_dates <- get_airfares_dates_available(dom) + + # find latest date + latest_date <- max(all_dates) + return(latest_date) +} diff --git a/tests_rafa/precodapassagem.R b/tests_rafa/precodapassagem.R new file mode 100644 index 0000000..eb5b73d --- /dev/null +++ b/tests_rafa/precodapassagem.R @@ -0,0 +1,214 @@ +library(data.table) +library(flightsbr) +library(janitor) + +# get latest month available +latest_date <- flightsbr::latest_airfares_date() +last_two_monts <- (latest_date-1):latest_date + +# download data +df_dom <- flightsbr::read_airfares(date = last_two_monts) |> + janitor::clean_names() + +# head(df_dom) + +# fix numeric columns +df_dom[, tarifa := gsub(',', '.', tarifa)] +df_dom[, tarifa := as.numeric(tarifa)] +df_dom[, assentos := as.numeric(assentos)] + + # # create unique id for each OD pair + # df_dom[, id := .GRP, by = .(origem, destino)] +# create unique id for each OD pair +# so that A-B has the same id as B-A +# Sort each pair so that the smaller value is always the first +df_dom[, od_pair := paste0(pmin(origem, destino), "-", pmax(origem, destino))] +df_dom[, id := .GRP, by = od_pair] + + +# determine top 100 OD pairs +od_rank <- df_dom[, .(total_demand = sum(assentos)), + by = .(id, od_pair)][order(-total_demand)] + +od_rank <- od_rank |> + dplyr::slice_max(order_by = total_demand, n = 100) |> + mutate( ranking = 1:100) + +# filter raw data only for top 100 and bring raking column +df_dom_100 <- df_dom[ id %in% od_rank$id] +df_dom_100[od_rank, on='id', ranking := i.ranking] + +# calculate reference values +df <- df_dom_100[, .(passageiros = sum(assentos), + minima = min(tarifa), + q25 = Hmisc::wtd.quantile(x = tarifa, weights=assentos,probs = 0.25), + media = weighted.mean(x = tarifa, w=assentos), + q75 = Hmisc::wtd.quantile(x = tarifa, weights=assentos,probs = 0.75), + maxima = max(tarifa) + ), + by = .(id, ranking, origem, destino)][order(ranking)] + + +df100 <- df[, .(passageiros = sum(passageiros), + # minima = sum(minima), + q25 = sum(q25), + media = sum(media), + q75 = sum(q75) + #, maxima = sum(maxima) + ), + by = .(id, ranking)][order(ranking)] + + +# bring OD pair info back +df100[df_dom_100, on='id', od_pair := i.od_pair ] +df100[, origem := substring(od_pair, 1, 4)] +df100[, destino := substring(od_pair, 6, 9)] + +# add airport information +airports <- flightsbr::read_airports(type = 'public') |> + janitor::clean_names() |> + dplyr::select(c('codigo_oaci','municipio_atendido')) + +head(airports) + + +df_airport_codes <- dplyr::tribble( + ~oaci, ~iata, + "SBAM", "MCP", + "SWYN", NA, + "SNAL", "APQ", + "SWBC", "BAZ", + "SWBI", NA, + "SBBE", "BEL", + "SBCF", "CNF", + "SBBH", "PLU", + "SBBV", "BVB", + "SWBR", NA, + "SWBS", NA, + "SBBR", "BSB", + "SBCD", "CFC", + "SNCC", NA, + "SBKP", "VCP", + "SDAM", "CPQ", + "SBMT", NA, + "SNRU", "CAU", + "SWCA", "CAF", + "SBCA", "CAC", + "SILQ", NA, + "SWKO", "CIZ", + "SBAA", "CDJ", + "SBCZ", "CZS", + "SBBI", "BFH", + "SBCT", "CWB", + "SWFJ", "FEJ", + "SWFN", NA, + "SBFL", "FLN", + "SBFZ", "FOR", + "SBFI", "IGU", + "SBZM", "IZA", + "SBGO", "GYN", + "SBGR", "GRU", + "SBIZ", "IMP", + "SBJE", "JJD", + "SBJV", "JOI", + "SBJP", "JPA", + "SBJF", "JDF", + "SBJD", "QDV", + "SBMQ", "MCP", + "SBEG", "MAO", + "SNML", NA, + "SBMO", "MCZ", + "SBMS", "MVF", + "SBNF", "NVT", + "SBSG", "NAT", + "SWNK", NA, + "SBOI", NA, + "SNOZ", NA, + "SWJV", NA, + "SBPB", "PHB", + "SNPE", NA, + "SSZW", "PGZ", + "SBPA", "POA", + "SNPG", NA, + "SBPV", "PVH", + "SBRF", "REC", + "SBRP", "RAO", + "SBRB", "RBR", + "SBRJ", "SDU", + "SBGL", "GIG", + "SBJR", NA, + "SBRD", "ROO", + "SBSM", "RIA", + "SDOE", NA, + "SBST", "SSZ", + "SBSV", "SSA", + "SDSC", "QSC", + "SBSL", "SLZ", + "SBSP", "CGH", + "SWSN", "ZMD", + "SDCO", "SOD", + "SWMU", NA, + "SBTT", "TBT", + "SBTK", "TRQ", + "SBTF", "TFF", + "SBTE", "THE", + "SBCY", "CGB", # cuiaba + "SBPL", "PNZ", # petrolina + "SBVT", "VIX", # vitoria + "SBVC", "VDC", # vitoria da consquista + "SBJU", "JDO", # juazeiro do norte + "SBAR", "AJU", # aracaju + "SBTS", NA, + "SNUN", NA, + "SWXU", NA) + +data.table::setDT(df_airport_codes) + +airports[df_airport_codes, on=c('codigo_oaci'='oaci'), iata:= i.iata] + + +df100[airports, + on=c('origem'='codigo_oaci'), + c('orig_iata', 'orig_muni') := list(i.iata, i.municipio_atendido )] + +df100[airports, + on=c('destino'='codigo_oaci'), + c('dest_iata', 'dest_muni') := list(i.iata, i.municipio_atendido )] + +# reorganize origem and destino columns +df100[, De := paste0(orig_muni," (",orig_iata,")")] +df100[, Para := paste0(dest_muni," (",dest_iata,")")] + +# round values +cols <- c('q25', 'media', 'q75') +df100 <- df100 |> mutate_at(cols, ~round(.,1)) + + +# rename and reorder columns +df_output <- df100 |> + select(ranking, + De, + Para, + total_passageiros = passageiros, + q25, + media, + q75 + ) + + +quak::quak(df_output) + +# proximos passos +- logo no comeƧo, identificar pares de ida e volta +- gerar valor total de ida + volta +- gerar grafico da distribuiƧao do valor + +library(data.table) + +dt <- data.table::CJ(letters[1:4], letters[1:4]) +setnames(dt, c('origin', 'destination')) + + + + +