Skip to content

Commit

Permalink
Merge pull request #2 from naviqore/feature/gtfs-schedule-parser
Browse files Browse the repository at this point in the history
  • Loading branch information
munterfi authored Apr 27, 2024
2 parents 4711970 + ee0a7ba commit 0e8781d
Show file tree
Hide file tree
Showing 27 changed files with 586 additions and 242 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,7 @@ build/
.vscode/

### Mac OS ###
.DS_Store
.DS_Store

# Benchmark test files
benchmark/input/*.zip
25 changes: 25 additions & 0 deletions src/main/java/ch/naviqore/gtfs/schedule/GtfsScheduleFile.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package ch.naviqore.gtfs.schedule;

import lombok.Getter;
import lombok.RequiredArgsConstructor;

/**
* Standard GTFS schedule file types and their corresponding file names.
* <p>
* Each constant carries the name of the CSV file as it appears inside a GTFS directory or ZIP archive.
* <p>
* NOTE(review): {@code GtfsScheduleReader} iterates over {@code values()}, so files are read in the declaration order
* of these constants. The order presumably reflects dependencies between files (e.g. routes before trips, trips before
* stop times) - confirm before reordering.
*/
// Lombok: generates the constructor taking the file name and the getFileName() accessor.
@RequiredArgsConstructor
@Getter
public enum GtfsScheduleFile {
AGENCY("agency.txt"),
CALENDAR("calendar.txt"),
CALENDAR_DATES("calendar_dates.txt"),
FARE_ATTRIBUTES("fare_attributes.txt"),
FARE_RULES("fare_rules.txt"),
FREQUENCIES("frequencies.txt"),
STOPS("stops.txt"),
ROUTES("routes.txt"),
SHAPES("shapes.txt"),
TRIPS("trips.txt"),
STOP_TIMES("stop_times.txt");

// Name of the file (including the .txt extension) that holds the records of this type.
private final String fileName;
}
114 changes: 67 additions & 47 deletions src/main/java/ch/naviqore/gtfs/schedule/GtfsScheduleParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,95 +4,115 @@
import ch.naviqore.gtfs.schedule.type.ExceptionType;
import ch.naviqore.gtfs.schedule.type.RouteType;
import ch.naviqore.gtfs.schedule.type.ServiceDayTime;
import lombok.RequiredArgsConstructor;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.csv.CSVRecord;

import java.time.DayOfWeek;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;

/**
* GTFS CSV Records Parser
* GTFS CSV records parser
*
* @author munterfi
*/
@RequiredArgsConstructor
@Log4j2
class GtfsScheduleParser {

private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMdd");
private static final Map<String, DayOfWeek> DAY_MAPPINGS = Map.of("monday", DayOfWeek.MONDAY, "tuesday",
DayOfWeek.TUESDAY, "wednesday", DayOfWeek.WEDNESDAY, "thursday", DayOfWeek.THURSDAY, "friday",
DayOfWeek.FRIDAY, "saturday", DayOfWeek.SATURDAY, "sunday", DayOfWeek.SUNDAY);

private final EnumMap<GtfsScheduleFile, Consumer<CSVRecord>> parsers = new EnumMap<>(GtfsScheduleFile.class);
private final GtfsScheduleBuilder builder;

void parseAgencies(List<CSVRecord> records) {
log.info("Parsing {} agency records", records.size());
for (CSVRecord record : records) {
builder.addAgency(record.get("agency_id"), record.get("agency_name"), record.get("agency_url"),
record.get("agency_timezone"));
}
/**
* Creates a parser that forwards every parsed record to the given builder.
*
* @param builder the schedule builder that receives the parsed GTFS entities
*/
public GtfsScheduleParser(GtfsScheduleBuilder builder) {
this.builder = builder;
// Register one record parser per supported GTFS file type.
initializeParsers();
}

void parseCalendars(List<CSVRecord> records) {
log.info("Parsing {} calendar records", records.size());
for (CSVRecord record : records) {
EnumSet<DayOfWeek> serviceDays = EnumSet.noneOf(DayOfWeek.class);
DAY_MAPPINGS.forEach((key, value) -> {
if ("1".equals(record.get(key))) {
serviceDays.add(value);
}
});
builder.addCalendar(record.get("service_id"), serviceDays,
LocalDate.parse(record.get("start_date"), DATE_FORMATTER),
LocalDate.parse(record.get("end_date"), DATE_FORMATTER));
}
/**
 * File types for which the "unsupported" warning has already been logged by this parser instance. Kept as
 * instance state so that the warning is emitted once per file type instead of once per record.
 */
private final Set<GtfsScheduleFile> warnedUnsupportedFileTypes = EnumSet.noneOf(GtfsScheduleFile.class);

/**
 * Parses a single CSV record with the record parser registered for the given file type.
 * <p>
 * If no parser is registered for the file type, the record is ignored and a warning is logged the first time
 * that file type is encountered.
 *
 * @param record   the CSV record to parse
 * @param fileType the GTFS file the record was read from
 */
public void parse(CSVRecord record, GtfsScheduleFile fileType) {
    Consumer<CSVRecord> recordParser = parsers.get(fileType);
    if (recordParser != null) {
        recordParser.accept(record);
        return;
    }
    // Set.add returns true only on first insertion, which limits the warning to once per file type.
    // EnumSet rejects null elements, so a null file type is reported directly without being remembered.
    if (fileType == null || warnedUnsupportedFileTypes.add(fileType)) {
        log.warn("Unsupported GTFS file type for parsing: {}", fileType);
    }
}

/**
 * Registers the record parser for every supported GTFS file type. File types without an entry are reported as
 * unsupported by {@link #parse}.
 */
private void initializeParsers() {
    Map<GtfsScheduleFile, Consumer<CSVRecord>> supported = Map.of(
            GtfsScheduleFile.AGENCY, this::parseAgency,
            GtfsScheduleFile.CALENDAR, this::parseCalendar,
            GtfsScheduleFile.CALENDAR_DATES, this::parseCalendarDate,
            GtfsScheduleFile.STOPS, this::parseStop,
            GtfsScheduleFile.ROUTES, this::parseRoute,
            GtfsScheduleFile.TRIPS, this::parseTrips,
            GtfsScheduleFile.STOP_TIMES, this::parseStopTimes);
    parsers.putAll(supported);
}

/**
 * Reads the agency columns of a single record and adds the agency to the builder.
 *
 * @param record a record of the agency file
 */
private void parseAgency(CSVRecord record) {
    String id = record.get("agency_id");
    String name = record.get("agency_name");
    String url = record.get("agency_url");
    String timezone = record.get("agency_timezone");
    builder.addAgency(id, name, url, timezone);
}

void parseCalendarDates(List<CSVRecord> records) {
log.info("Parsing {} calendar date records", records.size());
for (CSVRecord record : records) {
/**
 * Reads a single calendar record and adds it to the builder.
 * <p>
 * A weekday is part of the service when its column contains the value "1"; start and end date are parsed with
 * {@code DATE_FORMATTER}.
 *
 * @param record a record of the calendar file
 */
private void parseCalendar(CSVRecord record) {
    EnumSet<DayOfWeek> activeDays = EnumSet.noneOf(DayOfWeek.class);
    for (Map.Entry<String, DayOfWeek> mapping : DAY_MAPPINGS.entrySet()) {
        String flag = record.get(mapping.getKey());
        if ("1".equals(flag)) {
            activeDays.add(mapping.getValue());
        }
    }

    String serviceId = record.get("service_id");
    LocalDate startDate = LocalDate.parse(record.get("start_date"), DATE_FORMATTER);
    LocalDate endDate = LocalDate.parse(record.get("end_date"), DATE_FORMATTER);
    builder.addCalendar(serviceId, activeDays, startDate, endDate);
}

/**
 * Reads a single calendar date record and adds it to the builder. A record that triggers an
 * {@link IllegalArgumentException} is skipped with a warning.
 * <p>
 * NOTE(review): a date that does not match {@code DATE_FORMATTER} raises a
 * {@link java.time.format.DateTimeParseException}, which is not an {@code IllegalArgumentException} and is therefore
 * not handled here - confirm that this is intended.
 *
 * @param record a record of the calendar dates file
 */
private void parseCalendarDate(CSVRecord record) {
    try {
        String serviceId = record.get("service_id");
        LocalDate date = LocalDate.parse(record.get("date"), DATE_FORMATTER);
        builder.addCalendarDate(serviceId, date, ExceptionType.parse(record.get("exception_type")));
    } catch (IllegalArgumentException invalidRecord) {
        log.warn("Skipping invalid calendar date {}: {}", record.get("date"), invalidRecord.getMessage());
    }
}

void parseStops(List<CSVRecord> records) {
log.info("Parsing {} stop records", records.size());
for (CSVRecord record : records) {
builder.addStop(record.get("stop_id"), record.get("stop_name"), Double.parseDouble(record.get("stop_lat")),
Double.parseDouble(record.get("stop_lon")));
}
/**
 * Reads a single stop record and adds the stop to the builder; latitude and longitude are parsed as doubles.
 *
 * @param record a record of the stops file
 */
private void parseStop(CSVRecord record) {
    String id = record.get("stop_id");
    String name = record.get("stop_name");
    double latitude = Double.parseDouble(record.get("stop_lat"));
    double longitude = Double.parseDouble(record.get("stop_lon"));
    builder.addStop(id, name, latitude, longitude);
}

void parseRoutes(List<CSVRecord> records) {
log.info("Parsing {} route records", records.size());
for (CSVRecord record : records) {
// TODO: Route types are not standardized in any way.
// RouteType.parse(record.get("route_type"))
builder.addRoute(record.get("route_id"), record.get("agency_id"), record.get("route_short_name"),
record.get("route_long_name"), RouteType.RAIL);
}
/**
 * Reads a single route record and adds the route to the builder.
 * <p>
 * The {@code route_type} column is currently not evaluated; every route is added as {@code RouteType.RAIL}.
 *
 * @param record a record of the routes file
 */
private void parseRoute(CSVRecord record) {
    String routeId = record.get("route_id");
    String agencyId = record.get("agency_id");
    String shortName = record.get("route_short_name");
    String longName = record.get("route_long_name");
    // TODO: Route types are not standardized in any way.
    // RouteType.parse(record.get("route_type"))
    builder.addRoute(routeId, agencyId, shortName, longName, RouteType.RAIL);
}

void parseTrips(List<CSVRecord> records) {
log.info("Parsing {} trip records", records.size());
for (CSVRecord record : records) {
/**
 * Reads a single trip record and adds the trip to the builder. A record that triggers an
 * {@link IllegalArgumentException} is skipped with a warning.
 *
 * @param record a record of the trips file
 */
private void parseTrips(CSVRecord record) {
    try {
        String tripId = record.get("trip_id");
        String routeId = record.get("route_id");
        String serviceId = record.get("service_id");
        builder.addTrip(tripId, routeId, serviceId);
    } catch (IllegalArgumentException invalidRecord) {
        log.warn("Skipping invalid trip {}: {}", record.get("trip_id"), invalidRecord.getMessage());
    }
}

void parseStopTimes(List<CSVRecord> records) {
log.info("Parsing {} stop time records", records.size());
for (CSVRecord record : records) {
/**
 * Reads a single stop time record and adds it to the builder. A record that triggers an
 * {@link IllegalArgumentException} is skipped with a warning.
 *
 * @param record a record of the stop times file
 */
private void parseStopTimes(CSVRecord record) {
    try {
        String tripId = record.get("trip_id");
        String stopId = record.get("stop_id");
        builder.addStopTime(tripId, stopId, ServiceDayTime.parse(record.get("arrival_time")),
                ServiceDayTime.parse(record.get("departure_time")));
    } catch (IllegalArgumentException invalidRecord) {
        String message = invalidRecord.getMessage();
        log.warn("Skipping invalid stop time {}-{}: {}", record.get("trip_id"), record.get("stop_id"),
                message);
    }
}

}
112 changes: 34 additions & 78 deletions src/main/java/ch/naviqore/gtfs/schedule/GtfsScheduleReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

import ch.naviqore.gtfs.schedule.model.GtfsSchedule;
import ch.naviqore.gtfs.schedule.model.GtfsScheduleBuilder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.extern.log4j.Log4j2;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
Expand All @@ -17,9 +15,6 @@
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

Expand All @@ -29,8 +24,8 @@
* This class provides functionality to read GTFS data from either a directory containing individual GTFS CSV files or a
* ZIP archive containing the GTFS dataset.
* <p>
* Supported GTFS files are enumerated in {@link GtfsFile}, and this reader will attempt to parse each specified file
* into a list of {@link CSVRecord} objects.
* Supported GTFS files are enumerated in {@link GtfsScheduleFile}. Each file that is present is read record by record,
* and every {@link CSVRecord} is handed to the {@link GtfsScheduleParser} instead of being collected into a list.
* <p>
* Note: The GTFS data has to strictly follow the standard GTFS file naming and format. Non-standard files will not be
* read.
Expand All @@ -43,78 +38,21 @@ public class GtfsScheduleReader {

private static final String ZIP_FILE_EXTENSION = ".zip";

/**
* Standard GTFS file types and their corresponding file names.
*/
@RequiredArgsConstructor
@Getter
public enum GtfsFile {
AGENCY("agency.txt"),
CALENDAR_DATES("calendar_dates.txt"),
CALENDAR("calendar.txt"),
FARE_ATTRIBUTES("fare_attributes.txt"),
FARE_RULES("fare_rules.txt"),
FREQUENCIES("frequencies.txt"),
ROUTES("routes.txt"),
SHAPES("shapes.txt"),
STOP_TIMES("stop_times.txt"),
STOPS("stops.txt"),
TRIPS("trips.txt");

private final String fileName;
}

public GtfsSchedule read(String path) throws IOException {
File file = new File(path);
Map<GtfsFile, List<CSVRecord>> records;

if (file.isDirectory()) {
log.info("Reading GTFS CSV files from directory: {}", path);
records = readFromDirectory(file);
} else if (file.isFile() && path.endsWith(ZIP_FILE_EXTENSION)) {
log.info("Reading GTFS from ZIP file: {}", path);
records = readFromZip(file);
} else {
throw new IllegalArgumentException("Path must be a directory or a .zip file");
}

return buildSchedule(records);
}

private GtfsSchedule buildSchedule(Map<GtfsFile, List<CSVRecord>> records) {
GtfsScheduleBuilder builder = GtfsScheduleBuilder.builder();
GtfsScheduleParser parser = new GtfsScheduleParser(builder);
parser.parseAgencies(records.get(GtfsFile.AGENCY));
parser.parseCalendars(records.get(GtfsFile.CALENDAR));
parser.parseCalendarDates(records.get(GtfsFile.CALENDAR_DATES));
parser.parseStops(records.get(GtfsFile.STOPS));
parser.parseRoutes(records.get(GtfsFile.ROUTES));
parser.parseTrips(records.get(GtfsFile.TRIPS));
parser.parseStopTimes(records.get(GtfsFile.STOP_TIMES));
return builder.build();
}

private Map<GtfsFile, List<CSVRecord>> readFromDirectory(File directory) throws IOException {
Map<GtfsFile, List<CSVRecord>> records = new HashMap<>();

for (GtfsFile fileType : GtfsFile.values()) {
private static void readFromDirectory(File directory, GtfsScheduleParser parser) throws IOException {
for (GtfsScheduleFile fileType : GtfsScheduleFile.values()) {
File csvFile = new File(directory, fileType.getFileName());
if (csvFile.exists()) {
log.info("Reading GTFS CSV file: {}", csvFile.getAbsolutePath());
records.put(fileType, readCsvFile(csvFile));
readCsvFile(csvFile, parser, fileType);
} else {
log.warn("GTFS CSV file {} not found", csvFile.getAbsolutePath());
}
}

return records;
}

private Map<GtfsFile, List<CSVRecord>> readFromZip(File zipFile) throws IOException {
Map<GtfsFile, List<CSVRecord>> records = new HashMap<>();

private static void readFromZip(File zipFile, GtfsScheduleParser parser) throws IOException {
try (ZipFile zf = new ZipFile(zipFile, StandardCharsets.UTF_8)) {
for (GtfsFile fileType : GtfsFile.values()) {
for (GtfsScheduleFile fileType : GtfsScheduleFile.values()) {
ZipEntry entry = zf.getEntry(fileType.getFileName());
if (entry != null) {
log.info("Reading GTFS file from ZIP: {}", entry.getName());
Expand All @@ -123,33 +61,51 @@ private Map<GtfsFile, List<CSVRecord>> readFromZip(File zipFile) throws IOExcept
.setByteOrderMarks(ByteOrderMark.UTF_8)
.setInclude(false)
.get(), StandardCharsets.UTF_8)) {
records.put(fileType, readCsv(reader));
readCsvRecords(reader, parser, fileType);
}
} else {
log.warn("GTFS file {} not found in ZIP", fileType.getFileName());
}
}
}

return records;
}

private List<CSVRecord> readCsvFile(File file) throws IOException {
private static void readCsvFile(File file, GtfsScheduleParser parser,
GtfsScheduleFile fileType) throws IOException {
try (FileInputStream fileInputStream = new FileInputStream(file);
BOMInputStream bomInputStream = BOMInputStream.builder()
.setInputStream(fileInputStream)
.setByteOrderMarks(ByteOrderMark.UTF_8)
.get(); InputStreamReader reader = new InputStreamReader(bomInputStream, StandardCharsets.UTF_8)) {
return readCsv(reader);
readCsvRecords(reader, parser, fileType);
}
}

private List<CSVRecord> readCsv(InputStreamReader reader) throws IOException {
private static void readCsvRecords(InputStreamReader reader, GtfsScheduleParser recordParser,
GtfsScheduleFile fileType) throws IOException {
CSVFormat format = CSVFormat.DEFAULT.builder().setHeader().setIgnoreHeaderCase(true).setTrim(true).build();
try (CSVParser parser = new CSVParser(reader, format)) {
log.debug("CSV Headers: {}", parser.getHeaderMap().keySet());
return parser.getRecords();
try (CSVParser csvParser = new CSVParser(reader, format)) {
log.debug("CSV Headers: {}", csvParser.getHeaderMap().keySet());
csvParser.forEach(record -> recordParser.parse(record, fileType));
}
}

/**
 * Reads a GTFS schedule from a directory of GTFS CSV files or from a ZIP archive.
 *
 * @param path path to a directory, or to a file whose name ends with {@code ZIP_FILE_EXTENSION}
 * @return the schedule built from all records that were read
 * @throws IOException              if reading the directory or the archive fails
 * @throws IllegalArgumentException if the path is neither a directory nor a .zip file
 */
public GtfsSchedule read(String path) throws IOException {
    File source = new File(path);
    GtfsScheduleBuilder scheduleBuilder = GtfsSchedule.builder();
    GtfsScheduleParser recordParser = new GtfsScheduleParser(scheduleBuilder);

    if (source.isDirectory()) {
        log.info("Reading GTFS CSV files from directory: {}", path);
        readFromDirectory(source, recordParser);
        return scheduleBuilder.build();
    }

    // Anything that is not a directory has to be a regular file with the ZIP extension.
    if (!source.isFile() || !path.endsWith(ZIP_FILE_EXTENSION)) {
        throw new IllegalArgumentException("Path must be a directory or a .zip file");
    }

    log.info("Reading GTFS from ZIP file: {}", path);
    readFromZip(source, recordParser);
    return scheduleBuilder.build();
}

}
18 changes: 0 additions & 18 deletions src/main/java/ch/naviqore/gtfs/schedule/RunExample.java

This file was deleted.

Loading

0 comments on commit 0e8781d

Please sign in to comment.