-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
untested wip, move to two-step grouping
- Loading branch information
Laurie Merrell
committed
May 12, 2023
1 parent
8ff38d0
commit 2084be3
Showing
2 changed files
with
76 additions
and
7 deletions.
There are no files selected for viewing
76 changes: 76 additions & 0 deletions
76
warehouse/models/intermediate/gtfs/int_gtfs_rt__trip_updates_trip_day_map_grouping.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
{{
    config(
        materialized='incremental',
        incremental_strategy='insert_overwrite',
        -- Partition on `dt`: it is the column the incremental filter below is applied to,
        -- so every partition that gets overwritten is recomputed from its complete input.
        -- (The previous partition field, `calculated_service_date_pacific`, is not a column
        -- this model outputs.)
        partition_by={
            'field': 'dt',
            'data_type': 'date',
            'granularity': 'day',
        },
        cluster_by='base64_url',
    )
}}

-- Stop time updates, restricted by the project's `gtfs_rt_dt_where` macro.
WITH stop_time_updates AS (
    SELECT *
    FROM {{ ref('fct_stop_time_updates') }}
    WHERE {{ gtfs_rt_dt_where() }}
),

-- Daily RT feed records; used to find the schedule feed associated with each RT feed per day.
rt_feeds AS (
    SELECT *
    FROM {{ ref('fct_daily_rt_feed_files') }}
),

-- Schedule feeds; used here only for `feed_timezone`.
schedule_feeds AS (
    SELECT *
    FROM {{ ref('dim_schedule_feeds') }}
),

-- group by *both* the UTC date that data was scraped (dt) *and* calculated service date
-- so that in the mart we can get just service date-level data
-- this allows us to handle the dt/service_date mismatch by grouping in two stages
int_gtfs_rt__trip_updates_trip_day_map_grouping AS (
    SELECT
        -- qualified: the join below shows `base64_url` exists on both sides, so a bare reference is ambiguous
        stop_time_updates.dt,
        -- try to figure out what the service date would be to join back with schedule: fall back from explicit to imputed
        -- TODO: it's possible that this could lead to some weirdness around midnight Pacific / in feed timezone
        -- if `trip_start_date` is not set we theoretically should be trying to grab the date of the first arrival time per trip
        -- because trip updates may be generated hours before the beginning of the actual trip activity
        -- however the fact that this would occur near date boundaries is precisely why it's a bit tricky to pick the right first arrival time if trip start date is not populated
        COALESCE(
            PARSE_DATE("%Y%m%d", trip_start_date),
            DATE(header_timestamp, schedule_feeds.feed_timezone),
            DATE(_extract_ts, schedule_feeds.feed_timezone)
        ) AS calculated_service_date,
        stop_time_updates.base64_url,
        trip_id,
        trip_route_id,
        trip_direction_id,
        trip_start_time,
        trip_start_date,
        trip_schedule_relationship,
        schedule_feeds.feed_timezone,
        ARRAY_AGG(DISTINCT id) AS message_ids_array,
        -- IGNORE NULLS: BigQuery raises an error when a result array contains a NULL element
        ARRAY_AGG(DISTINCT header_timestamp IGNORE NULLS) AS header_timestamps_array,
        ARRAY_AGG(DISTINCT trip_update_timestamp IGNORE NULLS) AS trip_update_timestamps_array,
        MIN(_extract_ts) AS min_extract_ts,
        MAX(_extract_ts) AS max_extract_ts,
        MIN(header_timestamp) AS min_header_timestamp,
        MAX(header_timestamp) AS max_header_timestamp,
        MIN(trip_update_timestamp) AS min_trip_update_timestamp,
        MAX(trip_update_timestamp) AS max_trip_update_timestamp,
        MAX(trip_update_delay) AS max_delay,
        -- the CASE expressions have no ELSE, so non-matching rows yield NULL; drop those from each array
        ARRAY_AGG(DISTINCT CASE WHEN schedule_relationship = 'SKIPPED' THEN stop_id END IGNORE NULLS) AS skipped_stops_array,
        ARRAY_AGG(DISTINCT CASE WHEN schedule_relationship = 'SCHEDULED' THEN stop_id END IGNORE NULLS) AS scheduled_stops_array,
        ARRAY_AGG(DISTINCT CASE WHEN schedule_relationship = 'CANCELED' THEN stop_id END IGNORE NULLS) AS canceled_stops_array,
        ARRAY_AGG(DISTINCT CASE WHEN schedule_relationship = 'ADDED' THEN stop_id END IGNORE NULLS) AS added_stops_array
    FROM stop_time_updates
    LEFT JOIN rt_feeds
        ON stop_time_updates.base64_url = rt_feeds.base64_url
        AND stop_time_updates.dt = rt_feeds.date
    LEFT JOIN schedule_feeds
        ON rt_feeds.schedule_feed_key = schedule_feeds.key
    -- positions 1-10 are the ten non-aggregated columns above (dt through feed_timezone)
    GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
)

SELECT * FROM int_gtfs_rt__trip_updates_trip_day_map_grouping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters