diff --git a/warehouse/models/intermediate/transit_database/int_transit_database__urls_to_gtfs_datasets.sql b/warehouse/models/intermediate/transit_database/int_transit_database__urls_to_gtfs_datasets.sql
index 86dc9cebfa..4ebe593e24 100644
--- a/warehouse/models/intermediate/transit_database/int_transit_database__urls_to_gtfs_datasets.sql
+++ b/warehouse/models/intermediate/transit_database/int_transit_database__urls_to_gtfs_datasets.sql
@@ -22,6 +22,7 @@ int_transit_database__urls_to_gtfs_datasets AS (
         gtfs_datasets.base64_url,
         gtfs_datasets.source_record_id,
         gtfs_datasets.key AS gtfs_dataset_key,
+        gtfs_datasets.name AS gtfs_dataset_name,
         CASE
             WHEN gtfs_datasets._valid_from = appearance_duration.first_app THEN CAST('1900-01-01' AS TIMESTAMP)
             ELSE gtfs_datasets._valid_from
diff --git a/warehouse/models/mart/gtfs/fct_daily_schedule_feeds.sql b/warehouse/models/mart/gtfs/fct_daily_schedule_feeds.sql
index 1351088e80..a4aac531d6 100644
--- a/warehouse/models/mart/gtfs/fct_daily_schedule_feeds.sql
+++ b/warehouse/models/mart/gtfs/fct_daily_schedule_feeds.sql
@@ -28,7 +28,8 @@ fct_daily_schedule_feeds AS (
         t2.key AS feed_key,
         t2.feed_timezone,
         t2.base64_url,
-        urls_to_gtfs_datasets.gtfs_dataset_key AS gtfs_dataset_key
+        urls_to_gtfs_datasets.gtfs_dataset_key AS gtfs_dataset_key,
+        urls_to_gtfs_datasets.gtfs_dataset_name AS gtfs_dataset_name,
     FROM make_noon_pacific AS t1
     INNER JOIN dim_schedule_feeds AS t2
         ON t1.noon_pacific BETWEEN t2._valid_from AND t2._valid_to
diff --git a/warehouse/models/mart/gtfs/fct_observed_trips.sql b/warehouse/models/mart/gtfs/fct_observed_trips.sql
index 774d943a80..a1af61a3ff 100644
--- a/warehouse/models/mart/gtfs/fct_observed_trips.sql
+++ b/warehouse/models/mart/gtfs/fct_observed_trips.sql
@@ -11,42 +11,63 @@ WITH trip_updates AS (
 vehicle_positions AS (
     SELECT * FROM {{ ref('fct_vehicle_positions_trip_summaries') }}
 ),
-
+{#
 service_alerts AS (
     SELECT * FROM {{ ref('fct_service_alerts_trip_summaries') }}
+), #}
+
+dim_gtfs_datasets AS (
+    SELECT * FROM {{ ref('dim_gtfs_datasets') }}
 ),

-fct_observed_trips AS (
+fct_daily_schedule_feeds AS (
+    SELECT * FROM {{ ref('fct_daily_schedule_feeds') }}
+),
+
+-- join the trip updates and vehicle positions trip summaries on trip_instance_key
+rt_joins AS (
     SELECT
         trip_instance_key,
-        -- calculated service date, schedule URL, trip ID, and iteration num are the components of the key
-        -- so for these we can coalesce
+        -- calculated service date, schedule URL, trip ID, and iteration num are the components of the key used to join between feeds
+        -- so for these we can coalesce across feeds
+        -- otherwise, values are not guaranteed to be the same!
         COALESCE(
             tu.calculated_service_date,
-            vp.calculated_service_date,
-            sa.calculated_service_date
+            vp.calculated_service_date
+            -- , sa.calculated_service_date
         ) AS calculated_service_date,
         COALESCE(
             tu.schedule_base64_url,
-            vp.schedule_base64_url,
-            sa.schedule_base64_url
+            vp.schedule_base64_url
+            -- , sa.schedule_base64_url
         ) AS schedule_base64_url,
-        COALESCE(tu.trip_id, vp.trip_id, sa.trip_id) AS trip_id,
-        COALESCE(tu.calculated_iteration_num, vp.calculated_iteration_num, sa.calculated_iteration_num) AS calculated_iteration_num,
+        COALESCE(
+            tu.trip_id,
+            vp.trip_id
+            --, sa.trip_id
+        ) AS trip_id,
+        COALESCE(
+            tu.calculated_iteration_num,
+            vp.calculated_iteration_num
+            --, sa.calculated_iteration_num
+        ) AS calculated_iteration_num,

         COALESCE(tu.warning_multiple_route_ids, FALSE)
             OR COALESCE(vp.warning_multiple_route_ids, FALSE)
-            OR COALESCE(sa.warning_multiple_route_ids, FALSE)
+            -- OR COALESCE(sa.warning_multiple_route_ids, FALSE)
             OR COALESCE((tu.trip_route_ids != COALESCE(vp.trip_route_ids, tu.trip_route_ids)), FALSE)
-            OR COALESCE((tu.trip_route_ids != COALESCE(sa.trip_route_ids, tu.trip_route_ids)), FALSE)
-            OR COALESCE((vp.trip_route_ids != COALESCE(sa.trip_route_ids, vp.trip_route_ids)), FALSE) AS warning_multiple_route_ids,
+            -- OR COALESCE((tu.trip_route_ids != COALESCE(sa.trip_route_ids, tu.trip_route_ids)), FALSE)
+            -- OR COALESCE((vp.trip_route_ids != COALESCE(sa.trip_route_ids, vp.trip_route_ids)), FALSE)
+            AS warning_multiple_route_ids,

-        tu.warning_multiple_direction_ids
+        -- tu.* is NULL for trips that only appear in vehicle positions (FULL OUTER JOIN), so default to FALSE like the route check above
+        COALESCE(tu.warning_multiple_direction_ids, FALSE)
             OR COALESCE(vp.warning_multiple_direction_ids, FALSE)
-            OR COALESCE(sa.warning_multiple_direction_ids, FALSE)
+            -- OR COALESCE(sa.warning_multiple_direction_ids, FALSE)
             OR COALESCE((tu.trip_direction_ids != COALESCE(vp.trip_direction_ids, tu.trip_direction_ids)), FALSE)
-            OR COALESCE((tu.trip_direction_ids != COALESCE(sa.trip_direction_ids, tu.trip_direction_ids)), FALSE)
-            OR COALESCE((vp.trip_direction_ids != COALESCE(sa.trip_direction_ids, vp.trip_direction_ids)), FALSE) AS warning_multiple_direction_ids,
+            -- OR COALESCE((tu.trip_direction_ids != COALESCE(sa.trip_direction_ids, tu.trip_direction_ids)), FALSE)
+            -- OR COALESCE((vp.trip_direction_ids != COALESCE(sa.trip_direction_ids, vp.trip_direction_ids)), FALSE)
+            AS warning_multiple_direction_ids,

         -- trip updates facts
         tu.trip_start_time AS tu_trip_start_time,
@@ -99,8 +120,12 @@ fct_observed_trips AS (
         vp.last_position_latitude AS vp_last_position_latitude,
         vp.last_position_longitude AS vp_last_position_longitude,

+        -- suppress alerts for now because at time of writing (6/30/23) the only feed with trip-level alerts is the MTC regional feed
+        -- and that feed does not namespace trip IDs by agency
+        -- so the alerts fail to join with the same trip from other feed types
+        -- for example, in the alerts feed, there will be trip_id 117 for Caltrain, which has trip_id CT:117 in trip updates/vehicle positions/schedule
         -- service alerts facts
-        sa.trip_start_time AS sa_trip_start_time,
+        {# sa.trip_start_time AS sa_trip_start_time,
         sa.trip_start_date AS sa_trip_start_date,
         sa.warning_multiple_route_ids AS sa_warning_multiple_route_ids,
         sa.warning_multiple_direction_ids AS sa_warning_multiple_direction_ids,
@@ -116,18 +141,126 @@ fct_observed_trips AS (
         sa.min_header_timestamp AS sa_min_header_timestamp,
         sa.max_header_timestamp AS sa_max_header_timestamp,
         sa.num_distinct_header_timestamps AS sa_num_distinct_header_timestamps,
-        sa.alert_content_array AS sa_alert_content_array,
+        sa.alert_content_array AS sa_alert_content_array, #}

         -- keying
         tu.base64_url AS tu_base64_url,
         vp.base64_url AS vp_base64_url,
-        sa.base64_url AS sa_base64_url,
-
+        {# sa.base64_url AS sa_base64_url, #}
     FROM trip_updates AS tu
     FULL OUTER JOIN vehicle_positions AS vp
         USING (trip_instance_key)
-    FULL OUTER JOIN service_alerts AS sa
-        USING (trip_instance_key)
+    {# FULL OUTER JOIN service_alerts AS sa
+        USING (trip_instance_key) #}
+
+),
+
+-- attach GTFS dataset names and keys for each realtime feed and for the schedule feed on the service date
+fct_observed_trips AS (
+    SELECT
+        trip_instance_key,
+        calculated_service_date,
+        schedule_base64_url,
+        trip_id,
+        calculated_iteration_num,
+        tu_datasets.name AS tu_name,
+        vp_datasets.name AS vp_name,
+        -- sa_datasets.name AS sa_name,
+        schedule.gtfs_dataset_name AS schedule_name,
+
+        warning_multiple_route_ids,
+        warning_multiple_direction_ids,
+
+        -- trip updates facts
+        tu_trip_start_time,
+        tu_trip_start_date,
+        tu_warning_multiple_route_ids,
+        tu_warning_multiple_direction_ids,
+        tu_starting_schedule_relationship,
+        tu_ending_schedule_relationship,
+        tu_trip_schedule_relationships,
+        tu_num_distinct_message_ids,
+        tu_min_extract_ts,
+        tu_max_extract_ts,
+        tu_num_distinct_extract_ts,
+        tu_trip_route_ids,
+        tu_trip_direction_ids,
+        tu_min_header_timestamp,
+        tu_max_header_timestamp,
+        tu_num_distinct_header_timestamps,
+        tu_min_trip_update_timestamp,
+        tu_max_trip_update_timestamp,
+        tu_num_distinct_trip_update_timestamps,
+        tu_max_delay,
+        tu_num_skipped_stops,
+        tu_num_scheduled_canceled_added_stops,
+
+        -- vehicle positions facts
+        vp_trip_start_time,
+        vp_trip_start_date,
+        vp_warning_multiple_route_ids,
+        vp_warning_multiple_direction_ids,
+        vp_starting_schedule_relationship,
+        vp_ending_schedule_relationship,
+        vp_trip_schedule_relationships,
+        vp_num_distinct_message_ids,
+        vp_min_extract_ts,
+        vp_max_extract_ts,
+        vp_num_distinct_extract_ts,
+        vp_trip_route_ids,
+        vp_trip_direction_ids,
+        vp_min_header_timestamp,
+        vp_max_header_timestamp,
+        vp_num_distinct_header_timestamps,
+        vp_min_vehicle_timestamp,
+        vp_max_vehicle_timestamp,
+        vp_num_distinct_vehicle_timestamps,
+        vp_first_position_latitude,
+        vp_first_position_longitude,
+        vp_last_position_latitude,
+        vp_last_position_longitude,
+
+        -- service alerts facts
+        {# sa_trip_start_time,
+        sa_trip_start_date,
+        sa_warning_multiple_route_ids,
+        sa_warning_multiple_direction_ids,
+        sa_starting_schedule_relationship,
+        sa_ending_schedule_relationship,
+        sa_trip_schedule_relationships,
+        sa_num_distinct_message_ids,
+        sa_min_extract_ts,
+        sa_max_extract_ts,
+        sa_num_distinct_extract_ts,
+        sa_trip_route_ids,
+        sa_trip_direction_ids,
+        sa_min_header_timestamp,
+        sa_max_header_timestamp,
+        sa_num_distinct_header_timestamps,
+        sa_alert_content_array, #}
+
+        -- keying
+        tu_base64_url,
+        vp_base64_url,
+        {# sa_base64_url,
+        sa_datasets.key AS sa_gtfs_dataset_key, #}
+        tu_datasets.key AS tu_gtfs_dataset_key,
+        vp_datasets.key AS vp_gtfs_dataset_key,
+        schedule.gtfs_dataset_key AS schedule_gtfs_dataset_key,
+    FROM rt_joins
+    {# LEFT JOIN dim_gtfs_datasets AS sa_datasets
+        ON rt_joins.sa_base64_url = sa_datasets.base64_url
+        AND rt_joins.sa_min_extract_ts BETWEEN sa_datasets._valid_from AND sa_datasets._valid_to #}
+    -- pick the dataset record whose _valid_from/_valid_to window contains the feed's minimum extract timestamp
+    LEFT JOIN dim_gtfs_datasets AS tu_datasets
+        ON rt_joins.tu_base64_url = tu_datasets.base64_url
+        AND rt_joins.tu_min_extract_ts BETWEEN tu_datasets._valid_from AND tu_datasets._valid_to
+    LEFT JOIN dim_gtfs_datasets AS vp_datasets
+        ON rt_joins.vp_base64_url = vp_datasets.base64_url
+        AND rt_joins.vp_min_extract_ts BETWEEN vp_datasets._valid_from AND vp_datasets._valid_to
+    LEFT JOIN fct_daily_schedule_feeds AS schedule
+        ON rt_joins.calculated_service_date = schedule.date
+        AND rt_joins.schedule_base64_url = schedule.base64_url
 )

 SELECT * FROM fct_observed_trips
diff --git a/warehouse/models/mart/gtfs/fct_vehicle_locations.sql b/warehouse/models/mart/gtfs/fct_vehicle_locations.sql
index d6eda6b9ad..250654e480 100644
--- a/warehouse/models/mart/gtfs/fct_vehicle_locations.sql
+++ b/warehouse/models/mart/gtfs/fct_vehicle_locations.sql
@@ -28,7 +28,14 @@ first_keying_and_filtering AS (
         {{ dbt_utils.generate_surrogate_key(['calculated_service_date', 'base64_url', 'location_timestamp', 'vehicle_id', 'vehicle_label', 'trip_id', 'trip_start_time']) }} AS key,
         {{ dbt_utils.generate_surrogate_key(['calculated_service_date', 'base64_url', 'vehicle_id', 'vehicle_label', 'trip_id', 'trip_start_time']) }} AS vehicle_trip_key
     FROM fct_vehicle_positions_messages
+    -- drop cases where trip id is null since these cannot be joined to schedule
+    -- this is something we may want to reconsider
+    -- TODO: theoretically we need to eventually support route / direction / start date / start time as an alternate trip identifier
     WHERE trip_id IS NOT NULL
+    -- we originally dropped the Bay Area regional feed because they don't make their vehicle identifiers unique by agency
+    -- so you can end up intermingling multiple vehicles
+    -- however, not clear this issue remains if we are also dropping rows with no trip
+    -- since regional feed does have unique trip IDs per agency
         AND name != 'Bay Area 511 Regional VehiclePositions'
 ),
