diff --git a/warehouse/models/intermediate/gtfs/_int_gtfs.yaml b/warehouse/models/intermediate/gtfs/_int_gtfs.yaml index 070e521b1a..bd1549c29a 100644 --- a/warehouse/models/intermediate/gtfs/_int_gtfs.yaml +++ b/warehouse/models/intermediate/gtfs/_int_gtfs.yaml @@ -524,3 +524,14 @@ models: This means they will fail to join with `stop` related information. - name: base64_url - name: _feed_valid_from + - name: int_gtfs_rt__service_alerts_fully_unnested + description: | + This table contains GTFS RT service alerts messages with all elements (informed entities, + active periods, and translations) unnested, so each row is a message / entity / active period / + translation combination. + See: https://gtfs.org/realtime/reference/#message-alert for field definitions. + columns: + - name: english_likelihood + description: | + 100 if English language, 1 if null (null can mean no internationalization), + 0 for other languages. Based on `header_text.language`. diff --git a/warehouse/models/mart/gtfs/fct_service_alert_translations.sql b/warehouse/models/intermediate/gtfs/int_gtfs_rt__service_alerts_fully_unnested.sql similarity index 61% rename from warehouse/models/mart/gtfs/fct_service_alert_translations.sql rename to warehouse/models/intermediate/gtfs/int_gtfs_rt__service_alerts_fully_unnested.sql index 05d5ee057d..7c1531f17e 100644 --- a/warehouse/models/mart/gtfs/fct_service_alert_translations.sql +++ b/warehouse/models/intermediate/gtfs/int_gtfs_rt__service_alerts_fully_unnested.sql @@ -3,22 +3,42 @@ WITH messages AS ( FROM {{ ref('fct_service_alerts_messages') }} ), -fct_service_alert_translations AS ( +int_gtfs_rt__service_alerts_fully_unnested AS ( SELECT - messages.* EXCEPT( - key, - header_text, - description_text, - tts_header_text, - tts_description_text, - url - ), key AS service_alert_message_key, + gtfs_dataset_key, + dt, + hour, + _extract_ts, + header_timestamp, + base64_url, + _config_extract_ts, + _gtfs_dataset_name, + _header_message_age, + header_version, + header_incrementality, + id, + cause, + effect, - {{ dbt_utils.generate_surrogate_key(['key', - 'unnested_header_text_translation.text', - 'unnested_header_text_translation.language']) }} AS key, + -- active periods + unnested_active_period.start AS active_period_start, + unnested_active_period.end AS active_period_end, + -- informted entities + unnested_informed_entity.agencyId AS agency_id, + unnested_informed_entity.routeId AS route_id, + unnested_informed_entity.routeType AS route_type, + unnested_informed_entity.directionId AS direction_id, + unnested_informed_entity.trip.tripId AS trip_id, + unnested_informed_entity.trip.routeId AS trip_route_id, + unnested_informed_entity.trip.directionId AS trip_direction_id, + unnested_informed_entity.trip.startTime AS trip_start_time, + unnested_informed_entity.trip.startDate AS trip_start_date, + unnested_informed_entity.trip.scheduleRelationship AS trip_schedule_relationship, + unnested_informed_entity.stopId AS stop_id, + + -- text (translations) unnested_header_text_translation.text AS header_text_text, unnested_header_text_translation.language AS header_text_language, @@ -34,12 +54,16 @@ fct_service_alert_translations AS ( unnested_url_translation.text AS url_text, unnested_url_translation.language AS url_language, + -- try to assess which rows are English versions (if available); + -- we don't want to double count alerts when multiple lanugages are present CASE WHEN unnested_header_text_translation.language LIKE "%en%" THEN 100 WHEN unnested_header_text_translation.language IS NULL THEN 1 ELSE 0 END AS english_likelihood FROM messages + LEFT JOIN UNNEST(messages.informed_entity) AS unnested_informed_entity + LEFT JOIN UNNEST(messages.active_period) AS unnested_active_period -- https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function -- these arrays may have nulls LEFT JOIN UNNEST(messages.header_text.translation) AS unnested_header_text_translation @@ -54,4 +78,4 @@ fct_service_alert_translations AS ( AND COALESCE(unnested_url_translation.language, unnested_header_text_translation.language, 'language') = COALESCE(unnested_header_text_translation.language, 'language') ) -SELECT * FROM fct_service_alert_translations +SELECT * FROM int_gtfs_rt__service_alerts_fully_unnested diff --git a/warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml b/warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml index 67af1d8d55..d009b225fb 100644 --- a/warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml +++ b/warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml @@ -913,96 +913,60 @@ models: Synthetic primary key constructed from `service_alerts_message_key`, `active_period_start`, and `active_period_end`. tests: *rt_primary_key_tests - - &service_alert_message_key - name: service_alert_message_key + - name: fct_service_alerts_messages_unnested + description: | + This table contains GTFS RT service alerts messages with all elements (informed entities, + active periods, and translations) unnested, so each row is a message / entity / active period / + translation combination. It has been filtered so that only one translation (the one with highest likelihood + of being in English) appears per message. + Therefore one row here should correspond to one actual "alert" (counting alerts + that apply to different entities or different time periods as distinct.) + See: https://gtfs.org/realtime/reference/#message-alert for field definitions. + columns: + - name: key description: | - The primary key for the record in `dim_gtfs_datasets` associated with this message. + Synthetic primary key constructed from `service_alerts_message_key` along with the active period + and informed entities. + tests: *rt_primary_key_tests + - name: service_alert_message_key + description: | + The primary key for the message in `fct_service_alerts_messages`. tests: - not_null: config: where: '__rt_sampled__' - # todo: figure out how to get this to work; it's erroring on the partition elimination - # - dbt_utils.relationships_where: + # this doesn't work because we would need to do partition elimination on fct_service_alerts_messages + # TODO: create a custom relationships_where to handle the partition elimination + # - relationships: # to: ref('fct_service_alerts_messages') # field: key # config: - # to_condition: dt >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - # from_condition: dt >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - - name: dt - - name: active_period_start - description: | - See: https://gtfs.org/realtime/reference/#message-timerange. - - name: active_period_end - description: | - See: https://gtfs.org/realtime/reference/#message-timerange. - - name: fct_service_alert_informed_entities - description: | - Each row is an informed entity for a service alerts message. - See https://gtfs.org/realtime/reference/#message-alert for information - about message structure. - Due to data size, this table **must** be queried with a date filter (like `WHERE dt = 'YYYY-MM-DD'`). - Hour filters will also further improve performance. - columns: - - name: key + # where: '__rt_sampled__' + - name: gtfs_dataset_key description: | - Synthetic primary key constructed from `service_alerts_message_key`, - `agency_id`, `trip_id`, `route_id`, and `stop_id`. - tests: &service_alert_key_tests - - unique_proportion: - at_least: 0.98 - where: '__rt_sampled__' - - not_null: + The primary key for the record in `dim_gtfs_datasets` associated with this message. + columns: + - <<: *gtfs_dataset_key + config: where: '__rt_sampled__' - - *service_alert_message_key - name: dt - - name: agency_id description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: route_id - description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: route_type - description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: direction_id - description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: trip_id - description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: route_id - description: | - See: https://gtfs.org/realtime/reference/#message-tripdescriptor. - - name: trip_direction_id - description: | - See: https://gtfs.org/realtime/reference/#message-tripdescriptor. - - name: trip_start_time - description: | - See: https://gtfs.org/realtime/reference/#message-tripdescriptor. - - name: trip_start_date + Date on which we scraped this message. + A date filter *must* be provided when querying this table, because of the size of the data. + - name: hour description: | - See: https://gtfs.org/realtime/reference/#message-tripdescriptor. - - name: trip_schedule_relationship + Timestamp of the beginning of the hour in which this message was scraped, + ex. "2022-09-01T00:00:00+00". + - name: base64_url description: | - See: https://gtfs.org/realtime/reference/#message-tripdescriptor. - - name: stop_id + URL-safe base64 encoding of the URL from which this message was scraped. + - name: _extract_ts description: | - See: https://gtfs.org/realtime/reference/#message-entityselector. - - name: fct_service_alert_translations - description: | - Each row is a translation for a service alerts message in a given language. - See https://gtfs.org/realtime/reference/#message-alert for information - about message structure. - Due to data size, this table **must** be queried with a date filter (like `WHERE dt = 'YYYY-MM-DD'`). - Hour filters will also further improve performance. - columns: - - name: key + Time at which this message was scraped. + - name: _gtfs_dataset_name description: | - Synthetic primary key constructed from `service_alerts_message_key`, - `header_text_text`, and `header_text_language`. - tests: *rt_primary_key_tests - - *service_alert_message_key - - name: dt + String name of the GTFS dataset of which this message is a part. + This field is provided for human readability and should not be used as a join key. - name: url_text description: | See: https://gtfs.org/realtime/reference/#message-translation. @@ -1033,10 +997,46 @@ models: - name: tts_description_text_language description: | See: https://gtfs.org/realtime/reference/#message-translation. - - name: english_likelihood + - name: agency_id + description: | + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: route_id + description: | + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: route_type + description: | + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: direction_id description: | - 100 if English language, 1 if null (null can mean no internationalization), - 0 for other languages. + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: trip_id + description: | + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: route_id + description: | + See: https://gtfs.org/realtime/reference/#message-tripdescriptor. + - name: trip_direction_id + description: | + See: https://gtfs.org/realtime/reference/#message-tripdescriptor. + - name: trip_start_time + description: | + See: https://gtfs.org/realtime/reference/#message-tripdescriptor. + - name: trip_start_date + description: | + See: https://gtfs.org/realtime/reference/#message-tripdescriptor. + - name: trip_schedule_relationship + description: | + See: https://gtfs.org/realtime/reference/#message-tripdescriptor. + - name: stop_id + description: | + See: https://gtfs.org/realtime/reference/#message-entityselector. + - name: active_period_start + description: | + See: https://gtfs.org/realtime/reference/#message-timerange. + - name: active_period_end + description: | + See: https://gtfs.org/realtime/reference/#message-timerange. + - *_header_message_age - name: fct_daily_service_alerts description: | Each row is a daily summary of a service alert. @@ -1047,7 +1047,12 @@ models: description: | Synthetic primary key constructed from `dt`, `base64_url`, and entity `id`. - tests: *service_alert_key_tests + tests: + - unique_proportion: + at_least: 0.98 + where: '__rt_sampled__' + - not_null: + where: '__rt_sampled__' - name: first_header_timestamp description: | Earliest header timestamp at which this alert appeared on this date. diff --git a/warehouse/models/mart/gtfs/fct_daily_service_alerts.sql b/warehouse/models/mart/gtfs/fct_daily_service_alerts.sql index a013eea410..89e20f5120 100644 --- a/warehouse/models/mart/gtfs/fct_daily_service_alerts.sql +++ b/warehouse/models/mart/gtfs/fct_daily_service_alerts.sql @@ -8,20 +8,11 @@ }, ) }} -WITH fct_service_alert_translations AS ( - SELECT * FROM {{ ref('fct_service_alert_translations') }} +WITH fct_service_alerts_messages_unnested AS ( + SELECT * FROM {{ ref('fct_service_alerts_messages_unnested') }} WHERE {{ gtfs_rt_dt_where() }} ), -select_english AS ( - SELECT - *, - ROW_NUMBER() OVER (PARTITION BY service_alert_message_key - ORDER BY english_likelihood DESC, header_text_language ASC) AS english_rank - FROM fct_service_alert_translations - QUALIFY english_rank = 1 -), - fct_daily_service_alerts AS ( SELECT {{ dbt_utils.generate_surrogate_key(['dt', 'base64_url', 'id', 'header_text_text']) }} AS key, @@ -35,8 +26,8 @@ fct_daily_service_alerts AS ( description_text_text, MIN(header_timestamp) AS first_header_timestamp, MAX(header_timestamp) AS last_header_timestamp, - COUNT(*) AS num_appearances - FROM select_english + COUNT(DISTINCT service_alert_message_key) AS num_appearances + FROM fct_service_alerts_messages_unnested GROUP BY dt, gtfs_dataset_key, base64_url, id, cause, effect, header_text_text, description_text_text ) diff --git a/warehouse/models/mart/gtfs/fct_service_alert_active_periods.sql b/warehouse/models/mart/gtfs/fct_service_alert_active_periods.sql deleted file mode 100644 index ff97b33de2..0000000000 --- a/warehouse/models/mart/gtfs/fct_service_alert_active_periods.sql +++ /dev/null @@ -1,22 +0,0 @@ -WITH messages AS ( - SELECT * - FROM {{ ref('fct_service_alerts_messages') }} -), - -fct_service_alerts_active_periods AS ( - SELECT - messages.* EXCEPT( - key, - active_period - ), - key AS service_alert_message_key, - {{ dbt_utils.generate_surrogate_key(['key', 'unnested_active_period.start', 'unnested_active_period.end']) }} AS key, - unnested_active_period.start AS active_period_start, - unnested_active_period.end AS active_period_end, - FROM messages - -- https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function - -- these arrays may have nulls - LEFT JOIN UNNEST(messages.active_period) AS unnested_active_period -) - -SELECT * FROM fct_service_alerts_active_periods diff --git a/warehouse/models/mart/gtfs/fct_service_alert_informed_entities.sql b/warehouse/models/mart/gtfs/fct_service_alert_informed_entities.sql deleted file mode 100644 index bdcab9e3f1..0000000000 --- a/warehouse/models/mart/gtfs/fct_service_alert_informed_entities.sql +++ /dev/null @@ -1,35 +0,0 @@ -WITH messages AS ( - SELECT * - FROM {{ ref('fct_service_alerts_messages') }} -), - -fct_service_alerts_informed_entities AS ( - SELECT - messages.* EXCEPT( - key, - informed_entity - ), - key AS service_alert_message_key, - - {{ dbt_utils.generate_surrogate_key(['key', 'unnested_informed_entity.agencyId', - 'unnested_informed_entity.routeId', 'unnested_informed_entity.trip.tripId', - 'unnested_informed_entity.stopId']) }} AS key, - - unnested_informed_entity.agencyId AS agency_id, - unnested_informed_entity.routeId AS route_id, - unnested_informed_entity.routeType AS route_type, - unnested_informed_entity.directionId AS direction_id, - unnested_informed_entity.trip.tripId AS trip_id, - unnested_informed_entity.trip.routeId AS trip_route_id, - unnested_informed_entity.trip.directionId AS trip_direction_id, - unnested_informed_entity.trip.startTime AS trip_start_time, - unnested_informed_entity.trip.startDate AS trip_start_date, - unnested_informed_entity.trip.scheduleRelationship AS trip_schedule_relationship, - unnested_informed_entity.stopId AS stop_id - FROM messages - -- https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function - -- these arrays may have nulls - LEFT JOIN UNNEST(messages.informed_entity) AS unnested_informed_entity -) - -SELECT * FROM fct_service_alerts_informed_entities diff --git a/warehouse/models/mart/gtfs/fct_service_alerts_messages_unnested.sql b/warehouse/models/mart/gtfs/fct_service_alerts_messages_unnested.sql new file mode 100644 index 0000000000..5316762b10 --- /dev/null +++ b/warehouse/models/mart/gtfs/fct_service_alerts_messages_unnested.sql @@ -0,0 +1,100 @@ +{{ config( + materialized='incremental', + incremental_strategy='insert_overwrite', + partition_by = { + 'field': 'dt', + 'data_type': 'date', + 'granularity': 'day', + }, + cluster_by = 'base64_url' +) }} + +WITH int_gtfs_rt__service_alerts_fully_unnested AS ( + SELECT * FROM {{ ref('int_gtfs_rt__service_alerts_fully_unnested') }} + WHERE {{ gtfs_rt_dt_where() }} +), + +select_english AS ( + SELECT + *, + ROW_NUMBER() OVER (PARTITION BY + service_alert_message_key, + active_period_start, + active_period_end, + agency_id, + route_id, + direction_id, + trip_id, + trip_start_date, + trip_start_time, + stop_id + ORDER BY english_likelihood DESC, header_text_language ASC) AS english_rank + FROM int_gtfs_rt__service_alerts_fully_unnested + QUALIFY english_rank = 1 +), + +fct_service_alerts_messages_unnested AS ( + SELECT + {{ dbt_utils.generate_surrogate_key([ + 'service_alert_message_key', + 'active_period_start', + 'active_period_end', + 'agency_id', + 'route_id', + 'direction_id', + 'trip_id', + 'trip_start_date', + 'trip_start_time', + 'stop_id']) }} AS key, + service_alert_message_key, + gtfs_dataset_key, + dt, + hour, + _extract_ts, + header_timestamp, + base64_url, + _config_extract_ts, + _gtfs_dataset_name, + _header_message_age, + header_version, + header_incrementality, + id, + cause, + effect, + + -- active periods + active_period_start, + active_period_end, + + -- informed entities + agency_id, + route_id, + route_type, + direction_id, + trip_id, + trip_route_id, + trip_direction_id, + trip_start_time, + trip_start_date, + trip_schedule_relationship, + stop_id, + + -- text (translations) + header_text_text, + header_text_language, + + description_text_text, + description_text_language, + + tts_header_text_text, + tts_header_text_language, + + tts_description_text_text, + tts_description_text_language, + + url_text, + url_language + FROM select_english +) + +SELECT * FROM fct_service_alerts_messages_unnested diff --git a/warehouse/models/mart/gtfs/fct_service_alerts_trip_summaries.sql b/warehouse/models/mart/gtfs/fct_service_alerts_trip_summaries.sql index 3fda0f6004..d135e37d76 100644 --- a/warehouse/models/mart/gtfs/fct_service_alerts_trip_summaries.sql +++ b/warehouse/models/mart/gtfs/fct_service_alerts_trip_summaries.sql @@ -12,7 +12,7 @@ }} WITH service_alerts AS ( - SELECT * FROM {{ ref('fct_service_alert_informed_entities') }} + SELECT * FROM {{ ref('fct_service_alerts_messages_unnested') }} WHERE {{ gtfs_rt_dt_where() }} ),