Skip to content

Commit

Permalink
Refactor unnesting for service alerts (#2583)
Browse files Browse the repository at this point in the history
* new unnesting for service alerts

* clean up unnesting and add yaml

* delete old mart tables and replace them with new unnested table
  • Loading branch information
lauriemerrell authored May 12, 2023
1 parent e638859 commit 3a9ac95
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 162 deletions.
11 changes: 11 additions & 0 deletions warehouse/models/intermediate/gtfs/_int_gtfs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -524,3 +524,14 @@ models:
This means they will fail to join with `stop` related information.
- name: base64_url
- name: _feed_valid_from
- name: int_gtfs_rt__service_alerts_fully_unnested
description: |
This table contains GTFS RT service alerts messages with all elements (informed entities,
active periods, and translations) unnested, so each row is a message / entity / active period /
translation combination.
See: https://gtfs.org/realtime/reference/#message-alert for field definitions.
columns:
- name: english_likelihood
description: |
100 if English language, 1 if null (null can mean no internationalization),
0 for other languages. Based on `header_text.language`.
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,42 @@ WITH messages AS (
FROM {{ ref('fct_service_alerts_messages') }}
),

fct_service_alert_translations AS (
int_gtfs_rt__service_alerts_fully_unnested AS (
SELECT
messages.* EXCEPT(
key,
header_text,
description_text,
tts_header_text,
tts_description_text,
url
),
key AS service_alert_message_key,
gtfs_dataset_key,
dt,
hour,
_extract_ts,
header_timestamp,
base64_url,
_config_extract_ts,
_gtfs_dataset_name,
_header_message_age,
header_version,
header_incrementality,
id,
cause,
effect,

{{ dbt_utils.generate_surrogate_key(['key',
'unnested_header_text_translation.text',
'unnested_header_text_translation.language']) }} AS key,
-- active periods
unnested_active_period.start AS active_period_start,
unnested_active_period.end AS active_period_end,

-- informted entities
unnested_informed_entity.agencyId AS agency_id,
unnested_informed_entity.routeId AS route_id,
unnested_informed_entity.routeType AS route_type,
unnested_informed_entity.directionId AS direction_id,
unnested_informed_entity.trip.tripId AS trip_id,
unnested_informed_entity.trip.routeId AS trip_route_id,
unnested_informed_entity.trip.directionId AS trip_direction_id,
unnested_informed_entity.trip.startTime AS trip_start_time,
unnested_informed_entity.trip.startDate AS trip_start_date,
unnested_informed_entity.trip.scheduleRelationship AS trip_schedule_relationship,
unnested_informed_entity.stopId AS stop_id,

-- text (translations)
unnested_header_text_translation.text AS header_text_text,
unnested_header_text_translation.language AS header_text_language,

Expand All @@ -34,12 +54,16 @@ fct_service_alert_translations AS (
unnested_url_translation.text AS url_text,
unnested_url_translation.language AS url_language,

-- try to assess which rows are English versions (if available);
-- we don't want to double count alerts when multiple lanugages are present
CASE
WHEN unnested_header_text_translation.language LIKE "%en%" THEN 100
WHEN unnested_header_text_translation.language IS NULL THEN 1
ELSE 0
END AS english_likelihood
FROM messages
LEFT JOIN UNNEST(messages.informed_entity) AS unnested_informed_entity
LEFT JOIN UNNEST(messages.active_period) AS unnested_active_period
-- https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function
-- these arrays may have nulls
LEFT JOIN UNNEST(messages.header_text.translation) AS unnested_header_text_translation
Expand All @@ -54,4 +78,4 @@ fct_service_alert_translations AS (
AND COALESCE(unnested_url_translation.language, unnested_header_text_translation.language, 'language') = COALESCE(unnested_header_text_translation.language, 'language')
)

SELECT * FROM fct_service_alert_translations
SELECT * FROM int_gtfs_rt__service_alerts_fully_unnested
161 changes: 83 additions & 78 deletions warehouse/models/mart/gtfs/_mart_gtfs_fcts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -913,96 +913,60 @@ models:
Synthetic primary key constructed from `service_alerts_message_key`,
`active_period_start`, and `active_period_end`.
tests: *rt_primary_key_tests
- &service_alert_message_key
name: service_alert_message_key
- name: fct_service_alerts_messages_unnested
description: |
This table contains GTFS RT service alerts messages with all elements (informed entities,
active periods, and translations) unnested, so each row is a message / entity / active period /
translation combination. It has been filtered so that only one translation (the one with highest likelihood
of being in English) appears per message.
Therefore one row here should correspond to one actual "alert" (counting alerts
that apply to different entities or different time periods as distinct.)
See: https://gtfs.org/realtime/reference/#message-alert for field definitions.
columns:
- name: key
description: |
The primary key for the record in `dim_gtfs_datasets` associated with this message.
Synthetic primary key constructed from `service_alerts_message_key` along with the active period
and informed entities.
tests: *rt_primary_key_tests
- name: service_alert_message_key
description: |
The primary key for the message in `fct_service_alerts_messages`.
tests:
- not_null:
config:
where: '__rt_sampled__'
# todo: figure out how to get this to work; it's erroring on the partition elimination
# - dbt_utils.relationships_where:
# this doesn't work because we would need to do partition elimination on fct_service_alerts_messages
# TODO: create a custom relationships_where to handle the partition elimination
# - relationships:
# to: ref('fct_service_alerts_messages')
# field: key
# config:
# to_condition: dt >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
# from_condition: dt >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
- name: dt
- name: active_period_start
description: |
See: https://gtfs.org/realtime/reference/#message-timerange.
- name: active_period_end
description: |
See: https://gtfs.org/realtime/reference/#message-timerange.
- name: fct_service_alert_informed_entities
description: |
Each row is an informed entity for a service alerts message.
See https://gtfs.org/realtime/reference/#message-alert for information
about message structure.
Due to data size, this table **must** be queried with a date filter (like `WHERE dt = 'YYYY-MM-DD'`).
Hour filters will also further improve performance.
columns:
- name: key
# where: '__rt_sampled__'
- name: gtfs_dataset_key
description: |
Synthetic primary key constructed from `service_alerts_message_key`,
`agency_id`, `trip_id`, `route_id`, and `stop_id`.
tests: &service_alert_key_tests
- unique_proportion:
at_least: 0.98
where: '__rt_sampled__'
- not_null:
The primary key for the record in `dim_gtfs_datasets` associated with this message.
columns:
- <<: *gtfs_dataset_key
config:
where: '__rt_sampled__'
- *service_alert_message_key
- name: dt
- name: agency_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_type
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: direction_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: trip_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_id
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_direction_id
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_start_time
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_start_date
Date on which we scraped this message.
A date filter *must* be provided when querying this table, because of the size of the data.
- name: hour
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_schedule_relationship
Timestamp of the beginning of the hour in which this message was scraped,
ex. "2022-09-01T00:00:00+00".
- name: base64_url
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: stop_id
URL-safe base64 encoding of the URL from which this message was scraped.
- name: _extract_ts
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: fct_service_alert_translations
description: |
Each row is a translation for a service alerts message in a given language.
See https://gtfs.org/realtime/reference/#message-alert for information
about message structure.
Due to data size, this table **must** be queried with a date filter (like `WHERE dt = 'YYYY-MM-DD'`).
Hour filters will also further improve performance.
columns:
- name: key
Time at which this message was scraped.
- name: _gtfs_dataset_name
description: |
Synthetic primary key constructed from `service_alerts_message_key`,
`header_text_text`, and `header_text_language`.
tests: *rt_primary_key_tests
- *service_alert_message_key
- name: dt
String name of the GTFS dataset of which this message is a part.
This field is provided for human readability and should not be used as a join key.
- name: url_text
description: |
See: https://gtfs.org/realtime/reference/#message-translation.
Expand Down Expand Up @@ -1033,10 +997,46 @@ models:
- name: tts_description_text_language
description: |
See: https://gtfs.org/realtime/reference/#message-translation.
- name: english_likelihood
- name: agency_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_type
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: direction_id
description: |
100 if English language, 1 if null (null can mean no internationalization),
0 for other languages.
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: trip_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: route_id
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_direction_id
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_start_time
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_start_date
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: trip_schedule_relationship
description: |
See: https://gtfs.org/realtime/reference/#message-tripdescriptor.
- name: stop_id
description: |
See: https://gtfs.org/realtime/reference/#message-entityselector.
- name: active_period_start
description: |
See: https://gtfs.org/realtime/reference/#message-timerange.
- name: active_period_end
description: |
See: https://gtfs.org/realtime/reference/#message-timerange.
- *_header_message_age
- name: fct_daily_service_alerts
description: |
Each row is a daily summary of a service alert.
Expand All @@ -1047,7 +1047,12 @@ models:
description: |
Synthetic primary key constructed from `dt`,
`base64_url`, and entity `id`.
tests: *service_alert_key_tests
tests:
- unique_proportion:
at_least: 0.98
where: '__rt_sampled__'
- not_null:
where: '__rt_sampled__'
- name: first_header_timestamp
description: |
Earliest header timestamp at which this alert appeared on this date.
Expand Down
17 changes: 4 additions & 13 deletions warehouse/models/mart/gtfs/fct_daily_service_alerts.sql
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,11 @@
},
) }}

WITH fct_service_alert_translations AS (
SELECT * FROM {{ ref('fct_service_alert_translations') }}
WITH fct_service_alerts_messages_unnested AS (
SELECT * FROM {{ ref('fct_service_alerts_messages_unnested') }}
WHERE {{ gtfs_rt_dt_where() }}
),

select_english AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY service_alert_message_key
ORDER BY english_likelihood DESC, header_text_language ASC) AS english_rank
FROM fct_service_alert_translations
QUALIFY english_rank = 1
),

fct_daily_service_alerts AS (
SELECT
{{ dbt_utils.generate_surrogate_key(['dt', 'base64_url', 'id', 'header_text_text']) }} AS key,
Expand All @@ -35,8 +26,8 @@ fct_daily_service_alerts AS (
description_text_text,
MIN(header_timestamp) AS first_header_timestamp,
MAX(header_timestamp) AS last_header_timestamp,
COUNT(*) AS num_appearances
FROM select_english
COUNT(DISTINCT service_alert_message_key) AS num_appearances
FROM fct_service_alerts_messages_unnested
GROUP BY dt, gtfs_dataset_key, base64_url, id, cause, effect, header_text_text,
description_text_text
)
Expand Down
22 changes: 0 additions & 22 deletions warehouse/models/mart/gtfs/fct_service_alert_active_periods.sql

This file was deleted.

35 changes: 0 additions & 35 deletions warehouse/models/mart/gtfs/fct_service_alert_informed_entities.sql

This file was deleted.

Loading

0 comments on commit 3a9ac95

Please sign in to comment.