From 9a1c3a22a4f1ab5ac2d8e5aaf42803c7d68eb1b3 Mon Sep 17 00:00:00 2001
From: Paul Swartz <paul@paulswartz.net>
Date: Wed, 21 Aug 2024 11:47:10 -0400
Subject: [PATCH 1/5] doc: initial calculation/writeup of terminal schedule
 accuracy

---
 ...ght_rail_terminal_schedule_accuracy.livemd | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 reports/light_rail_terminal_schedule_accuracy.livemd

diff --git a/reports/light_rail_terminal_schedule_accuracy.livemd b/reports/light_rail_terminal_schedule_accuracy.livemd
new file mode 100644
index 0000000..699725a
--- /dev/null
+++ b/reports/light_rail_terminal_schedule_accuracy.livemd
@@ -0,0 +1,89 @@
+<!-- livebook:{"file_entries":[{"name":"2024-08-12-subway-on-time-performance-v1.parquet","type":"url","url":"https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-12-subway-on-time-performance-v1.parquet"},{"name":"2024-08-13-subway-on-time-performance-v1.parquet","type":"url","url":"https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-13-subway-on-time-performance-v1.parquet"},{"name":"2024-08-14-subway-on-time-performance-v1.parquet","type":"url","url":"https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-14-subway-on-time-performance-v1.parquet"},{"name":"2024-08-15-subway-on-time-performance-v1.parquet","type":"url","url":"https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-15-subway-on-time-performance-v1.parquet"},{"name":"2024-08-16-subway-on-time-performance-v1.parquet","type":"url","url":"https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-16-subway-on-time-performance-v1.parquet"}]} -->
+
+# Light Rail Terminal Schedule Accuracy
+
+```elixir
+Mix.install([
+  {:explorer, "~> 0.9.1"},
+  {:kino, "~> 0.13.2"}
+])
+
+```
+
+## Grab All The Data
+
+```elixir
+require Explorer.DataFrame, as: DF
+alias Explorer.Series
+
+# One business week (Mon-Fri) starting 2024-08-12; each name must match a file entry attached to this notebook.
+start_date = ~D[2024-08-12]
+range = 0..4
+files =
+  Enum.map(range, fn day_offset ->
+    iso_day = start_date |> Date.add(day_offset) |> Date.to_iso8601()
+    "#{iso_day}-subway-on-time-performance-v1.parquet"
+  end)
+
+# Load every day's parquet file, then stack the rows into one DataFrame.
+df = DF.concat_rows(for name <- files, do: DF.from_parquet!(Kino.FS.file_path(name)))
+Kino.DataTable.new(df)
+```
+
+```elixir
+require DF
+# service_date is parsed with "%Y%m%d", cast to an integer and divided by 1_000_000 (presumably microseconds -> seconds since the epoch; confirm).
+service_date_epoch = df["service_date"]
+|> Series.cast(:string)
+|> Series.strptime("%Y%m%d")
+|> Series.cast(:integer)
+|> Series.quotient(1000000)
+# -4h in seconds; presumably the US Eastern daylight-time UTC offset for these dates (confirm before reusing outside DST).
+dst_offset = -4 * 3600
+# scheduled_timestamp = service_date_epoch + scheduled_departure_time - dst_offset, i.e. the schedule shifted forward by 4 hours.
+df = DF.put(df, :service_date_epoch, service_date_epoch)
+df = DF.put(df, :scheduled_timestamp, Series.add(df["service_date_epoch"], Series.subtract(df["scheduled_departure_time"], dst_offset)))
+df = DF.put(df, :diff, Series.subtract(df["stop_timestamp"], df["scheduled_timestamp"])) # stop minus scheduled, taken before the datetime casts on the next line
+df = DF.mutate(df, stop_timestamp: cast(stop_timestamp * 1000, {:naive_datetime, :millisecond}), scheduled_timestamp: cast(scheduled_timestamp * 1000, {:naive_datetime, :millisecond})) # value * 1000 -> millisecond naive datetime
+df = DF.filter(df, trunk_route_id == "Green" and parent_station in ["place-lake", "place-clmnl", "place-river", "place-hsmnl", "place-unsqu", "place-mdftf"]) # Green trunk rows at the six listed stations only
+df = DF.mutate(df, is_accurate: diff > -90 and diff < 30) # NOTE(review): accepts diff in (-90, 30); the ETA Accuracy Benchmark window is usually given as -30s..+90s on (actual - predicted), the mirror image; confirm the intended direction
+df
+|> DF.select(["trip_id", "parent_station", "stop_sequence", "move_timestamp", "stop_timestamp", "scheduled_timestamp", "diff", "is_accurate"])
+|> DF.sort_by([asc: trip_id, asc: scheduled_timestamp])
+|> Kino.DataTable.new()
+
+```
+
+## Overall Accuracy
+
+Values are in seconds. Negative values are departures earlier than the schedule; positive values are after the schedule.
+
+```elixir
+# count/1 ignores nil diffs, so nil_pct divides by all rows (count + nil_count); accurate_pct remains a share of rows that have a diff.
+df |> DF.summarise(count: count(diff), nil_count: nil_count(diff), accurate_count: cast(sum(is_accurate), {:u, 32}), mean: mean(diff), std: standard_deviation(diff), p25: quantile(diff, 0.25), p50: median(diff), p75: quantile(diff, 0.75))
+|> DF.mutate(nil_pct: nil_count / (count + nil_count), accurate_pct: accurate_count / count)
+|> Kino.DataTable.new()
+```
+
+## Accuracy by Terminal
+
+Values are in seconds. Negative values are departures earlier than the schedule; positive values are after the schedule.
+
+```elixir
+# One row per terminal. count/1 ignores nil diffs, so nil_pct divides by all rows (count + nil_count); accurate_pct stays a share of rows that have a diff.
+df |> DF.group_by(:parent_station)
+|> DF.summarise(count: count(diff), nil_count: nil_count(diff), accurate_count: cast(sum(is_accurate), {:u, 32}), mean: mean(diff), std: standard_deviation(diff), p25: quantile(diff, 0.25), p50: median(diff), p75: quantile(diff, 0.75))
+|> DF.mutate(nil_pct: nil_count / (count + nil_count), accurate_pct: accurate_count / count)
+|> Kino.DataTable.new()
+```
+
+## Summary
+
+* 9.2% of scheduled departures would be considered "accurate" (the scheduled time falls between 30 seconds before and 90 seconds after the actual departure)
+* half of all trains leave more than 4.5 minutes earlier than the schedule
+* a quarter of trains leave later than the schedule
+* Union Square is the least accurate:
+  * half of trains leave more than 20 minutes earlier than the schedule
+  * 40% of departures do not match the schedule at all
+  * 3.1% accuracy
+* Boston College is the most variable, with a standard deviation of 36 minutes

From 015df3d4fcca95784bbd40479e2f9ca8984a41e7 Mon Sep 17 00:00:00 2001
From: Paul Swartz <paul@paulswartz.net>
Date: Wed, 21 Aug 2024 13:53:32 -0400
Subject: [PATCH 2/5] doc: Glides Full Data Analysis

---
 reports/glides_full_data_analysis.livemd | 157 +++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 reports/glides_full_data_analysis.livemd

diff --git a/reports/glides_full_data_analysis.livemd b/reports/glides_full_data_analysis.livemd
new file mode 100644
index 0000000..84ff0ea
--- /dev/null
+++ b/reports/glides_full_data_analysis.livemd
@@ -0,0 +1,157 @@
+<!-- livebook:{"file_entries":[{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/pswartz/Dropbox/0-Inbox/trip-data-analytics-2024-08-12-2024-08-16/full_data.csv"},"name":"full_data.csv","type":"file"}]} -->
+
+# Glides Full Data Analysis
+
+```elixir
+Mix.install([
+  {:explorer, "~> 0.9.1"},
+  {:kino, "~> 0.13.2"}
+])
+```
+
+## Summary
+
+(This is based on data from 2024-08-12 through 2024-08-16)
+
+Accuracy measurements are based on the [ETA Accuracy Benchmark](https://github.com/TransitApp/ETA-Accuracy-Benchmark?tab=readme-ov-file).
+
+Overall, 26.2% of scheduled trips would have had an accurate prediction based solely on the schedule. These are all treated as being in the 0 - 3 minute bucket, requiring the most accuracy.
+
+Inspector-entered data (before the trip leaves) is better. 60.2% of scheduled trips had an accurate time entered by an inspector before the trip left. This has some inaccuracies on both sides:
+
+* understated because dropped trips do not include a `final_lead_time` and we cannot be sure they were dropped before the trip would have departed, so we treat them as inaccurate
+* overstated because we put inspector-entered data into the prediction bucket appropriate for when the data was entered, not taking into account that the data would get less accurate as the actual departure approaches. If we treated all predictions as being in the 0 - 3 minute bucket, the accuracy drops to 26.3%. If we fall back to using the schedule data in cases where the inspectors do not enter data, the accuracy goes to 33.0%.
+
+33% of scheduled trips did not have a recorded departure time: it's unclear what this means, but it does limit our ability to measure inspector/schedule data against actual data.
+
+## Data
+
+Fetch `full_data.csv` from the Glides report panel and add it to the workbook as a file reference.
+
+File documentation: https://www.notion.so/mbta-downtown-crossing/Trip-Data-Analytics-Export-Field-Descriptions-71f6e0fc443f4ca5aaae18183028dd0a
+
+```elixir
+require Explorer.DataFrame, as: DF
+alias Explorer.Series
+  
+# Read the attached Glides export and order it by service date, terminal and scheduled time.
+df =
+  Kino.FS.file_path("full_data.csv")
+  |> DF.from_csv!()
+  |> DF.sort_by(asc: service_date, asc: terminal, asc: scheduled_time)
+Kino.nothing()
+```
+
+```elixir
+# Sign convention for the *_inaccuracy columns: automatic minus predicted, e.g.
+# scheduled at 9:00, automatic at 9:02 -> the vehicle was after the ETA,
+# so the value should be positive (automatic - scheduled).
+df = df
+  |> DF.mutate(
+    manual_bucket: Series.cut(^df[:final_lead_time], [-1, 2, 5, 9], labels: ["late", "0-3", "3-6", "6-10", "10+"])[:category]
+    # alternative that puts every row in a single bucket: manual_bucket: "0-3"
+  )
+  |> DF.mutate(
+  schedule_inaccuracy: automatic_time - scheduled_time, # automatic vs. scheduled
+  manual_inaccuracy: automatic_time - manual_time, # automatic vs. manual (presumably the inspector-entered time; confirm)
+  allowed_early: Series.select( # lower bound applied to manual_inaccuracy, chosen by bucket
+    manual_bucket == "10+",
+    -90,
+    Series.select(
+      manual_bucket == "6-10",
+      -60,
+      Series.select(
+        manual_bucket == "3-6",
+        -60,
+        -30 # every other bucket value gets the tightest bound
+      )
+    )
+  ),
+  allowed_late: Series.select( # upper bound applied to manual_inaccuracy, chosen by bucket
+    manual_bucket == "10+",
+    270,
+    Series.select(
+      manual_bucket == "6-10",
+      210,
+      Series.select(
+        manual_bucket == "3-6",
+        150,
+        90 # every other bucket value gets the tightest bound
+      )
+    )
+  )
+)
+|> DF.mutate(
+  is_accurate: Series.select(dropped?, # schedule-based accuracy
+    false, # dropped rows never count as accurate
+    schedule_inaccuracy <= 90 and schedule_inaccuracy >= -30)) # otherwise within [-30, 90] of the schedule
+|> DF.mutate(
+  manual_accurate: Series.select(
+    dropped?,
+    is_accurate, # dropped: reuse is_accurate, which is false for dropped rows
+    Series.select(final_lead_time >= 0, # non-negative lead time: judge manual_inaccuracy against the bucket bounds
+      manual_inaccuracy >= allowed_early and manual_inaccuracy <= allowed_late,
+      is_accurate # otherwise fall back to the schedule-based result
+    ))
+)
+df
+#|> DF.filter(final_lead_time == 3)
+|> DF.select([:service_date, :terminal, :scheduled_time, :automatic_time, :manual_time, :dropped?, :initial_lead_time, :final_lead_time, :schedule_inaccuracy, :manual_inaccuracy, :is_accurate, :manual_accurate, :manual_bucket])
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"reevaluate_automatically":true} -->
+
+```elixir
+# Overall figures. count and nil_count are both taken over automatic_time, and
+# accurate_pct divides the accurate rows by their sum.
+overall =
+  DF.summarise(df,
+    count: count(automatic_time),
+    nil_count: nil_count(automatic_time),
+    mean: mean(schedule_inaccuracy),
+    std: standard_deviation(schedule_inaccuracy),
+    p25: quantile(schedule_inaccuracy, 0.25),
+    p50: median(schedule_inaccuracy),
+    p75: quantile(schedule_inaccuracy, 0.75),
+    accurate_count: sum(is_accurate),
+    manual_count: sum(manual_accurate)
+  )
+
+summarised =
+  DF.mutate(overall, accurate_pct: round(cast(accurate_count, {:u, 32}) / (count + nil_count), 3))
+
+# Inspector accuracy: hit rate per bucket (excluding "late"), then an unweighted mean so each bucket counts equally.
+per_bucket =
+  df
+  |> DF.filter(manual_bucket != "late")
+  |> DF.group_by(:manual_bucket)
+  |> DF.summarise(size: size(manual_accurate), accurate_count: sum(manual_accurate))
+  |> DF.mutate(group_pct: round(cast(accurate_count, {:u, 32}) / size, 3))
+
+manual_pct = per_bucket |> DF.ungroup() |> DF.summarise(manual_pct: mean(group_pct))
+Kino.DataTable.new(DF.concat_columns(summarised, manual_pct))
+```
+
+```elixir
+# Per-terminal version of the overall statistics; both percentages divide by count + nil_count of automatic_time.
+by_terminal = DF.summarise(DF.group_by(df, :terminal),
+  count: count(automatic_time),
+  nil_count: nil_count(automatic_time),
+  mean: mean(schedule_inaccuracy),
+  std: standard_deviation(schedule_inaccuracy),
+  p25: quantile(schedule_inaccuracy, 0.25),
+  p50: median(schedule_inaccuracy),
+  p75: quantile(schedule_inaccuracy, 0.75),
+  accurate_count: sum(is_accurate),
+  manual_count: sum(manual_accurate)
+)
+by_terminal
+|> DF.mutate(
+  accurate_pct: round(cast(accurate_count, {:u, 32}) / (count + nil_count), 3),
+  manual_pct: round(cast(manual_count, {:u, 32}) / (count + nil_count), 3)
+)
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"offset":5373,"stamp":{"token":"XCP.t5odXogwlI4Fm_roCYgV75lfQ08umaK8iKnSn22jN_JbJBWKEQK4L90jIf3-81fVk2a4Q-87coDSQe2vZnF0w12M-IuJ4EdhxfIVcQ","version":2}} -->

From 58f654c8dbd0f370392774b6612d96a0f44c7727 Mon Sep 17 00:00:00 2001
From: Paul Swartz <paul@paulswartz.net>
Date: Tue, 24 Sep 2024 10:03:34 -0400
Subject: [PATCH 3/5] doc: add LiveBook for initial LR Block Adherence

---
 reports/light_rail_block_adherence.livemd | 100 ++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 reports/light_rail_block_adherence.livemd

diff --git a/reports/light_rail_block_adherence.livemd b/reports/light_rail_block_adherence.livemd
new file mode 100644
index 0000000..3823bed
--- /dev/null
+++ b/reports/light_rail_block_adherence.livemd
@@ -0,0 +1,100 @@
+<!-- livebook:{"file_entries":[{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/paulswartz/Dropbox/1-Projects/MBTA/Light Rail Terminal Predictions/trips-2024.parquet"},"name":"trips-2024.parquet","type":"file"}],"persist_outputs":true} -->
+
+# Block Adherence
+
+```elixir
+Mix.install([
+  {:explorer, "~> 0.9.2"},
+  {:kino, "~> 0.14.1"}
+])
+```
+
+## Data
+
+```
+$ curl -o trips-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/trips.parquet'
+$ curl -JO 'https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-{01-31}-subway-on-time-performance-v1.parquet'
+```
+
+Then add `trips-2024.parquet` as a file for this notebook (sidebar on the left, folder icon, Add file).
+
+```elixir
+require Explorer.DataFrame, as: DF
+alias Explorer.Series, as: S
+
+trips =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> DF.from_parquet!()
+
+blocks =
+  DF.select(trips, [:trip_id, :gtfs_active_date, :gtfs_end_date, :block_id])
+  |> DF.sort_by(desc: gtfs_active_date)
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+
+# |> DF.group_by([:trip_id, :block_id])
+# |> DF.filter(S.size(gtfs_active_date) > 1)
+# |> DF.summarise(gtfs_active_date: min(gtfs_active_date), gtfs_end_date: max(gtfs_end_date))
+# |> DF.ungroup()
+
+df =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> Path.join("../*-subway-on-time-performance-v1.parquet")
+  |> Path.expand()
+  |> Path.wildcard()
+  |> Enum.flat_map(fn path ->
+    case DF.from_parquet(path) do
+      {:ok, df} -> [df]
+      {:error, _} -> []
+    end
+  end)
+  |> DF.concat_rows()
+
+df =
+  df
+  |> DF.filter(S.contains(route_id, "Green-"))
+
+# |> DF.filter(not S.contains(trip_id, "ADDED-"))
+# |> DF.filter(not S.contains(trip_id, "NONREV-"))
+# |> DF.group_by
+
+df =
+  df
+  |> DF.join(blocks, how: :left, on: :trip_id)
+  |> DF.filter(
+    is_nil(gtfs_active_date) or
+      (gtfs_active_date <= service_date and gtfs_end_date >= service_date)
+  )
+  |> DF.filter(travel_time_seconds > 0)
+  |> DF.mutate(timestamp: select(is_nil(move_timestamp), stop_timestamp, move_timestamp))
+  |> DF.group_by([:service_date, :vehicle_id])
+  # |> DF.summarise(block_count: S.size(block_id))
+  |> DF.sort_by(asc: service_date, asc: vehicle_id, asc: timestamp)
+  # |> DF.select([:timestamp, :service_date, :vehicle_id, :trip_id, :block_id])
+  |> DF.summarise(block_count: S.count(block_id))
+  |> DF.mutate(
+    one_block: select(block_count == 1, 1, 0),
+    zero_or_one_block: select(block_count <= 1, 1, 0)
+  )
+  # |> DF.group_by(:service_date)
+  |> DF.summarise(count: S.count(vehicle_id), one_block: S.sum(one_block), zero_or_one_block: S.sum(zero_or_one_block))
+  |> DF.mutate(one_block_pct: 100 * one_block / count, zero_or_one_block_ct: 100 * zero_or_one_block / count)
+
+Kino.DataTable.new(df)
+```
+
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[1 x 5]
+  count u32 [12115]
+  one_block s64 [732]
+  zero_or_one_block s64 [6614]
+  one_block_pct f64 [6.042096574494429]
+  zero_or_one_block_ct f64 [54.59347915806851]
+>
+```
+
+<!-- livebook:{"offset":3039,"stamp":{"token":"XCP.RY4Ej4I4BWqvOd8Fzdctu_DlMp4PWlhHyOadjMuzWk-4UKUtfNsvlpOd8_UlBl8dkmvrUbkdY4NLLejrVLih6o_tiXF5WQ1nN0-mcw","version":2}} -->

From 74b9add694c05fa77ed4cfd40cf1d3c8d4cec070 Mon Sep 17 00:00:00 2001
From: Paul Swartz <paul@paulswartz.net>
Date: Tue, 1 Oct 2024 11:12:51 -0400
Subject: [PATCH 4/5] doc: include Glides trainsheet CSV in block adherence

---
 reports/light_rail_block_adherence.livemd | 350 ++++++++++++++++++++--
 1 file changed, 329 insertions(+), 21 deletions(-)

diff --git a/reports/light_rail_block_adherence.livemd b/reports/light_rail_block_adherence.livemd
index 3823bed..588cb64 100644
--- a/reports/light_rail_block_adherence.livemd
+++ b/reports/light_rail_block_adherence.livemd
@@ -1,4 +1,4 @@
-<!-- livebook:{"file_entries":[{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/paulswartz/Dropbox/1-Projects/MBTA/Light Rail Terminal Predictions/trips-2024.parquet"},"name":"trips-2024.parquet","type":"file"}],"persist_outputs":true} -->
+<!-- livebook:{"file_entries":[{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/paulswartz/Dropbox/0-Inbox/trainsheet-2024-08-01-2024-08-31.csv"},"name":"trainsheet.csv","type":"file"},{"file":{"file_system_id":"local","file_system_type":"local","path":"/Users/paulswartz/Dropbox/1-Projects/MBTA/Light Rail Terminal Predictions/trips-2024.parquet"},"name":"trips-2024.parquet","type":"file"}],"persist_outputs":true} -->
 
 # Block Adherence
 
@@ -11,13 +11,17 @@ Mix.install([
 
 ## Data
 
-```
+```shell
 $ curl -o trips-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/trips.parquet'
+$ curl -o stop-times-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/stop_times.parquet'
+$ curl -o stops-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/stops.parquet'
 $ curl -JO 'https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-{01-31}-subway-on-time-performance-v1.parquet'
 ```
 
 Then add `trips-2024.parquet` as a file for this notebook (sidebar on the left, folder icon, Add file).
 
+You'll also need to download the Trainsheet CSV for 2024-08-01 through 2024-08-31 from Glides, and add it here as `trainsheet.csv`.
+
 ```elixir
 require Explorer.DataFrame, as: DF
 alias Explorer.Series, as: S
@@ -26,17 +30,155 @@ trips =
   "trips-2024.parquet"
   |> Kino.FS.file_path()
   |> DF.from_parquet!()
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+
+stops =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> Path.join("../stops-2024.parquet")
+  |> Path.expand()
+  |> DF.from_parquet!()
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+
+:ok
+```
 
+<!-- livebook:{"output":true} -->
+
+```
+:ok
+```
+
+```elixir
+parent_stations = stops
+|> DF.mutate(parent_station: select(is_nil(parent_station),stop_id,parent_station))
+|> DF.select([:stop_id,:parent_station])
+|> DF.distinct()
+```
+
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[10212 x 2]
+  stop_id string ["9170012", "9070013", "9070010", "9070004", "9070003", ...]
+  parent_station string ["9170012", "9070013", "9070010", "9070004", "9070003", ...]
+>
+```
+
+```elixir
+gtfs_time_to_seconds = fn s ->
+  str = S.split_into(s, ":", ["hour", "minute", "second"])
+  hour = S.field(str, :hour) |> S.cast(:s32)
+  minute = S.field(str, :minute) |> S.cast(:s32)
+  second = S.field(str, :second) |> S.cast(:s32)
+
+  S.add(
+    S.multiply(hour, 3600),
+    S.multiply(minute, 60)
+  )
+  |> S.add(second)
+
+  # S.strptime(s, "%H:%M")
+end
+
+stop_times =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> Path.join("../stop-times-2024.parquet")
+  |> Path.expand()
+  |> DF.from_parquet!()
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+  |> DF.mutate_with(fn df ->
+    [
+      departure_time: gtfs_time_to_seconds.(df[:departure_time]),
+      arrival_time: gtfs_time_to_seconds.(df[:arrival_time])
+    ]
+  end)
+  |> DF.group_by([:gtfs_active_date, :gtfs_end_date, :trip_id])
+  # |> DF.sort_by(asc: stop_sequence)
+  # |> DF.summarise(departure_time: S.first(departure_time), arrival_time: S.last(arrival_time))
+  |> DF.summarise_with(fn df ->
+    argsort = S.argsort(df[:stop_sequence])
+    departure_time = df[:departure_time] |> S.slice(argsort) |> S.first()
+    departure_stop = df[:stop_id] |> S.slice(argsort) |> S.first()
+    arrival_time = df[:arrival_time] |> S.slice(argsort) |> S.last()
+    arrival_stop = df[:stop_id] |> S.slice(argsort) |> S.last()
+
+    [
+      departure_time: departure_time,
+      departure_stop_id: departure_stop,
+      arrival_time: arrival_time,
+      arrival_stop_id: arrival_stop
+    ]
+  end)
+  |> DF.join(parent_stations, on: [departure_stop_id: :stop_id])
+  |> DF.rename(parent_station: :departure_station)
+  |> DF.join(parent_stations, on: [arrival_stop_id: :stop_id])
+  |> DF.rename(parent_station: :arrival_station)
+
+DF.filter(stop_times, trip_id == "62822419")
+```
+
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[1 x 9]
+  gtfs_active_date s32 [20240608]
+  gtfs_end_date s32 [20240904]
+  trip_id string ["62822419"]
+  departure_time s64 [86700]
+  departure_stop_id string ["110"]
+  arrival_time s64 [88440]
+  arrival_stop_id string ["64"]
+  departure_station string ["110"]
+  arrival_station string ["place-nubn"]
+>
+```
+
+```elixir
 blocks =
   DF.select(trips, [:trip_id, :gtfs_active_date, :gtfs_end_date, :block_id])
   |> DF.sort_by(desc: gtfs_active_date)
   |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+  |> DF.join(stop_times, on: :trip_id, how: :left)
+  |> DF.filter(
+    gtfs_active_date_right <= gtfs_active_date and gtfs_end_date_right >= gtfs_end_date
+  )
+  |> DF.discard([:gtfs_active_date_right, :gtfs_end_date_right])
+  |> DF.group_by([:gtfs_active_date, :block_id])
+  |> DF.sort_by(asc: departure_time)
+  |> DF.mutate(block_sequence: S.row_index(trip_id) + 1)
+  |> DF.sort_by(asc: gtfs_active_date, asc: block_id)
 
 # |> DF.group_by([:trip_id, :block_id])
 # |> DF.filter(S.size(gtfs_active_date) > 1)
 # |> DF.summarise(gtfs_active_date: min(gtfs_active_date), gtfs_end_date: max(gtfs_end_date))
 # |> DF.ungroup()
+```
 
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[152897 x 11]
+  Groups: ["gtfs_active_date", "block_id"]
+  trip_id string ["62822243", "62822336", "62822249", "62822344", "62822255", ...]
+  gtfs_active_date s32 [20240831, 20240831, 20240831, 20240831, 20240831, ...]
+  gtfs_end_date s32 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
+  block_id string ["C01-12", "C01-12", "C01-12", "C01-12", "C01-12", ...]
+  departure_time s64 [25320, 27300, 29760, 31800, 34140, ...]
+  departure_stop_id string ["64", "110", "64", "110", "64", ...]
+  arrival_time s64 [27000, 28980, 31440, 33720, 36120, ...]
+  arrival_stop_id string ["110", "64", "110", "64", "110", ...]
+  departure_station string ["place-nubn", "110", "place-nubn", "110", "place-nubn", ...]
+  arrival_station string ["110", "place-nubn", "110", "place-nubn", "110", ...]
+  block_sequence s64 [1, 2, 3, 4, 5, ...]
+>
+```
+
+```elixir
 df =
   "trips-2024.parquet"
   |> Kino.FS.file_path()
@@ -50,10 +192,51 @@ df =
     end
   end)
   |> DF.concat_rows()
+```
+
+<!-- livebook:{"output":true} -->
 
+```
+#Explorer.DataFrame<
+  Polars[1186672 x 27]
+  stop_sequence s16 [320, 310, 40, 40, 660, ...]
+  stop_id string ["70162", "70107", "70051", "70051", "70503", ...]
+  parent_station string ["place-woodl", "place-lake", "place-orhte", "place-orhte", "place-unsqu",
+   ...]
+  move_timestamp s64 [1722499511, nil, 1722501862, nil, 1722501980, ...]
+  stop_timestamp s64 [nil, 1722499796, nil, 1722501886, 1722502138, ...]
+  travel_time_seconds s64 [nil, nil, nil, nil, 158, ...]
+  dwell_time_seconds s64 [nil, nil, nil, nil, nil, ...]
+  headway_trunk_seconds s64 [nil, nil, nil, nil, nil, ...]
+  headway_branch_seconds s64 [nil, nil, nil, nil, nil, ...]
+  service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  route_id string ["Green-D", "Green-B", "Blue", "Blue", "Green-D", ...]
+  direction_id boolean [true, false, false, false, true, ...]
+  start_time s64 [14712, 14402, 17063, 18480, 17460, ...]
+  vehicle_id string ["G-10037", "G-10012", "B-547DC1D2", "B-547DC1D2", "G-10046", ...]
+  branch_route_id string ["Green-D", "Green-B", nil, nil, "Green-D", ...]
+  trunk_route_id string ["Green", "Green", "Blue", "Blue", "Green", ...]
+  stop_count s16 [1, 1, 1, 8, 1, ...]
+  trip_id string ["ADDED-1582000863", "ADDED-1582000862", "NONREV-1580581567", "NONREV-1580581568",
+   "62922273", ...]
+  vehicle_label string ["3844", "3846", "0711", "0711", "3696-3829", ...]
+  vehicle_consist string ["3844", "3846", "0711|0710|0754|0755|0746|0747",
+   "0711|0710|0754|0755|0746|0747", "3696|3829", ...]
+  direction string ["East", "West", "West", "West", "East", ...]
+  direction_destination string ["Union Square", "Boston College", "Bowdoin", "Bowdoin",
+   "Union Square", ...]
+  scheduled_arrival_time s64 [nil, 23340, 18480, 18480, 21060, ...]
+  scheduled_departure_time s64 [nil, 23340, 18480, 18480, 21060, ...]
+  scheduled_travel_time s64 [nil, 180, nil, nil, 300, ...]
+  scheduled_headway_branch s64 [nil, nil, nil, nil, 360, ...]
+  scheduled_headway_trunk s64 [nil, nil, nil, nil, 360, ...]
+>
+```
+
+```elixir
 df =
   df
-  |> DF.filter(S.contains(route_id, "Green-"))
+  |> DF.filter(trunk_route_id=="Green")
 
 # |> DF.filter(not S.contains(trip_id, "ADDED-"))
 # |> DF.filter(not S.contains(trip_id, "NONREV-"))
@@ -66,35 +249,160 @@ df =
     is_nil(gtfs_active_date) or
       (gtfs_active_date <= service_date and gtfs_end_date >= service_date)
   )
-  |> DF.filter(travel_time_seconds > 0)
+  #|> DF.filter(travel_time_seconds > 0)
   |> DF.mutate(timestamp: select(is_nil(move_timestamp), stop_timestamp, move_timestamp))
-  |> DF.group_by([:service_date, :vehicle_id])
-  # |> DF.summarise(block_count: S.size(block_id))
   |> DF.sort_by(asc: service_date, asc: vehicle_id, asc: timestamp)
-  # |> DF.select([:timestamp, :service_date, :vehicle_id, :trip_id, :block_id])
-  |> DF.summarise(block_count: S.count(block_id))
+  #|> DF.distinct([:service_date, :vehicle_id, :trip_id], keep_all: true)
+
+df
+#|> DF.filter()
+|> DF.select([:service_date, :vehicle_id, :timestamp, :trip_id, :block_id, :block_sequence])
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[717260 x 6]
+  service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  vehicle_id string ["G-10001", "G-10001", "G-10002", "G-10002", "G-10002", ...]
+  timestamp s64 [1722516403, 1722516410, 1722501999, 1722502017, 1722502119, ...]
+  trip_id string ["ADDED-1582001076", "ADDED-1582001076", "ADDED-1582000864", "ADDED-1582000864", "ADDED-1582000864", ...]
+  block_id string [nil, nil, nil, nil, nil, ...]
+  block_sequence s64 [nil, nil, nil, nil, nil, ...]
+>
+```
+
+```elixir
+df
+|> DF.group_by([:service_date, :vehicle_id])
+# |> DF.select([:timestamp, :service_date, :vehicle_id, :trip_id, :block_id])
+|> DF.summarise(block_count: S.count(block_id))
+|> DF.mutate(
+  one_block: select(block_count == 1, 1, 0),
+  zero_or_one_block: select(block_count <= 1, 1, 0)
+)
+ |> DF.group_by(:service_date)
+|> DF.summarise(
+  count: S.count(vehicle_id),
+  one_block: S.sum(one_block),
+  zero_or_one_block: S.sum(zero_or_one_block)
+)
+|> DF.mutate(
+  one_block_pct: 100 * one_block / count,
+  zero_or_one_block_ct: 100 * zero_or_one_block / count
+)
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[31 x 6]
+  service_date s64 [20240801, 20240802, 20240803, 20240804, 20240805, ...]
+  count u32 [399, 303, 217, 229, 450, ...]
+  one_block s64 [17, 12, 9, 8, 20, ...]
+  zero_or_one_block s64 [200, 142, 101, 118, 241, ...]
+  one_block_pct f64 [4.260651629072682, 3.9603960396039604, 4.147465437788019, 3.493449781659389, 4.444444444444445, ...]
+  zero_or_one_block_ct f64 [50.12531328320802, 46.864686468646866, 46.54377880184332, 51.52838427947598, 53.55555555555556, ...]
+>
+```
+
+```elixir
+df2 =
+  "trainsheet.csv"
+  |> Kino.FS.file_path()
+  |> DF.from_csv!(dtypes: %{car0: :string, car1: :string})
+  |> DF.mutate(
+    service_year: cast(S.substring(service_date, 1, 4), :s32),
+    service_month: cast(S.substring(service_date, 6, 2), :s32),
+    service_day: cast(S.substring(service_date, 9, 2), :s32)
+  )
   |> DF.mutate(
-    one_block: select(block_count == 1, 1, 0),
-    zero_or_one_block: select(block_count <= 1, 1, 0)
+    service_date: service_day + 100 * service_month + 10000 * service_year,
+    min_car: select(car0 < car1, car0, car1),
+    max_car: select(car0 < car1, car1, car0)
+  )
+  |> DF.mutate(consist: min_car <> "-" <> max_car)
+  |> DF.filter(consist != "-")
+  |> DF.discard([:service_year, :service_month, :service_day])
+  |> DF.join(blocks,
+    how: :left,
+    on: [
+      scheduled_start_station: :departure_station,
+      scheduled_end_station: :arrival_station,
+      scheduled_departure: :departure_time
+    ]
   )
-  # |> DF.group_by(:service_date)
-  |> DF.summarise(count: S.count(vehicle_id), one_block: S.sum(one_block), zero_or_one_block: S.sum(zero_or_one_block))
-  |> DF.mutate(one_block_pct: 100 * one_block / count, zero_or_one_block_ct: 100 * zero_or_one_block / count)
+  |> DF.filter(gtfs_active_date <= service_date and gtfs_end_date >= service_date)
+  |> DF.sort_by(asc: service_date, asc: scheduled_departure, asc: gtfs_active_date)
+  |> DF.distinct([:service_date, :scheduled_trip_id], keep_all: true)
 
-Kino.DataTable.new(df)
+Kino.DataTable.new(df2 |> DF.filter(block_id=="B800-53"))
 ```
 
 <!-- livebook:{"output":true} -->
 
 ```text
 #Explorer.DataFrame<
-  Polars[1 x 5]
-  count u32 [12115]
-  one_block s64 [732]
-  zero_or_one_block s64 [6614]
-  one_block_pct f64 [6.042096574494429]
-  zero_or_one_block_ct f64 [54.59347915806851]
+  Polars[408 x 23]
+  service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  scheduled_trip_id string ["alb34011-esomr-mdftf-0500", "alb34011-mdftf-hsmnl-0512", "alb34011-hsmnl-mdftf-0613", "alb34011-mdftf-hsmnl-0714", "alb34011-hsmnl-mdftf-0819", ...]
+  scheduled_start_station string ["place-esomr", "place-mdftf", "place-hsmnl", "place-mdftf", "place-hsmnl", ...]
+  scheduled_end_station string ["place-mdftf", "place-hsmnl", "place-mdftf", "place-hsmnl", "place-mdftf", ...]
+  scheduled_departure s64 [18000, 18720, 22380, 26040, 29940, ...]
+  scheduled_arrival s64 [18540, 21720, 25320, 29280, 33180, ...]
+  actual_start_station string [nil, nil, nil, nil, nil, ...]
+  actual_end_station string [nil, nil, nil, nil, nil, ...]
+  actual_departure s64 [nil, 18720, nil, 26040, nil, ...]
+  detected_departure s64 [18311, 18748, 21863, 26009, 29058, ...]
+  car0 string ["3663", "3837", "3837", "3833", "3833", ...]
+  car1 string ["3837", "3663", "3663", "3621", "3621", ...]
+  min_car string ["3663", "3663", "3663", "3621", "3621", ...]
+  max_car string ["3837", "3837", "3837", "3833", "3833", ...]
+  consist string ["3663-3837", "3663-3837", "3663-3837", "3621-3833", "3621-3833", ...]
+  trip_id string ["62921905", "62921906", "62921907", "62921908", "62921909", ...]
+  gtfs_active_date s32 [20240608, 20240608, 20240608, 20240608, 20240608, ...]
+  gtfs_end_date s32 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
+  block_id string ["B800-53", "B800-53", "B800-53", "B800-53", "B800-53", ...]
+  departure_stop_id string ["70513", "70512", "70260", "70512", "70260", ...]
+  arrival_time s64 [18540, 21720, 25320, 29280, 33180, ...]
+  arrival_stop_id string ["70511", "70260", "70511", "70260", "70511", ...]
+  block_sequence s64 [1, 2, 3, 4, 5, ...]
+>
+```
+
+```elixir
+
+df2
+|> DF.group_by([:service_date, :block_id])
+|> DF.summarise(car_count: S.n_distinct(consist))
+|> DF.ungroup()
+|> DF.mutate(one_car: select(car_count < 2, 1, 0))
+|> DF.summarise(
+  min: min(car_count),
+  median: median(car_count),
+  max: max(car_count),
+  size: size(car_count),
+  one_car: sum(one_car)
+)
+|> DF.mutate(one_car_pct: 100 * one_car / size)
+```
+
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[1 x 6]
+  min u32 [1]
+  median f64 [6.0]
+  max u32 [14]
+  size u32 [2210]
+  one_car s64 [95]
+  one_car_pct f64 [4.298642533936651]
 >
 ```
 
-<!-- livebook:{"offset":3039,"stamp":{"token":"XCP.RY4Ej4I4BWqvOd8Fzdctu_DlMp4PWlhHyOadjMuzWk-4UKUtfNsvlpOd8_UlBl8dkmvrUbkdY4NLLejrVLih6o_tiXF5WQ1nN0-mcw","version":2}} -->
+<!-- livebook:{"offset":14524,"stamp":{"token":"XCP.dMHsQ2MHe6JP5h3ylPmWUmrEdZFSI4tjHg2GZTjdK6OwdtbT366YHQNnBQZr0aCzX0uCUCHdDcuyqpDPuSp08WqDf8_vWS3qyVeK3g","version":2}} -->

From 70f6ea2e7826cbd33da0050998a712a0f85f97bc Mon Sep 17 00:00:00 2001
From: Paul Swartz <paul@paulswartz.net>
Date: Wed, 9 Oct 2024 17:25:37 -0400
Subject: [PATCH 5/5] doc: improve block adherence matching

---
 reports/light_rail_block_adherence.livemd | 432 +++++++++++++++++++---
 1 file changed, 372 insertions(+), 60 deletions(-)

diff --git a/reports/light_rail_block_adherence.livemd b/reports/light_rail_block_adherence.livemd
index 588cb64..c578f51 100644
--- a/reports/light_rail_block_adherence.livemd
+++ b/reports/light_rail_block_adherence.livemd
@@ -15,6 +15,9 @@ Mix.install([
 $ curl -o trips-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/trips.parquet'
 $ curl -o stop-times-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/stop_times.parquet'
 $ curl -o stops-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/stops.parquet'
+$ curl -o calendar-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/calendar.parquet'
+$ curl -o calendar-dates-2024.parquet 'https://performancedata.mbta.com/lamp/gtfs_archive/2024/calendar_dates.parquet'
+
 $ curl -JO 'https://performancedata.mbta.com/lamp/subway-on-time-performance-v1/2024-08-{01-31}-subway-on-time-performance-v1.parquet'
 ```
 
@@ -40,6 +43,22 @@ stops =
   |> DF.from_parquet!()
   |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
 
+calendar =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> Path.join("../calendar-2024.parquet")
+  |> Path.expand()
+  |> DF.from_parquet!()
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+
+calendar_dates =
+  "trips-2024.parquet"
+  |> Kino.FS.file_path()
+  |> Path.join("../calendar-dates-2024.parquet")
+  |> Path.expand()
+  |> DF.from_parquet!()
+  |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
+
 :ok
 ```
 
@@ -138,19 +157,45 @@ DF.filter(stop_times, trip_id == "62822419")
 ```
 
 ```elixir
+calendar_filtered =
+  calendar
+  |> DF.mutate(
+    start_date: select(start_date > gtfs_active_date, start_date, gtfs_active_date),
+    end_date: select(end_date < gtfs_end_date, end_date, gtfs_end_date)
+  )
+  |> DF.filter(end_date >= start_date)
+  |> DF.discard([:gtfs_active_date, :gtfs_end_date])
+
 blocks =
-  DF.select(trips, [:trip_id, :gtfs_active_date, :gtfs_end_date, :block_id])
+  DF.select(trips, [:trip_id, :service_id, :gtfs_active_date, :gtfs_end_date, :block_id])
   |> DF.sort_by(desc: gtfs_active_date)
   |> DF.filter(gtfs_end_date >= 20_240_801 and gtfs_active_date <= 20_240_831)
   |> DF.join(stop_times, on: :trip_id, how: :left)
-  |> DF.filter(
-    gtfs_active_date_right <= gtfs_active_date and gtfs_end_date_right >= gtfs_end_date
+  # Clamp each row to the overlap of the trip's and the stop_times' feed windows
+  # (later of the two active dates, earlier of the two end dates). Rows whose
+  # windows do not overlap come out with active > end and are dropped just below.
+  |> DF.mutate(
+    gtfs_active_date:
+      select(gtfs_active_date > gtfs_active_date_right, gtfs_active_date, gtfs_active_date_right),
+    gtfs_end_date: select(gtfs_end_date < gtfs_end_date_right, gtfs_end_date, gtfs_end_date_right)
   )
+  |> DF.filter(gtfs_active_date <= gtfs_end_date)
   |> DF.discard([:gtfs_active_date_right, :gtfs_end_date_right])
-  |> DF.group_by([:gtfs_active_date, :block_id])
+  |> DF.join(calendar_filtered, on: :service_id, how: :inner)
+  # Clamp again, this time to the service's start_date..end_date (already limited
+  # to the calendar row's own feed window in calendar_filtered). Afterwards
+  # gtfs_active_date..gtfs_end_date is the span in which the trip, its stop_times
+  # and its service calendar all apply; empty spans are dropped by the filter.
+  |> DF.mutate(
+    gtfs_active_date: select(gtfs_active_date > start_date, gtfs_active_date, start_date),
+    gtfs_end_date: select(gtfs_end_date < end_date, gtfs_end_date, end_date)
+  )
+  |> DF.filter(gtfs_active_date <= gtfs_end_date)
+  # block_sequence: 1-based position by departure_time within (active date, service, block).
+  |> DF.group_by([:gtfs_active_date, :service_id, :block_id])
   |> DF.sort_by(asc: departure_time)
   |> DF.mutate(block_sequence: S.row_index(trip_id) + 1)
-  |> DF.sort_by(asc: gtfs_active_date, asc: block_id)
+  |> DF.sort_by(asc: gtfs_active_date, asc: service_id, asc: block_id)
 
 # |> DF.group_by([:trip_id, :block_id])
 # |> DF.filter(S.size(gtfs_active_date) > 1)
@@ -162,22 +207,112 @@ blocks =
 
 ```
 #Explorer.DataFrame<
-  Polars[152897 x 11]
-  Groups: ["gtfs_active_date", "block_id"]
-  trip_id string ["62822243", "62822336", "62822249", "62822344", "62822255", ...]
-  gtfs_active_date s32 [20240831, 20240831, 20240831, 20240831, 20240831, ...]
-  gtfs_end_date s32 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
-  block_id string ["C01-12", "C01-12", "C01-12", "C01-12", "C01-12", ...]
-  departure_time s64 [25320, 27300, 29760, 31800, 34140, ...]
-  departure_stop_id string ["64", "110", "64", "110", "64", ...]
-  arrival_time s64 [27000, 28980, 31440, 33720, 36120, ...]
-  arrival_stop_id string ["110", "64", "110", "64", "110", ...]
-  departure_station string ["place-nubn", "110", "place-nubn", "110", "place-nubn", ...]
-  arrival_station string ["110", "place-nubn", "110", "place-nubn", "110", ...]
+  Polars[243862 x 21]
+  Groups: ["gtfs_active_date", "service_id", "block_id"]
+  trip_id string ["AirportWonderland-Weekday-c0986-0-00:00:00",
+   "AirportWonderland-Weekday-c0986-1-00:00:00",
+   "AlewifeHarvardViaHolyoke-Weekday-c0986-0-00:00:00",
+   "AlewifeHarvardViaHolyoke-Weekday-c0986-1-00:00:00", "AlewifeKendall-Weekday-c0986-1-00:00:00",
+   ...]
+  service_id string ["canonical", "canonical", "canonical", "canonical", "canonical", ...]
+  gtfs_active_date s64 [20240831, 20240831, 20240831, 20240831, 20240831, ...]
+  gtfs_end_date s64 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
+  block_id string [nil, nil, nil, nil, nil, ...]
+  departure_time s64 [0, 0, 0, 0, 0, ...]
+  departure_stop_id string ["15795", "7096", "110", "141", "9070071", ...]
+  arrival_time s64 [999, 929, 718, 758, 1280, ...]
+  arrival_stop_id string ["7096", "15795", "141", "110", "141", ...]
+  departure_station string ["place-wondl", "place-aport", "110", "place-alfcl", "9070071", ...]
+  arrival_station string ["place-aport", "place-wondl", "place-alfcl", "110", "place-alfcl", ...]
+  monday s64 [0, 0, 0, 0, 0, ...]
+  tuesday s64 [0, 0, 0, 0, 0, ...]
+  wednesday s64 [0, 0, 0, 0, 0, ...]
+  thursday s64 [0, 0, 0, 0, 0, ...]
+  friday s64 [0, 0, 0, 0, 0, ...]
+  saturday s64 [0, 0, 0, 0, 0, ...]
+  sunday s64 [0, 0, 0, 0, 0, ...]
+  start_date s64 [20240831, 20240831, 20240831, 20240831, 20240831, ...]
+  end_date s64 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
   block_sequence s64 [1, 2, 3, 4, 5, ...]
 >
 ```
 
+```elixir
+calendar
+|> DF.mutate(
+  start_date: select(start_date > gtfs_active_date, start_date, gtfs_active_date),
+  end_date: select(end_date < gtfs_end_date, end_date, gtfs_end_date)
+)
+|> DF.filter(end_date >= start_date)
+|> DF.discard([:gtfs_active_date, :gtfs_end_date])
+```
+
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[498 x 10]
+  service_id string ["RTL32024-hmo34011-Weekday-01", "SPRING2024-SOUTHWKD-Weekday-1", "canonical",
+   "BUS32024-hbq34ns1-Weekday-02", "BUS32024-hbt34ns1-Weekday-02", ...]
+  monday s64 [1, 1, 0, 1, 1, ...]
+  tuesday s64 [1, 1, 0, 1, 1, ...]
+  wednesday s64 [1, 1, 0, 1, 1, ...]
+  thursday s64 [1, 1, 0, 1, 1, ...]
+  friday s64 [1, 1, 0, 1, 1, ...]
+  saturday s64 [0, 0, 0, 0, 0, ...]
+  sunday s64 [0, 0, 0, 0, 0, ...]
+  start_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  end_date s64 [20240802, 20240802, 20240802, 20240802, 20240802, ...]
+>
+```
+
+```elixir
+blocks
+|> DF.filter(block_id == "B800-50" and thursday == 1 and block_sequence == 2)
+|> DF.filter(
+  gtfs_active_date <= 20_240_801 and gtfs_end_date >= 20_240_801 and start_date <= 20_240_801 and
+    end_date >= 20_240_801
+)
+|> DF.sort_by(asc: gtfs_active_date)
+
+# trips
+# |> DF.filter(block_id=="B800-50" and gtfs_active_date <= 20240801 and gtfs_end_date>=20240801)
+# |> DF.join(calendar, on: :service_id, how: :left)
+# |> DF.filter(gtfs_active_date_right <= 20240801 and gtfs_end_date_right >= 20240801)
+# |> DF.filter(thursday==1)
+# |> DF.filter(start_date<=20240801 and end_date>=20240801)
+```
+
+<!-- livebook:{"output":true} -->
+
+```
+#Explorer.DataFrame<
+  Polars[1 x 21]
+  Groups: ["gtfs_active_date", "service_id", "block_id"]
+  trip_id string ["62922253"]
+  service_id string ["LRV32024-hlb34011-Weekday-01"]
+  gtfs_active_date s64 [20240801]
+  gtfs_end_date s64 [20240802]
+  block_id string ["B800-50"]
+  departure_time s64 [20580]
+  departure_stop_id string ["70504"]
+  arrival_time s64 [24360]
+  arrival_stop_id string ["70161"]
+  departure_station string ["place-unsqu"]
+  arrival_station string ["place-river"]
+  monday s64 [1]
+  tuesday s64 [1]
+  wednesday s64 [1]
+  thursday s64 [1]
+  friday s64 [1]
+  saturday s64 [0]
+  sunday s64 [0]
+  start_date s64 [20240801]
+  end_date s64 [20240802]
+  block_sequence s64 [2]
+>
+```
+
 ```elixir
 df =
   "trips-2024.parquet"
@@ -236,7 +371,7 @@ df =
 ```elixir
 df =
   df
-  |> DF.filter(trunk_route_id=="Green")
+  |> DF.filter(trunk_route_id == "Green")
 
 # |> DF.filter(not S.contains(trip_id, "ADDED-"))
 # |> DF.filter(not S.contains(trip_id, "NONREV-"))
@@ -249,14 +384,15 @@ df =
     is_nil(gtfs_active_date) or
       (gtfs_active_date <= service_date and gtfs_end_date >= service_date)
   )
-  #|> DF.filter(travel_time_seconds > 0)
+  # |> DF.filter(travel_time_seconds > 0)
   |> DF.mutate(timestamp: select(is_nil(move_timestamp), stop_timestamp, move_timestamp))
   |> DF.sort_by(asc: service_date, asc: vehicle_id, asc: timestamp)
-  #|> DF.distinct([:service_date, :vehicle_id, :trip_id], keep_all: true)
+
+# |> DF.distinct([:service_date, :vehicle_id, :trip_id], keep_all: true)
 
 df
-#|> DF.filter()
-|> DF.select([:service_date, :vehicle_id, :timestamp, :trip_id, :block_id, :block_sequence])
+# |> DF.filter()
+# |> DF.select([:service_date, :vehicle_id, :timestamp, :trip_id, :block_id, :block_sequence])
 |> Kino.DataTable.new()
 ```
 
@@ -264,13 +400,55 @@ df
 
 ```text
 #Explorer.DataFrame<
-  Polars[717260 x 6]
+  Polars[717260 x 48]
+  stop_sequence s16 [310, 320, 310, 320, 330, ...]
+  stop_id string ["70160", "70162", "70160", "70162", "70164", ...]
+  parent_station string ["place-river", "place-woodl", "place-river", "place-woodl", "place-waban", ...]
+  move_timestamp s64 [nil, 1722516410, nil, 1722502017, 1722502119, ...]
+  stop_timestamp s64 [1722516403, 1722516465, 1722501999, 1722502083, 1722502215, ...]
+  travel_time_seconds s64 [nil, 55, nil, 66, 96, ...]
+  dwell_time_seconds s64 [nil, nil, nil, 36, 18, ...]
+  headway_trunk_seconds s64 [397, nil, nil, nil, nil, ...]
+  headway_branch_seconds s64 [397, nil, nil, nil, nil, ...]
   service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  route_id string ["Green-D", "Green-D", "Green-D", "Green-D", "Green-D", ...]
+  direction_id boolean [true, true, true, true, true, ...]
+  start_time s64 [31605, 31605, 17200, 17200, 17200, ...]
   vehicle_id string ["G-10001", "G-10001", "G-10002", "G-10002", "G-10002", ...]
-  timestamp s64 [1722516403, 1722516410, 1722501999, 1722502017, 1722502119, ...]
+  branch_route_id string ["Green-D", "Green-D", "Green-D", "Green-D", "Green-D", ...]
+  trunk_route_id string ["Green", "Green", "Green", "Green", "Green", ...]
+  stop_count s16 [2, 2, 25, 25, 25, ...]
   trip_id string ["ADDED-1582001076", "ADDED-1582001076", "ADDED-1582000864", "ADDED-1582000864", "ADDED-1582000864", ...]
+  vehicle_label string ["3836", "3836", "3835-3698", "3835-3698", "3835-3698", ...]
+  vehicle_consist string ["3836", "3836", "3835|3698", "3835|3698", "3835|3698", ...]
+  direction string ["East", "East", "East", "East", "East", ...]
+  direction_destination string ["Union Square", "Union Square", "Union Square", "Union Square", "Union Square", ...]
+  scheduled_arrival_time s64 [31620, 31680, nil, nil, nil, ...]
+  scheduled_departure_time s64 [31620, 31680, nil, nil, nil, ...]
+  scheduled_travel_time s64 [nil, 60, nil, nil, nil, ...]
+  scheduled_headway_branch s64 [540, 540, nil, nil, nil, ...]
+  scheduled_headway_trunk s64 [540, 540, nil, nil, nil, ...]
+  service_id string [nil, nil, nil, nil, nil, ...]
+  gtfs_active_date s64 [nil, nil, nil, nil, nil, ...]
+  gtfs_end_date s64 [nil, nil, nil, nil, nil, ...]
   block_id string [nil, nil, nil, nil, nil, ...]
+  departure_time s64 [nil, nil, nil, nil, nil, ...]
+  departure_stop_id string [nil, nil, nil, nil, nil, ...]
+  arrival_time s64 [nil, nil, nil, nil, nil, ...]
+  arrival_stop_id string [nil, nil, nil, nil, nil, ...]
+  departure_station string [nil, nil, nil, nil, nil, ...]
+  arrival_station string [nil, nil, nil, nil, nil, ...]
+  monday s64 [nil, nil, nil, nil, nil, ...]
+  tuesday s64 [nil, nil, nil, nil, nil, ...]
+  wednesday s64 [nil, nil, nil, nil, nil, ...]
+  thursday s64 [nil, nil, nil, nil, nil, ...]
+  friday s64 [nil, nil, nil, nil, nil, ...]
+  saturday s64 [nil, nil, nil, nil, nil, ...]
+  sunday s64 [nil, nil, nil, nil, nil, ...]
+  start_date s64 [nil, nil, nil, nil, nil, ...]
+  end_date s64 [nil, nil, nil, nil, nil, ...]
   block_sequence s64 [nil, nil, nil, nil, nil, ...]
+  timestamp s64 [1722516403, 1722516410, 1722501999, 1722502017, 1722502119, ...]
 >
 ```
 
@@ -283,7 +461,7 @@ df
   one_block: select(block_count == 1, 1, 0),
   zero_or_one_block: select(block_count <= 1, 1, 0)
 )
- |> DF.group_by(:service_date)
+|> DF.group_by(:service_date)
 |> DF.summarise(
   count: S.count(vehicle_id),
   one_block: S.sum(one_block),
@@ -318,15 +496,16 @@ df2 =
   |> DF.mutate(
     service_year: cast(S.substring(service_date, 1, 4), :s32),
     service_month: cast(S.substring(service_date, 6, 2), :s32),
-    service_day: cast(S.substring(service_date, 9, 2), :s32)
+    service_day: cast(S.substring(service_date, 9, 2), :s32),
+    service_date: cast(S.substring(service_date, 1, 10), :date)
   )
   |> DF.mutate(
+    service_dow: day_of_week(service_date),
     service_date: service_day + 100 * service_month + 10000 * service_year,
     min_car: select(car0 < car1, car0, car1),
     max_car: select(car0 < car1, car1, car0)
   )
   |> DF.mutate(consist: min_car <> "-" <> max_car)
-  |> DF.filter(consist != "-")
   |> DF.discard([:service_year, :service_month, :service_day])
   |> DF.join(blocks,
     how: :left,
@@ -336,51 +515,93 @@ df2 =
       scheduled_departure: :departure_time
     ]
   )
-  |> DF.filter(gtfs_active_date <= service_date and gtfs_end_date >= service_date)
-  |> DF.sort_by(asc: service_date, asc: scheduled_departure, asc: gtfs_active_date)
-  |> DF.distinct([:service_date, :scheduled_trip_id], keep_all: true)
+  |> DF.filter(
+    gtfs_active_date <= service_date and gtfs_end_date >= service_date and
+      start_date <= service_date and end_date >= service_date
+  )
+  |> DF.filter(
+    (service_dow == 1 and monday == 1) or (service_dow == 2 and tuesday == 1) or
+      (service_dow == 3 and wednesday == 1) or (service_dow == 4 and thursday == 1) or
+      (service_dow == 5 and friday == 1) or
+      (service_dow == 6 and saturday == 1) or
+      (service_dow == 7 and sunday == 1)
+  )
+
+# |> DF.sort_by(asc: service_date, asc: scheduled_departure, asc: gtfs_active_date)
+# |> DF.distinct([:service_date, :scheduled_trip_id], keep_all: true)
 
-Kino.DataTable.new(df2 |> DF.filter(block_id=="B800-53"))
+Kino.DataTable.new(df2 |> DF.filter(service_date == 20_240_805))
 ```
 
 <!-- livebook:{"output":true} -->
 
 ```text
 #Explorer.DataFrame<
-  Polars[408 x 23]
-  service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
-  scheduled_trip_id string ["alb34011-esomr-mdftf-0500", "alb34011-mdftf-hsmnl-0512", "alb34011-hsmnl-mdftf-0613", "alb34011-mdftf-hsmnl-0714", "alb34011-hsmnl-mdftf-0819", ...]
-  scheduled_start_station string ["place-esomr", "place-mdftf", "place-hsmnl", "place-mdftf", "place-hsmnl", ...]
-  scheduled_end_station string ["place-mdftf", "place-hsmnl", "place-mdftf", "place-hsmnl", "place-mdftf", ...]
-  scheduled_departure s64 [18000, 18720, 22380, 26040, 29940, ...]
-  scheduled_arrival s64 [18540, 21720, 25320, 29280, 33180, ...]
+  Polars[1165 x 34]
+  service_date s64 [20240805, 20240805, 20240805, 20240805, 20240805, ...]
+  scheduled_trip_id string ["alb34011-mdftf-hsmnl-0944", "alb34011-mdftf-hsmnl-0951", "alb34011-mdftf-hsmnl-0742", "alb34011-mdftf-hsmnl-1049", "alb34011-river-mdftf-0456", ...]
+  scheduled_start_station string ["place-mdftf", "place-mdftf", "place-mdftf", "place-mdftf", "place-river", ...]
+  scheduled_end_station string ["place-hsmnl", "place-hsmnl", "place-hsmnl", "place-hsmnl", "place-mdftf", ...]
+  scheduled_departure s64 [35040, 35460, 27720, 38940, 17760, ...]
+  scheduled_arrival s64 [38220, 38640, 30960, 42120, 21840, ...]
   actual_start_station string [nil, nil, nil, nil, nil, ...]
   actual_end_station string [nil, nil, nil, nil, nil, ...]
-  actual_departure s64 [nil, 18720, nil, 26040, nil, ...]
-  detected_departure s64 [18311, 18748, 21863, 26009, 29058, ...]
-  car0 string ["3663", "3837", "3837", "3833", "3833", ...]
-  car1 string ["3837", "3663", "3663", "3621", "3621", ...]
-  min_car string ["3663", "3663", "3663", "3621", "3621", ...]
-  max_car string ["3837", "3837", "3837", "3833", "3833", ...]
-  consist string ["3663-3837", "3663-3837", "3663-3837", "3621-3833", "3621-3833", ...]
-  trip_id string ["62921905", "62921906", "62921907", "62921908", "62921909", ...]
-  gtfs_active_date s32 [20240608, 20240608, 20240608, 20240608, 20240608, ...]
-  gtfs_end_date s32 [20240904, 20240904, 20240904, 20240904, 20240904, ...]
-  block_id string ["B800-53", "B800-53", "B800-53", "B800-53", "B800-53", ...]
-  departure_stop_id string ["70513", "70512", "70260", "70512", "70260", ...]
-  arrival_time s64 [18540, 21720, 25320, 29280, 33180, ...]
-  arrival_stop_id string ["70511", "70260", "70511", "70260", "70511", ...]
-  block_sequence s64 [1, 2, 3, 4, 5, ...]
+  actual_departure s64 [35400, 36000, 28020, 39240, 17940, ...]
+  detected_departure s64 [35410, 35776, 28162, 39282, 17980, ...]
+  car0 string ["3642", "3874", "3700", "3844", "3706", ...]
+  car1 string ["3880", "3660", "3887", "3692", "3814", ...]
+  service_dow s8 [1, 1, 1, 1, 1, ...]
+  min_car string ["3642", "3660", "3700", "3692", "3706", ...]
+  max_car string ["3880", "3874", "3887", "3844", "3814", ...]
+  consist string ["3642-3880", "3660-3874", "3700-3887", "3692-3844", "3706-3814", ...]
+  trip_id string ["65164974", "65164991", "65164990", "65165130", "65165293", ...]
+  service_id string ["LRV32024-hlb34bb1-Weekday-01", "LRV32024-hlb34bb1-Weekday-01", "LRV32024-hlb34bb1-Weekday-01", "LRV32024-hlb34bb1-Weekday-01", "LRV32024-hlb34bb1-Weekday-01", ...]
+  gtfs_active_date s64 [20240802, 20240802, 20240802, 20240802, 20240802, ...]
+  gtfs_end_date s64 [20240809, 20240809, 20240809, 20240809, 20240809, ...]
+  block_id string ["B800-56", "B800-57", "B800-57", "B800-65", "B800-33", ...]
+  departure_stop_id string ["70512", "70512", "70512", "70512", "70160", ...]
+  arrival_time s64 [38220, 38640, 30960, 42120, 21840, ...]
+  arrival_stop_id string ["70260", "70260", "70260", "70260", "70511", ...]
+  monday s64 [1, 1, 1, 1, 1, ...]
+  tuesday s64 [1, 1, 1, 1, 1, ...]
+  wednesday s64 [1, 1, 1, 1, 1, ...]
+  thursday s64 [1, 1, 1, 1, 1, ...]
+  friday s64 [1, 1, 1, 1, 1, ...]
+  saturday s64 [0, 0, 0, 0, 0, ...]
+  sunday s64 [0, 0, 0, 0, 0, ...]
+  start_date s64 [20240802, 20240802, 20240802, 20240802, 20240802, ...]
+  end_date s64 [20240809, 20240809, 20240809, 20240809, 20240809, ...]
+  block_sequence s64 [6, 6, 4, 6, 1, ...]
 >
 ```
 
 ```elixir
+df2
+|> DF.filter(consist != "-" and not is_nil(block_id))
+|> DF.group_by(:service_date)
+|> DF.summarise(block_count: S.n_distinct(block_id))
+|> DF.sort_by(asc: service_date)
+|> Kino.DataTable.new()
+```
 
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[31 x 2]
+  service_date s64 [20240801, 20240802, 20240803, 20240804, 20240805, ...]
+  block_count u32 [73, 73, 41, 38, 57, ...]
+>
+```
+
+```elixir
 df2
+|> DF.filter(consist != "-" and not is_nil(block_id))
 |> DF.group_by([:service_date, :block_id])
 |> DF.summarise(car_count: S.n_distinct(consist))
-|> DF.ungroup()
 |> DF.mutate(one_car: select(car_count < 2, 1, 0))
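+# No DF.ungroup/1 needed: the summarise below yields a single overall row (see output),
+# so min/median/max/size are taken across every (service_date, block_id) pair.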
 |> DF.summarise(
   min: min(car_count),
   median: median(car_count),
@@ -389,20 +610,111 @@ df2
   one_car: sum(one_car)
 )
 |> DF.mutate(one_car_pct: 100 * one_car / size)
+|> Kino.DataTable.new()
 ```
 
 <!-- livebook:{"output":true} -->
 
-```
+```text
 #Explorer.DataFrame<
   Polars[1 x 6]
   min u32 [1]
   median f64 [6.0]
-  max u32 [14]
-  size u32 [2210]
-  one_car s64 [95]
-  one_car_pct f64 [4.298642533936651]
+  max u32 [12]
+  size u32 [1936]
+  one_car s64 [37]
+  one_car_pct f64 [1.9111570247933884]
+>
+```
+
+```elixir
+df2
+|> DF.sort_by(asc: service_date, asc: block_id, asc: block_sequence)
+|> DF.group_by([:service_date, :block_id])
+|> DF.filter(block_id == "B800-50" and service_date == 20_240_801)
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[16 x 34]
+  Groups: ["service_date", "block_id"]
+  service_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  scheduled_trip_id string ["alb34011-rsmnl-unsqu-0445", "alb34011-unsqu-river-0543", "alb34011-river-unsqu-0707", "alb34011-unsqu-river-0825", "alb34011-river-unsqu-0944", ...]
+  scheduled_start_station string ["place-rsmnl", "place-unsqu", "place-river", "place-unsqu", "place-river", ...]
+  scheduled_end_station string ["place-unsqu", "place-river", "place-unsqu", "place-river", "place-unsqu", ...]
+  scheduled_departure s64 [17100, 20580, 25620, 30300, 35040, ...]
+  scheduled_arrival s64 [19740, 24360, 29460, 34260, 38820, ...]
+  actual_start_station string [nil, nil, nil, nil, nil, ...]
+  actual_end_station string [nil, nil, nil, nil, nil, ...]
+  actual_departure s64 [nil, nil, nil, 29100, 35160, ...]
+  detected_departure s64 [nil, nil, nil, 29258, nil, ...]
+  car0 string [nil, nil, nil, "3851", "3860", ...]
+  car1 string [nil, nil, nil, "3645", "3601", ...]
+  service_dow s8 [4, 4, 4, 4, 4, ...]
+  min_car string [nil, nil, nil, "3645", "3601", ...]
+  max_car string [nil, nil, nil, "3851", "3860", ...]
+  consist string ["-", "-", "-", "3645-3851", "3601-3860", ...]
+  trip_id string ["62922252", "62922253", "62922254", "62922255", "62922256", ...]
+  service_id string ["LRV32024-hlb34011-Weekday-01", "LRV32024-hlb34011-Weekday-01", "LRV32024-hlb34011-Weekday-01", "LRV32024-hlb34011-Weekday-01", "LRV32024-hlb34011-Weekday-01", ...]
+  gtfs_active_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  gtfs_end_date s64 [20240802, 20240802, 20240802, 20240802, 20240802, ...]
+  block_id string ["B800-50", "B800-50", "B800-50", "B800-50", "B800-50", ...]
+  departure_stop_id string ["70174", "70504", "70160", "70504", "70160", ...]
+  arrival_time s64 [19740, 24360, 29460, 34260, 38820, ...]
+  arrival_stop_id string ["70503", "70161", "70503", "70161", "70503", ...]
+  monday s64 [1, 1, 1, 1, 1, ...]
+  tuesday s64 [1, 1, 1, 1, 1, ...]
+  wednesday s64 [1, 1, 1, 1, 1, ...]
+  thursday s64 [1, 1, 1, 1, 1, ...]
+  friday s64 [1, 1, 1, 1, 1, ...]
+  saturday s64 [0, 0, 0, 0, 0, ...]
+  sunday s64 [0, 0, 0, 0, 0, ...]
+  start_date s64 [20240801, 20240801, 20240801, 20240801, 20240801, ...]
+  end_date s64 [20240802, 20240802, 20240802, 20240802, 20240802, ...]
+  block_sequence s64 [1, 2, 3, 4, 5, ...]
+>
+```
+
+```elixir
+df2
+|> DF.filter(block_sequence > 1)
+|> DF.mutate(previous_block_sequence: block_sequence - 1)
+|> DF.join(df2,
+  on: [:service_date, :block_id, previous_block_sequence: :block_sequence],
+  how: :left
+)
+|> DF.select([
+  :service_date,
+  :block_id,
+  :trip_id,
+  :arrival_time,
+  :consist,
+  :block_sequence,
+  :trip_id_right,
+  :consist_right,
+  :arrival_time_right,
+  :previous_block_sequence
+])
+|> DF.sort_by(asc: service_date, asc: block_id, asc: block_sequence)
+# |> DF.filter(service_date == 20240801 and block_id=="B800-50")
+|> DF.mutate(same_consist?: select(consist != "-" and consist == consist_right, 1, 0))
+|> DF.summarise(count: count(same_consist?), same_consist: sum(same_consist?))
+|> DF.mutate(same_consist_pct: 100 * (same_consist / count))
+|> Kino.DataTable.new()
+```
+
+<!-- livebook:{"output":true} -->
+
+```text
+#Explorer.DataFrame<
+  Polars[1 x 3]
+  count u32 [41129]
+  same_consist s64 [25300]
+  same_consist_pct f64 [61.51377373629313]
 >
 ```
 
-<!-- livebook:{"offset":14524,"stamp":{"token":"XCP.dMHsQ2MHe6JP5h3ylPmWUmrEdZFSI4tjHg2GZTjdK6OwdtbT366YHQNnBQZr0aCzX0uCUCHdDcuyqpDPuSp08WqDf8_vWS3qyVeK3g","version":2}} -->
+<!-- livebook:{"offset":27175,"stamp":{"token":"XCP.1B8-_xK5SJs03JhPsOnr8hnMVQxKBrmpgsFEczHzsFD8nB_IJsKGn5JXz_I2t7mOjm5zTMIoSVbCOkBrYuUq1tFGE8FjChlGjfd0Og","version":2}} -->