From 7aab154b5218c8e7a99c43b19c17e55621f3d41c Mon Sep 17 00:00:00 2001 From: Denis Irinyakov Date: Wed, 17 Jun 2026 20:33:57 +0300 Subject: [PATCH 1/2] Improve local music self-analytics dashboard --- Makefile | 19 +- README.md | 3 +- dashboard/app.py | 1082 ++++++++++++++++++++-------- docs/product_acceptance.md | 2 +- docs/yandex_music_local.md | 8 +- scripts/smoke_compose_local.py | 29 +- scripts/smoke_dashboard_content.py | 47 +- scripts/validate_yamusic_local.py | 16 +- scripts/yamusic_token_help.py | 97 +++ tests/test_yamusic_ingest.py | 20 + 10 files changed, 991 insertions(+), 332 deletions(-) create mode 100644 scripts/yamusic_token_help.py diff --git a/Makefile b/Makefile index 5e85bdf..d7d4c0c 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ VENV_STREAMLIT := $(VENV)/bin/streamlit ENV_RUN := $(VENV_PYTHON) scripts/run_with_dotenv.py DBT_PROFILES_DIR ?= dbt -.PHONY: help setup token-help status ingest ingest-sample preflight dbt-deps dbt-build dashboard dashboard-smoke doctor report snapshot recommendations readiness readiness-real real-gate-smoke product-answers-smoke pages-site acceptance-local acceptance-real compose-smoke-local test up-local compose-check clean-local +.PHONY: help setup token-help status ingest ingest-sample preflight dbt-deps dbt-build dashboard dashboard-smoke doctor report snapshot recommendations readiness readiness-real real-gate-smoke product-answers-smoke pages-site acceptance-local acceptance-real compose-smoke-local compose-smoke-real test up-local compose-check clean-local help: @printf '%s\n' 'Streamify local Yandex Music self-analytics' @@ -25,6 +25,7 @@ help: @printf '%s\n' 'Docker Compose local profile:' @printf '%s\n' ' make up-local' @printf '%s\n' ' make compose-smoke-local' + @printf '%s\n' ' make compose-smoke-real # requires YANDEX_MUSIC_TOKEN' @printf '%s\n' '' @printf '%s\n' 'Useful checks and exports:' @printf '%s\n' ' make raw-contract Validate raw JSONL/manifest contracts' @@ -42,18 +43,7 @@ setup: $(MAKE) dbt-deps token-help: - @printf '%s\n' 'Streamify needs a ready Yandex Music OAuth token in .env:' - @printf '%s\n' ' YANDEX_MUSIC_TOKEN=...' - @printf '%s\n' '' - @printf '%s\n' 'The installed yandex-music client only accepts a token; it does not obtain one.' - @printf '%s\n' 'Use an external Yandex Music OAuth token helper, then paste the token into .env.' - @printf '%s\n' '' - @printf '%s\n' 'Known community helper:' - @printf '%s\n' ' https://github.com/MarshalX/yandex-music-token' - @printf '%s\n' '' - @printf '%s\n' 'After saving .env, run:' - @printf '%s\n' ' make preflight' - @printf '%s\n' ' make acceptance-real' + $(ENV_RUN) -- $(VENV_PYTHON) scripts/yamusic_token_help.py status: $(ENV_RUN) -- $(VENV_PYTHON) -m yamusic_ingest --status @@ -119,6 +109,9 @@ acceptance-real: preflight ingest raw-contract dbt-build doctor report readiness compose-smoke-local: $(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_compose_local.py +compose-smoke-real: + $(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_compose_local.py --use-env-token + test: $(VENV_PYTHON) scripts/validate_dbt_quality.py $(VENV_PYTHON) scripts/validate_yamusic_local.py diff --git a/README.md b/README.md index 10b82b7..1576949 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ make dashboard Local defaults: - command guide: `make help` -- token guide: `make token-help` +- token guide: `make token-help`, which checks `.env`, installed `yandex-music` capabilities, and next steps without printing token values. - raw metadata: `data/raw/yamusic/*.jsonl` - local warehouse: `data/streamify.duckdb` - local configuration: `.env` is loaded by the Python CLI/scripts and by `scripts/run_with_dotenv.py` for Makefile commands, so token and path overrides work without Make parsing token values. @@ -66,6 +66,7 @@ Local defaults: - safety guard: `scripts/check_no_local_sensitive_artifacts.py` keeps root `.env`, Yandex raw data, DuckDB files and local audio out of git. - raw schema contract: `make raw-contract` - Docker Compose smoke: `make compose-smoke-local` +- real-account Docker Compose smoke: `make compose-smoke-real`, after `YANDEX_MUSIC_TOKEN` is set. - one-command container path: `make up-local`, which loads `.env` through `scripts/run_with_dotenv.py` and runs Docker Compose with the `local` profile. It uses real Yandex Music metadata when `YANDEX_MUSIC_TOKEN` is present in `.env`, otherwise it writes deterministic sample metadata. - local reset: `make clean-local` removes generated raw metadata, DuckDB databases, summary/snapshot/recommendations reports, dbt target/logs/packages, and smoke-test artifacts while preserving `.env` and source files. diff --git a/dashboard/app.py b/dashboard/app.py index ca8ec1b..eeb821a 100644 --- a/dashboard/app.py +++ b/dashboard/app.py @@ -1,9 +1,11 @@ from __future__ import annotations +import html import os import sys from pathlib import Path +import altair as alt import duckdb import pandas as pd import streamlit as st @@ -12,9 +14,9 @@ if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) -from yamusic_ingest.config import load_dotenv from dashboard.actions import build_data_next_actions from dashboard.filters import apply_track_filters +from yamusic_ingest.config import load_dotenv load_dotenv(ROOT / ".env") DB_PATH = Path(os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb")) @@ -31,14 +33,32 @@ def safe_float(value: object) -> float: return 0.0 if pd.isna(value) else float(value) +def compact_int(value: object) -> str: + number = safe_int(value) + return f"{number:,}".replace(",", " ") + + def percent_label(value: object) -> str: return f"{safe_float(value) * 100:.1f}%" +def pct_from_whole(part: object, whole: object) -> str: + denominator = safe_float(whole) + if denominator == 0: + return "0.0%" + return f"{safe_float(part) * 100 / denominator:.1f}%" + + def yes_no(value: object) -> str: return "yes" if safe_int(value) else "no" +def escape(value: object) -> str: + if pd.isna(value): + return "" + return html.escape(str(value)) + + @st.cache_data(ttl=30) def query(sql: str) -> pd.DataFrame: with duckdb.connect(str(DB_PATH), read_only=True) as conn: @@ -53,10 +73,313 @@ def require_database() -> bool: return False -st.set_page_config(page_title="Streamify Self-Analytics", page_icon="♪", layout="wide") +def first_record(frame: pd.DataFrame) -> dict[str, object]: + return {} if frame.empty else frame.iloc[0].to_dict() + + +def style_app() -> None: + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) + + +def insight_card(label: str, value: object, note: str, accent: str) -> None: + st.markdown( + f""" +
+
{escape(label)}
+
{escape(value)}
+
{escape(note)}
+
+ """, + unsafe_allow_html=True, + ) + + +def format_percent_column(frame: pd.DataFrame, column: str) -> pd.DataFrame: + result = frame.copy() + if column in result.columns: + result[column] = result[column].map(lambda value: f"{safe_float(value) * 100:.1f}%") + return result + + +def apply_focus_filters( + frame: pd.DataFrame, + selected_genres: list[str], + liked_mode: str, + search: str, + year_range: tuple[int, int], + min_repeat: int, + max_playlist_count: int | None, +) -> pd.DataFrame: + filtered = apply_track_filters(frame, selected_genres, liked_mode, search) + if "release_year" in filtered.columns: + filtered = filtered[ + filtered["release_year"].isna() + | filtered["release_year"].between(year_range[0], year_range[1]) + ] + if "repeat_signal" in filtered.columns and min_repeat > 0: + filtered = filtered[filtered["repeat_signal"] >= min_repeat] + if max_playlist_count is not None and "playlist_count" in filtered.columns: + filtered = filtered[filtered["playlist_count"] <= max_playlist_count] + return filtered + + +def render_track_cards(frame: pd.DataFrame, limit: int = 8) -> None: + if frame.empty: + st.info("No tracks match the current focus.") + return + rows = frame.head(limit).to_dict("records") + for start in range(0, len(rows), 2): + cols = st.columns(2) + for col, item in zip(cols, rows[start : start + 2]): + with col: + playlist_note = f"{safe_int(item.get('playlist_count'))} playlists" + repeat_note = f"repeat {safe_int(item.get('repeat_signal'))}" + genre_note = item.get("genre") or "unknown genre" + insight_card( + item.get("title", "unknown track"), + item.get("artist_display", "unknown artist"), + f"{genre_note} · {playlist_note} · {repeat_note}", + "#0f766e" if safe_int(item.get("playlist_count")) == 0 else "#2563eb", + ) + + +def polish_chart(chart: alt.Chart) -> alt.Chart: + return ( + chart.configure(background="#ffffff") + .configure_view(fill="#ffffff", stroke="#dfe6e1") + .configure_axis( + labelColor="#43504c", + titleColor="#17201f", + gridColor="#e6ece8", + domainColor="#cfd8d2", + tickColor="#cfd8d2", + ) + .configure_legend(labelColor="#17201f", titleColor="#17201f") + .configure_title(color="#17201f", anchor="start") + ) + -st.title("Streamify Self-Analytics") -st.caption("Local Yandex Music metadata analytics. Audio is not downloaded or stored.") +def hbar_chart(frame: pd.DataFrame, x: str, y: str, title: str, color: str = "#0f766e") -> None: + if frame.empty: + st.info("No data for this chart.") + return + chart = ( + alt.Chart(frame) + .mark_bar(cornerRadiusEnd=3, color=color) + .encode( + x=alt.X(f"{x}:Q", title=None), + y=alt.Y(f"{y}:N", sort="-x", title=None), + tooltip=list(frame.columns), + ) + .properties(title=title, height=max(260, min(520, len(frame.index) * 30))) + ) + st.altair_chart(polish_chart(chart), use_container_width=True, theme=None) + + +def source_payload(row: pd.Series) -> dict[str, object]: + return { + "database": str(DB_PATH), + "manifest_source": str(row["manifest_source"]), + "manifest_generated_at": None if pd.isna(row["manifest_generated_at"]) else str(row["manifest_generated_at"]), + "manifest_raw_dir": str(row["manifest_raw_dir"]), + "manifest_json_only": bool(row["manifest_json_only"]), + "adapter": { + "adapter_name": str(row["adapter_name"]), + "adapter_version": str(row["adapter_version"]), + "client_library": str(row["client_library"]), + "client_library_version": None if pd.isna(row["client_library_version"]) else str(row["client_library_version"]), + }, + "ingestion_diagnostics": { + "liked_shortcuts_seen": safe_int(row["diagnostic_liked_shortcuts_seen"]), + "liked_tracks_written": safe_int(row["diagnostic_liked_tracks_written"]), + "liked_shortcuts_fetch_failed": safe_int(row["diagnostic_liked_shortcuts_fetch_failed"]), + "liked_shortcuts_missing_track_id": safe_int(row["diagnostic_liked_shortcuts_missing_track_id"]), + "liked_tracks_duplicate_skipped": safe_int(row["diagnostic_liked_tracks_duplicate_skipped"]), + "liked_albums_seen": safe_int(row["diagnostic_liked_albums_seen"]), + "liked_albums_written": safe_int(row["diagnostic_liked_albums_written"]), + "liked_albums_missing_id": safe_int(row["diagnostic_liked_albums_missing_id"]), + "liked_albums_duplicate_skipped": safe_int(row["diagnostic_liked_albums_duplicate_skipped"]), + "liked_artists_seen": safe_int(row["diagnostic_liked_artists_seen"]), + "liked_artists_written": safe_int(row["diagnostic_liked_artists_written"]), + "liked_artists_missing_id": safe_int(row["diagnostic_liked_artists_missing_id"]), + "liked_artists_duplicate_skipped": safe_int(row["diagnostic_liked_artists_duplicate_skipped"]), + "liked_playlists_seen": safe_int(row["diagnostic_liked_playlists_seen"]), + "liked_playlists_written": safe_int(row["diagnostic_liked_playlists_written"]), + "liked_playlists_missing_id": safe_int(row["diagnostic_liked_playlists_missing_id"]), + "liked_playlists_duplicate_skipped": safe_int(row["diagnostic_liked_playlists_duplicate_skipped"]), + "playlists_seen": safe_int(row["diagnostic_playlists_seen"]), + "playlists_written": safe_int(row["diagnostic_playlists_written"]), + "playlists_missing_id": safe_int(row["diagnostic_playlists_missing_id"]), + "playlist_fetch_fallbacks": safe_int(row["diagnostic_playlist_fetch_fallbacks"]), + "playlist_tracks_seen": safe_int(row["diagnostic_playlist_tracks_seen"]), + "playlist_tracks_written": safe_int(row["diagnostic_playlist_tracks_written"]), + "playlist_tracks_fetch_failed": safe_int(row["diagnostic_playlist_tracks_fetch_failed"]), + "playlist_tracks_missing_track_id": safe_int(row["diagnostic_playlist_tracks_missing_track_id"]), + "playlist_tracks_duplicate_skipped": safe_int(row["diagnostic_playlist_tracks_duplicate_skipped"]), + }, + "raw_counts": { + "tracks": safe_int(row["raw_tracks"]), + "artists": safe_int(row["raw_artists"]), + "albums": safe_int(row["raw_albums"]), + "playlists": safe_int(row["raw_playlists"]), + "playlist_tracks": safe_int(row["raw_playlist_tracks"]), + "user_library_events": safe_int(row["raw_user_library_events"]), + }, + "raw_checksums": { + "tracks": str(row["raw_tracks_sha256"]), + "artists": str(row["raw_artists_sha256"]), + "albums": str(row["raw_albums_sha256"]), + "playlists": str(row["raw_playlists_sha256"]), + "playlist_tracks": str(row["raw_playlist_tracks_sha256"]), + "user_library_events": str(row["raw_user_library_events_sha256"]), + }, + "calculated_at": str(row["calculated_at"]), + "top_artist_concentration": percent_label(row["top_artist_concentration"]), + "top_genre_share": percent_label(row["top_genre_share"]), + "playlist_track_slots": safe_int(row["playlist_track_slots"]), + "playlist_unique_tracks": safe_int(row["playlist_unique_tracks"]), + "busiest_month_events": safe_int(row["busiest_month_events"]), + "max_repeat_signal": safe_int(row["max_repeat_signal"]), + "last_ingested_at": None if pd.isna(row["last_ingested_at"]) else str(row["last_ingested_at"]), + "ingestion_age_hours": safe_int(row["ingestion_age_hours"]), + "stale_ingestion_flag": yes_no(row["stale_ingestion_flag"]), + } + + +st.set_page_config( + page_title="Streamify Self-Analytics", + page_icon="S", + layout="wide", + initial_sidebar_state="expanded", +) +style_app() + +st.markdown( + """ +
+
Local Yandex Music self-analytics
+

Streamify Taste Console

+

Personal metadata lakehouse for taste, rediscovery, playlist quality and data health. Audio is not downloaded or stored.

+
+ """, + unsafe_allow_html=True, +) if not require_database(): st.stop() @@ -75,35 +398,121 @@ def require_database() -> bool: row = profile.iloc[0] has_library_data = safe_int(row["total_tracks"]) > 0 -metric_cols = st.columns(5) -metric_cols[0].metric("Tracks", safe_int(row["total_tracks"])) -metric_cols[1].metric("Liked", safe_int(row["liked_tracks"])) -metric_cols[2].metric("Artists", safe_int(row["artists"])) -metric_cols[3].metric("Playlists", safe_int(row["playlists"])) -metric_cols[4].metric("Hours", safe_float(row["library_hours"])) - -source_cols = st.columns(3) -source_cols[0].metric("Source", str(row["manifest_source"])) -source_cols[1].metric("Raw tracks", safe_int(row["raw_tracks"])) -source_cols[2].metric( - "Manifest generated", - "missing" if pd.isna(row["manifest_generated_at"]) else str(row["manifest_generated_at"])[:19], + +top_artists = query( + """ + select artist_name, track_count, liked_track_count, playlist_appearances + from yamusic_artist_affinity + order by track_count desc, playlist_appearances desc, artist_name + limit 25 + """ +) +top_genres = query( + """ + select genre, track_count, liked_track_count, library_hours, track_share + from yamusic_genre_profile + order by track_count desc, genre + limit 20 + """ ) -st.caption( - f"Ingestion adapter: {row['adapter_name']} {row['adapter_version']} " - f"using {row['client_library']} {'' if pd.isna(row['client_library_version']) else row['client_library_version']}" +periods = query( + """ + select activity_month, event_count, liked_events, playlist_events, active_tracks, active_artists, active_genres + from yamusic_period_activity + order by activity_month + """ +) +track_signals_all = query( + """ + select + title, + artist_display, + album_title, + genre, + release_year, + liked, + playlist_slots, + playlist_count, + event_count, + repeat_signal, + underrated_flag, + first_event_ts, + last_event_ts + from yamusic_track_signals + order by repeat_signal desc, playlist_count desc, title + limit 5000 + """ +) +playlist_signals = query( + """ + select playlist_title, actual_track_count, unique_track_count, uniqueness_ratio, max_overlap, overlapped_track_mentions, underrated_playlist_flag + from yamusic_playlist_signals + order by underrated_playlist_flag desc, uniqueness_ratio desc, actual_track_count desc, playlist_title + """ +) +playlist_overlap = query( + """ + select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap + from yamusic_playlist_overlap + order by jaccard_overlap desc, overlap_track_count desc, playlist_a_title, playlist_b_title + limit 50 + """ +) +genre_periods = query( + """ + with ranked_genres as ( + select genre + from yamusic_genre_profile + order by track_count desc, genre + limit 10 + ) + select + gp.activity_month, + gp.genre, + gp.event_count, + gp.event_share_in_month + from yamusic_genre_periods gp + join ranked_genres rg using (genre) + order by gp.activity_month, gp.genre + """ +) +release_eras = query( + """ + select + case + when release_year is null then 'unknown' + when release_year < 1980 then '<1980' + when release_year < 1990 then '1980s' + when release_year < 2000 then '1990s' + when release_year < 2010 then '2000s' + when release_year < 2020 then '2010s' + else '2020s' + end as era, + count(*) as track_count, + sum(case when liked then 1 else 0 end) as liked_track_count, + round(sum(duration_seconds) / 3600.0, 1) as library_hours + from yamusic_dim_tracks + group by 1 + order by + case era + when '<1980' then 1 + when '1980s' then 2 + when '1990s' then 3 + when '2000s' then 4 + when '2010s' then 5 + when '2020s' then 6 + else 7 + end + """ ) -signal_cols = st.columns(5) -signal_cols[0].metric("Known genres", safe_int(row["known_genres"])) -signal_cols[1].metric("Active months", safe_int(row["active_months"])) -signal_cols[2].metric("Underrated tracks", safe_int(row["underrated_tracks"])) -signal_cols[3].metric("Underrated playlists", safe_int(row["underrated_playlists"])) -signal_cols[4].metric("Top artist concentration", percent_label(row["top_artist_concentration"])) - -if not has_library_data: - st.warning("No Yandex Music library metadata was returned for this run.") - st.code("make ingest\nmake dbt-build", language="bash") +top_artist = first_record(top_artists) +top_genre = first_record(top_genres) +top_overlap = first_record(playlist_overlap) +standout_playlist = first_record(playlist_signals[playlist_signals["underrated_playlist_flag"] == 1]) +rediscovery_count = safe_int(row["underrated_tracks"]) +liked_share = pct_from_whole(row["liked_tracks"], row["total_tracks"]) +source_is_real = str(row["manifest_source"]) == "yandex_music" genre_options = query( """ @@ -112,217 +521,374 @@ def require_database() -> bool: order by genre """ )["genre"].tolist() +year_values = track_signals_all["release_year"].dropna().astype(int) +min_year = int(year_values.min()) if not year_values.empty else 1960 +max_year = int(year_values.max()) if not year_values.empty else 2026 +max_playlist_seen = safe_int(track_signals_all["playlist_count"].max() if not track_signals_all.empty else 0) + +st.sidebar.header("Focus controls") +focus_preset = st.sidebar.radio( + "Quick lens", + ["Full library", "Liked rediscovery", "Repeat signals", "Playlist repair", "Recent era"], + help="Presets tune the controls below; change any field after choosing a lens.", +) +default_liked = "All" +default_min_repeat = 0 +default_max_playlist = max_playlist_seen +default_years = (min_year, max_year) +if focus_preset == "Liked rediscovery": + default_liked = "Liked" + default_max_playlist = 0 +elif focus_preset == "Repeat signals": + default_min_repeat = min(2, safe_int(row["max_repeat_signal"])) +elif focus_preset == "Playlist repair": + default_max_playlist = min(1, max_playlist_seen) +elif focus_preset == "Recent era": + default_years = (max(min_year, 2020), max_year) + +selected_genres = st.sidebar.multiselect( + "Genres", + genre_options, + default=[], + placeholder="All genres", + help="Leave empty to keep every genre in focus.", +) +liked_mode = st.sidebar.selectbox("Liked state", ["All", "Liked", "Not liked"], index=["All", "Liked", "Not liked"].index(default_liked)) +track_search = st.sidebar.text_input("Search track, artist, album").strip().lower() +year_range = st.sidebar.slider("Release years", min_year, max_year, default_years) +min_repeat = st.sidebar.slider("Minimum repeat signal", 0, safe_int(row["max_repeat_signal"]), default_min_repeat) +max_playlist_count = st.sidebar.slider("Maximum playlist coverage", 0, max_playlist_seen, default_max_playlist) +filtered_tracks = apply_focus_filters( + track_signals_all, + selected_genres, + liked_mode, + track_search, + year_range, + min_repeat, + max_playlist_count, +) +st.sidebar.metric("Tracks in focus", compact_int(len(filtered_tracks.index))) +if not filtered_tracks.empty: + st.sidebar.metric("Artists in focus", compact_int(filtered_tracks["artist_display"].nunique())) + st.sidebar.metric("Genres in focus", compact_int(filtered_tracks["genre"].fillna("unknown").nunique())) +st.sidebar.caption("The focus controls drive discovery, repeat, artist and explorer views. Top profile cards stay anchored to the complete build.") + +cols = st.columns(2) +with cols[0]: + insight_card( + "Taste spread", + f"{compact_int(row['artists'])} artists", + f"Top artist is only {percent_label(row['top_artist_concentration'])} of the library; this is a broad catalog, not a single-artist archive.", + "#0f766e", + ) +with cols[1]: + insight_card( + "Main genre", + top_genre.get("genre", "unknown"), + f"{safe_int(top_genre.get('track_count'))} tracks, {safe_float(top_genre.get('track_share')) * 100:.1f}% of known library weight.", + "#a16207", + ) +cols = st.columns(2) +with cols[0]: + insight_card( + "Rediscovery backlog", + f"{compact_int(rediscovery_count)} tracks", + f"{pct_from_whole(rediscovery_count, row['liked_tracks'])} of liked tracks are lightly playlisted or not playlisted.", + "#c2412d", + ) +with cols[1]: + overlap_text = "No material overlap detected." + if top_overlap: + overlap_text = ( + f"{top_overlap.get('playlist_a_title')} and {top_overlap.get('playlist_b_title')}: " + f"{safe_float(top_overlap.get('jaccard_overlap')) * 100:.1f}% overlap." + ) + insight_card("Playlist overlap", f"{compact_int(row['playlists'])} playlists", overlap_text, "#2563eb") + +source_label = "Yandex Music" if source_is_real else str(row["manifest_source"]) +metric_cols = st.columns(3) +metric_cols[0].metric("Tracks", compact_int(row["total_tracks"]), f"{liked_share} liked") +metric_cols[1].metric("Artists", compact_int(row["artists"]), f"{compact_int(row['known_genres'])} genres") +metric_cols[2].metric("Playlists", compact_int(row["playlists"]), f"{compact_int(row['playlist_track_slots'])} slots") +metric_cols = st.columns(3) +metric_cols[0].metric("Library hours", f"{safe_float(row['library_hours']):.1f}", "metadata duration") +metric_cols[1].metric("Active months", compact_int(row["active_months"]), f"peak {compact_int(row['busiest_month_events'])} events") +metric_cols[2].metric("Source", source_label, "real account" if source_is_real else "sample") -st.sidebar.header("Filters") -selected_genres = st.sidebar.multiselect("Genres", genre_options, default=genre_options) -liked_mode = st.sidebar.selectbox("Liked", ["All", "Liked", "Not liked"]) -track_search = st.sidebar.text_input("Search").strip().lower() +if not has_library_data: + st.warning("No Yandex Music library metadata was returned for this run.") + st.code("make ingest\nmake dbt-build", language="bash") -tab_overview, tab_periods, tab_artists, tab_genres, tab_playlists, tab_tracks, tab_actions, tab_quality = st.tabs( - ["Overview", "Periods", "Artists", "Genres", "Playlists", "Tracks", "Actions", "Data Quality"] +tab_story, tab_taste, tab_mix, tab_discovery, tab_playlists, tab_tracks, tab_actions, tab_quality = st.tabs( + ["Story", "Taste Map", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"] ) -with tab_overview: - tracks = query( - """ - select title, artist_display, album_title, genre, liked, duration_seconds - from yamusic_dim_tracks - order by liked desc, title - limit 5000 - """ +with tab_story: + st.subheader("What stands out") + st.markdown( + f""" + Top artist: {escape(top_artist.get('artist_name', 'unknown'))} + Top genre: {escape(top_genre.get('genre', 'unknown'))} + Known genres: {compact_int(row['known_genres'])} + Max repeat signal: {compact_int(row['max_repeat_signal'])} + Data source: {escape(row['manifest_source'])} + Focus: {compact_int(len(filtered_tracks.index))} tracks + """, + unsafe_allow_html=True, ) - tracks = apply_track_filters(tracks, selected_genres, liked_mode, track_search) - st.subheader("Library snapshot") - st.metric("Filtered tracks", len(tracks.index)) - st.dataframe(tracks, use_container_width=True, hide_index=True) -with tab_periods: - periods = query( - """ - select activity_month, event_count, liked_events, playlist_events, active_tracks, active_artists, active_genres - from yamusic_period_activity - order by activity_month - """ - ) - st.subheader("Activity periods") - if not periods.empty: - chart_data = periods.set_index("activity_month")[["event_count", "active_tracks", "active_artists"]] - st.line_chart(chart_data) - st.dataframe(periods, use_container_width=True, hide_index=True) - genre_periods = query( - """ - select activity_month, genre, event_count, active_tracks, event_share_in_month - from yamusic_genre_periods - order by activity_month, event_share_in_month desc, genre - """ - ) - st.subheader("Genre shifts") - if not genre_periods.empty: - genre_shift_chart = genre_periods.pivot( - index="activity_month", columns="genre", values="event_share_in_month" - ).fillna(0) - st.line_chart(genre_shift_chart) - genre_periods["event_share_in_month"] = genre_periods["event_share_in_month"].map(lambda value: f"{value * 100:.1f}%") - st.dataframe(genre_periods, use_container_width=True, hide_index=True) - -with tab_artists: - artists = query( + left, right = st.columns([1.25, 1]) + with left: + st.subheader("Activity timeline") + st.markdown("
Events are metadata events from liked tracks and playlist membership.
", unsafe_allow_html=True) + if not periods.empty: + period_long = periods.melt( + id_vars=["activity_month"], + value_vars=["event_count", "active_tracks", "active_artists"], + var_name="signal", + value_name="value", + ) + timeline = ( + alt.Chart(period_long) + .mark_line(point=True) + .encode( + x=alt.X("activity_month:T", title=None), + y=alt.Y("value:Q", title=None), + color=alt.Color("signal:N", title=None), + tooltip=["activity_month:T", "signal:N", "value:Q"], + ) + .properties(height=320) + ) + st.altair_chart(polish_chart(timeline), use_container_width=True, theme=None) + with right: + st.subheader("Genre fingerprint") + st.markdown("
Ranked by track count, with liked coverage retained for comparison.
", unsafe_allow_html=True) + if not top_genres.empty: + genre_long = top_genres.head(10).melt( + id_vars=["genre"], + value_vars=["track_count", "liked_track_count"], + var_name="measure", + value_name="tracks", + ) + chart = ( + alt.Chart(genre_long) + .mark_bar(cornerRadiusEnd=3) + .encode( + x=alt.X("tracks:Q", title=None), + y=alt.Y("genre:N", sort="-x", title=None), + color=alt.Color("measure:N", title=None, scale=alt.Scale(range=["#0f766e", "#a16207"])), + tooltip=["genre:N", "measure:N", "tracks:Q"], + ) + .properties(height=320) + ) + st.altair_chart(polish_chart(chart), use_container_width=True, theme=None) + + with st.expander("Audit rows for Story"): + display = top_genres.copy() + display["track_share"] = display["track_share"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(display.head(12), use_container_width=True, hide_index=True) + st.dataframe(periods.sort_values("activity_month", ascending=False), use_container_width=True, hide_index=True) + +with tab_taste: + st.subheader("Artist gravity") + st.markdown("
Artists with more tracks are not necessarily the most liked; the scatter shows breadth versus affinity.
", unsafe_allow_html=True) + artist_map = query( """ select artist_name, track_count, liked_track_count, playlist_appearances from yamusic_artist_affinity - order by track_count desc, playlist_appearances desc, artist_name - limit 30 + order by track_count desc, liked_track_count desc, artist_name + limit 150 """ ) - st.subheader("Artist affinity") - if not artists.empty: - top_artist = artists.iloc[0] - st.caption( - f"Top artist: {top_artist['artist_name']} with {safe_int(top_artist['track_count'])} tracks " - f"and {safe_int(top_artist['liked_track_count'])} liked tracks." + if not artist_map.empty: + artist_map["affinity_rate"] = artist_map["liked_track_count"] / artist_map["track_count"].clip(lower=1) + artist_chart = ( + alt.Chart(artist_map) + .mark_circle(opacity=0.72, color="#0f766e") + .encode( + x=alt.X("track_count:Q", title="tracks in library"), + y=alt.Y("liked_track_count:Q", title="liked tracks"), + size=alt.Size("playlist_appearances:Q", title="playlist appearances", scale=alt.Scale(range=[40, 900])), + tooltip=["artist_name:N", "track_count:Q", "liked_track_count:Q", "playlist_appearances:Q"], + ) + .properties(height=420) ) - st.bar_chart(artists.set_index("artist_name")["track_count"]) - st.dataframe(artists, use_container_width=True, hide_index=True) + st.altair_chart(polish_chart(artist_chart), use_container_width=True, theme=None) + fan_col, breadth_col = st.columns(2) + with fan_col: + st.subheader("High-affinity artists") + high_affinity = artist_map[artist_map["track_count"] >= 3].sort_values( + ["affinity_rate", "liked_track_count", "track_count"], + ascending=False, + ).head(8) + hbar_chart(high_affinity, "affinity_rate", "artist_name", "Liked share among artists with 3+ tracks", "#a16207") + with breadth_col: + st.subheader("Catalog anchors") + hbar_chart(artist_map.head(8), "track_count", "artist_name", "Largest artist footprints", "#2563eb") -with tab_genres: - genres = query( - """ - select genre, track_count, liked_track_count, library_hours, track_share - from yamusic_genre_profile - order by track_count desc, genre - """ - ) st.subheader("Genre diversity") - if not genres.empty: - st.bar_chart(genres.set_index("genre")["track_count"]) - genres["track_share"] = genres["track_share"].map(lambda value: f"{value * 100:.1f}%") - st.dataframe(genres, use_container_width=True, hide_index=True) - genre_periods = query( - """ - select activity_month, genre, event_share_in_month, event_count, active_tracks - from yamusic_genre_periods - order by activity_month desc, event_share_in_month desc, genre - """ - ) - st.subheader("Genre shifts by month") + if not top_genres.empty: + genre_bubble = ( + alt.Chart(top_genres) + .mark_circle(opacity=0.75, color="#c2412d") + .encode( + x=alt.X("track_count:Q", title="tracks"), + y=alt.Y("liked_track_count:Q", title="liked tracks"), + size=alt.Size("library_hours:Q", title="library hours", scale=alt.Scale(range=[80, 1200])), + tooltip=["genre:N", "track_count:Q", "liked_track_count:Q", "library_hours:Q", alt.Tooltip("track_share:Q", format=".1%")], + ) + .properties(height=360) + ) + st.altair_chart(polish_chart(genre_bubble), use_container_width=True, theme=None) + with st.expander("Artist and genre data"): + genre_table = top_genres.copy() + if not genre_table.empty: + genre_table["track_share"] = genre_table["track_share"].map(lambda value: f"{value * 100:.1f}%") + genre_table["library_hours"] = genre_table["library_hours"].map(lambda value: f"{value:.1f}") + st.dataframe(artist_map.head(50), use_container_width=True, hide_index=True) + st.dataframe(genre_table, use_container_width=True, hide_index=True) + +with tab_mix: + st.subheader("Genre heatmap") + st.markdown("
A Wrapped-style view of when genres entered the library metadata stream.
", unsafe_allow_html=True) if not genre_periods.empty: - genre_periods["event_share_in_month"] = genre_periods["event_share_in_month"].map(lambda value: f"{value * 100:.1f}%") - st.dataframe(genre_periods, use_container_width=True, hide_index=True) + heatmap = ( + alt.Chart(genre_periods) + .mark_rect() + .encode( + x=alt.X("yearmonth(activity_month):O", title=None), + y=alt.Y("genre:N", title=None), + color=alt.Color("event_count:Q", title="events", scale=alt.Scale(scheme="tealblues")), + tooltip=["activity_month:T", "genre:N", "event_count:Q", alt.Tooltip("event_share_in_month:Q", format=".1%")], + ) + .properties(height=360) + ) + st.altair_chart(polish_chart(heatmap), use_container_width=True, theme=None) + + left, right = st.columns(2) + with left: + st.subheader("Release-era mix") + era_long = release_eras.melt( + id_vars=["era"], + value_vars=["track_count", "liked_track_count"], + var_name="measure", + value_name="tracks", + ) + era_chart = ( + alt.Chart(era_long) + .mark_bar(cornerRadiusEnd=3) + .encode( + x=alt.X("tracks:Q", title=None), + y=alt.Y("era:N", sort=["<1980", "1980s", "1990s", "2000s", "2010s", "2020s", "unknown"], title=None), + color=alt.Color("measure:N", title=None, scale=alt.Scale(range=["#2563eb", "#0f766e"])), + tooltip=["era:N", "measure:N", "tracks:Q"], + ) + .properties(height=300) + ) + st.altair_chart(polish_chart(era_chart), use_container_width=True, theme=None) + with right: + st.subheader("Focus genre mix") + focus_genres = ( + filtered_tracks.assign(genre=filtered_tracks["genre"].fillna("unknown")) + .groupby("genre", as_index=False) + .agg(track_count=("title", "count"), repeat_signal=("repeat_signal", "sum")) + .sort_values("track_count", ascending=False) + .head(10) + ) + hbar_chart(focus_genres, "track_count", "genre", "Tracks in the current focus", "#0f766e") -with tab_playlists: - playlists = query( - """ - select playlist_title, actual_track_count, unique_track_count, declared_track_count - from yamusic_dim_playlists - order by actual_track_count desc, playlist_title - """ - ) - st.subheader("Playlist coverage") - st.dataframe(playlists, use_container_width=True, hide_index=True) - playlist_signals = query( - """ - select playlist_title, uniqueness_ratio, max_overlap, overlapped_track_mentions, underrated_playlist_flag - from yamusic_playlist_signals - order by underrated_playlist_flag desc, uniqueness_ratio desc, playlist_title - """ +with tab_discovery: + st.subheader("Rediscovery queue") + st.markdown( + "
Liked tracks with low playlist coverage are good candidates for resurfacing or playlist repair.
", + unsafe_allow_html=True, ) - st.subheader("Underrated playlist signals") - if not playlist_signals.empty: - playlist_signals["uniqueness_ratio"] = playlist_signals["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") - playlist_signals["max_overlap"] = playlist_signals["max_overlap"].map(lambda value: f"{value * 100:.1f}%") - playlist_signals["underrated_playlist_flag"] = playlist_signals["underrated_playlist_flag"].map(lambda value: "yes" if value else "no") - st.dataframe(playlist_signals, use_container_width=True, hide_index=True) - overlap = query( - """ - select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap - from yamusic_playlist_overlap - order by overlap_track_count desc, jaccard_overlap desc - limit 50 - """ + rediscovery = filtered_tracks[filtered_tracks["underrated_flag"] == 1].sort_values( + ["playlist_count", "repeat_signal", "title"], + ascending=[True, False, True], ) + c1, c2, c3 = st.columns(3) + c1.metric("Filtered rediscovery tracks", compact_int(len(rediscovery.index))) + c2.metric("Zero-playlist liked tracks", compact_int((rediscovery["playlist_count"] == 0).sum() if not rediscovery.empty else 0)) + c3.metric("Top repeat in queue", compact_int(rediscovery["repeat_signal"].max() if not rediscovery.empty else 0)) + render_track_cards(rediscovery, limit=8) + + st.subheader("Repeat signals") + repeats = filtered_tracks.sort_values(["repeat_signal", "playlist_count", "event_count"], ascending=False).head(40) + if not repeats.empty: + repeat_chart = repeats.head(15).copy() + repeat_chart["track"] = repeat_chart["title"] + " · " + repeat_chart["artist_display"] + hbar_chart(repeat_chart, "repeat_signal", "track", "Repeat signal leaderboard", "#c2412d") + with st.expander("Rediscovery and repeat rows"): + st.dataframe( + rediscovery[["title", "artist_display", "genre", "playlist_count", "event_count", "repeat_signal"]].head(250), + use_container_width=True, + hide_index=True, + ) + st.dataframe( + repeats[["title", "artist_display", "genre", "liked", "playlist_count", "event_count", "repeat_signal"]], + use_container_width=True, + hide_index=True, + ) + +with tab_playlists: + st.subheader("Playlist health") + st.markdown("
High uniqueness plus low overlap suggests a playlist is a distinct listening surface.
", unsafe_allow_html=True) + playlist_viz = playlist_signals.copy() + if not playlist_viz.empty: + playlist_viz["health_score"] = playlist_viz["uniqueness_ratio"] * (1 - playlist_viz["max_overlap"]) + playlist_viz["status"] = playlist_viz["underrated_playlist_flag"].map(lambda value: "standout" if value else "normal") + playlist_chart = ( + alt.Chart(playlist_viz) + .mark_circle(opacity=0.78) + .encode( + x=alt.X("uniqueness_ratio:Q", title="uniqueness", axis=alt.Axis(format="%")), + y=alt.Y("max_overlap:Q", title="max overlap", axis=alt.Axis(format="%")), + size=alt.Size("actual_track_count:Q", title="tracks", scale=alt.Scale(range=[80, 1100])), + color=alt.Color("status:N", title=None, scale=alt.Scale(range=["#0f766e", "#c2412d"])), + tooltip=["playlist_title:N", "actual_track_count:Q", alt.Tooltip("uniqueness_ratio:Q", format=".1%"), alt.Tooltip("max_overlap:Q", format=".1%")], + ) + .properties(height=420) + ) + st.altair_chart(polish_chart(playlist_chart), use_container_width=True, theme=None) + standout = playlist_viz.sort_values(["health_score", "actual_track_count"], ascending=False).head(6) + hbar_chart(standout, "health_score", "playlist_title", "Most distinct playlist surfaces", "#0f766e") + st.subheader("Playlist overlap") + overlap = playlist_overlap.copy() if not overlap.empty: - overlap["jaccard_overlap"] = overlap["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") - st.dataframe(overlap, use_container_width=True, hide_index=True) + overlap["pair"] = overlap["playlist_a_title"] + " / " + overlap["playlist_b_title"] + hbar_chart(overlap.head(15), "jaccard_overlap", "pair", "Potential cleanup pairs", "#a16207") + with st.expander("Playlist rows"): + playlist_table = playlist_signals.copy() + playlist_table["uniqueness_ratio"] = playlist_table["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") + playlist_table["max_overlap"] = playlist_table["max_overlap"].map(lambda value: f"{value * 100:.1f}%") + playlist_table["underrated_playlist_flag"] = playlist_table["underrated_playlist_flag"].map(lambda value: "yes" if value else "no") + if not overlap.empty: + overlap["jaccard_overlap"] = overlap["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(playlist_table, use_container_width=True, hide_index=True) + st.dataframe(overlap.drop(columns=["pair"], errors="ignore"), use_container_width=True, hide_index=True) with tab_tracks: - track_signals = query( - """ - select - title, - artist_display, - genre, - liked, - playlist_count, - event_count, - repeat_signal, - underrated_flag, - first_event_ts, - last_event_ts - from yamusic_track_signals - order by underrated_flag desc, repeat_signal desc, title - limit 5000 - """ - ) - track_signals = apply_track_filters(track_signals, selected_genres, liked_mode, track_search) - st.subheader("Repeated and underrated tracks") - st.metric("Filtered track signals", len(track_signals.index)) - if not track_signals.empty: - top_repeat = track_signals.sort_values(["repeat_signal", "playlist_count"], ascending=False).iloc[0] - st.caption( - f"Highest repeat signal: {top_repeat['title']} by {top_repeat['artist_display']} " - f"with score {safe_int(top_repeat['repeat_signal'])}." + st.subheader("Explorer") + st.markdown("
A visual browse surface for the active focus. Tables stay collapsed for exact lookup.
", unsafe_allow_html=True) + e1, e2, e3 = st.columns(3) + e1.metric("Tracks in focus", compact_int(len(filtered_tracks.index))) + e2.metric("Liked in focus", compact_int(filtered_tracks["liked"].sum() if not filtered_tracks.empty else 0)) + e3.metric("Zero-playlist", compact_int((filtered_tracks["playlist_count"] == 0).sum() if not filtered_tracks.empty else 0)) + render_track_cards(filtered_tracks.sort_values(["liked", "repeat_signal", "playlist_count"], ascending=[False, False, True]), limit=12) + with st.expander("Exact track lookup"): + st.dataframe( + filtered_tracks[["title", "artist_display", "album_title", "genre", "release_year", "liked", "playlist_count", "repeat_signal"]], + use_container_width=True, + hide_index=True, ) - track_signals["liked"] = track_signals["liked"].map(lambda value: "yes" if value else "no") - track_signals["underrated_flag"] = track_signals["underrated_flag"].map(lambda value: "yes" if value else "no") - st.dataframe(track_signals, use_container_width=True, hide_index=True) with tab_actions: st.subheader("Next actions") - action_profile = row.to_dict() - for action in build_data_next_actions(action_profile): + for action in build_data_next_actions(row.to_dict()): st.write(f"- {action}") - rediscovery = query( - """ - select title, artist_display, genre, playlist_slots, playlist_count - from yamusic_track_signals - where underrated_flag = true - order by playlist_slots asc, playlist_count asc, title - limit 25 - """ - ) - st.subheader("Rediscovery queue") - st.dataframe(rediscovery, use_container_width=True, hide_index=True) - - cleanup = query( - """ - select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap - from yamusic_playlist_overlap - order by jaccard_overlap desc, overlap_track_count desc, playlist_a_title, playlist_b_title - limit 25 - """ - ) - if not cleanup.empty: - cleanup["jaccard_overlap"] = cleanup["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") - st.subheader("Playlist cleanup candidates") - st.dataframe(cleanup, use_container_width=True, hide_index=True) - - standout_playlists = query( - """ - select playlist_title, actual_track_count, unique_track_count, uniqueness_ratio, max_overlap - from yamusic_playlist_signals - where underrated_playlist_flag = true - order by uniqueness_ratio desc, actual_track_count desc, playlist_title - limit 25 - """ - ) - if not standout_playlists.empty: - standout_playlists["uniqueness_ratio"] = standout_playlists["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") - standout_playlists["max_overlap"] = standout_playlists["max_overlap"].map(lambda value: f"{value * 100:.1f}%") - st.subheader("Standout playlists") - st.dataframe(standout_playlists, use_container_width=True, hide_index=True) - export_cols = st.columns(2) if REPORT_PATH.exists(): export_cols[0].download_button( @@ -349,74 +915,28 @@ def require_database() -> bool: mime="text/csv", ) + st.subheader("Action previews") + left, right = st.columns(2) + with left: + st.caption("Playlist cleanup candidates") + cleanup = playlist_overlap.copy() + if not cleanup.empty: + cleanup["jaccard_overlap"] = cleanup["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(cleanup.head(25), use_container_width=True, hide_index=True) + with right: + st.caption("Standout playlists") + standout = playlist_signals[playlist_signals["underrated_playlist_flag"] == 1].copy() + if not standout.empty: + standout["uniqueness_ratio"] = standout["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") + standout["max_overlap"] = standout["max_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(standout.head(25), use_container_width=True, hide_index=True) + with tab_quality: - quality = { - "database": str(DB_PATH), - "manifest_source": str(row["manifest_source"]), - "manifest_generated_at": None if pd.isna(row["manifest_generated_at"]) else str(row["manifest_generated_at"]), - "manifest_raw_dir": str(row["manifest_raw_dir"]), - "manifest_json_only": bool(row["manifest_json_only"]), - "adapter": { - "adapter_name": str(row["adapter_name"]), - "adapter_version": str(row["adapter_version"]), - "client_library": str(row["client_library"]), - "client_library_version": None if pd.isna(row["client_library_version"]) else str(row["client_library_version"]), - }, - "ingestion_diagnostics": { - "liked_shortcuts_seen": safe_int(row["diagnostic_liked_shortcuts_seen"]), - "liked_tracks_written": safe_int(row["diagnostic_liked_tracks_written"]), - "liked_shortcuts_fetch_failed": safe_int(row["diagnostic_liked_shortcuts_fetch_failed"]), - "liked_shortcuts_missing_track_id": safe_int(row["diagnostic_liked_shortcuts_missing_track_id"]), - "liked_tracks_duplicate_skipped": safe_int(row["diagnostic_liked_tracks_duplicate_skipped"]), - "liked_albums_seen": safe_int(row["diagnostic_liked_albums_seen"]), - "liked_albums_written": safe_int(row["diagnostic_liked_albums_written"]), - "liked_albums_missing_id": safe_int(row["diagnostic_liked_albums_missing_id"]), - "liked_albums_duplicate_skipped": safe_int(row["diagnostic_liked_albums_duplicate_skipped"]), - "liked_artists_seen": safe_int(row["diagnostic_liked_artists_seen"]), - "liked_artists_written": safe_int(row["diagnostic_liked_artists_written"]), - "liked_artists_missing_id": safe_int(row["diagnostic_liked_artists_missing_id"]), - "liked_artists_duplicate_skipped": safe_int(row["diagnostic_liked_artists_duplicate_skipped"]), - "liked_playlists_seen": safe_int(row["diagnostic_liked_playlists_seen"]), - "liked_playlists_written": safe_int(row["diagnostic_liked_playlists_written"]), - "liked_playlists_missing_id": safe_int(row["diagnostic_liked_playlists_missing_id"]), - "liked_playlists_duplicate_skipped": safe_int(row["diagnostic_liked_playlists_duplicate_skipped"]), - "playlists_seen": safe_int(row["diagnostic_playlists_seen"]), - "playlists_written": safe_int(row["diagnostic_playlists_written"]), - "playlists_missing_id": safe_int(row["diagnostic_playlists_missing_id"]), - "playlist_fetch_fallbacks": safe_int(row["diagnostic_playlist_fetch_fallbacks"]), - "playlist_tracks_seen": safe_int(row["diagnostic_playlist_tracks_seen"]), - "playlist_tracks_written": safe_int(row["diagnostic_playlist_tracks_written"]), - "playlist_tracks_fetch_failed": safe_int(row["diagnostic_playlist_tracks_fetch_failed"]), - "playlist_tracks_missing_track_id": safe_int(row["diagnostic_playlist_tracks_missing_track_id"]), - "playlist_tracks_duplicate_skipped": safe_int(row["diagnostic_playlist_tracks_duplicate_skipped"]), - }, - "raw_counts": { - "tracks": safe_int(row["raw_tracks"]), - "artists": safe_int(row["raw_artists"]), - "albums": safe_int(row["raw_albums"]), - "playlists": safe_int(row["raw_playlists"]), - "playlist_tracks": safe_int(row["raw_playlist_tracks"]), - "user_library_events": safe_int(row["raw_user_library_events"]), - }, - "raw_checksums": { - "tracks": str(row["raw_tracks_sha256"]), - "artists": str(row["raw_artists_sha256"]), - "albums": str(row["raw_albums_sha256"]), - "playlists": str(row["raw_playlists_sha256"]), - "playlist_tracks": str(row["raw_playlist_tracks_sha256"]), - "user_library_events": str(row["raw_user_library_events_sha256"]), - }, - "calculated_at": str(row["calculated_at"]), - "top_artist_concentration": percent_label(row["top_artist_concentration"]), - "top_genre_share": percent_label(row["top_genre_share"]), - "playlist_track_slots": safe_int(row["playlist_track_slots"]), - "playlist_unique_tracks": safe_int(row["playlist_unique_tracks"]), - "busiest_month_events": safe_int(row["busiest_month_events"]), - "max_repeat_signal": safe_int(row["max_repeat_signal"]), - "last_ingested_at": None if pd.isna(row["last_ingested_at"]) else str(row["last_ingested_at"]), - "ingestion_age_hours": safe_int(row["ingestion_age_hours"]), - "stale_ingestion_flag": yes_no(row["stale_ingestion_flag"]), - } st.subheader("Local data quality signals") - st.json(quality) + q1, q2, q3, q4 = st.columns(4) + q1.metric("Raw tracks", compact_int(row["raw_tracks"])) + q2.metric("Fetch failures", compact_int(row["diagnostic_liked_shortcuts_fetch_failed"] + row["diagnostic_playlist_tracks_fetch_failed"])) + q3.metric("Duplicate skips", compact_int(row["diagnostic_liked_tracks_duplicate_skipped"] + row["diagnostic_playlist_tracks_duplicate_skipped"])) + q4.metric("Stale", yes_no(row["stale_ingestion_flag"])) + st.json(source_payload(row)) st.info("Run `make test` for schema, relationship, compile and compose checks.") diff --git a/docs/product_acceptance.md b/docs/product_acceptance.md index 9c7d881..4dab146 100644 --- a/docs/product_acceptance.md +++ b/docs/product_acceptance.md @@ -8,7 +8,7 @@ This document maps the MVP requirements to concrete repository artifacts and ver | --- | --- | --- | | Fully local run without GCP | `Makefile`, `docker-compose.local.yml`, `dbt/profiles.yml` local DuckDB target | `make acceptance-local`, `make compose-smoke-local` | | Local operator entrypoint | `make help` lists sample, real-account, Docker, export, readiness and cleanup commands | `make help` | -| Docker Compose local product path | `docker-compose.local.yml` `local` profile with one-shot build, dashboard services, `set -euo pipefail`, real-source readiness enforcement when a token is configured, and compose smoke validation of raw/product/dashboard artifacts | `make up-local`, `make compose-smoke-local` | +| Docker Compose local product path | `docker-compose.local.yml` `local` profile with one-shot build, dashboard services, `set -euo pipefail`, real-source readiness enforcement when a token is configured, and compose smoke validation of raw/product/dashboard artifacts | `make up-local`, `make compose-smoke-local`, `make compose-smoke-real` | | Yandex Music metadata ingestion | `yamusic_ingest/__main__.py`, `yamusic_ingest/yandex_client.py`; liked tracks, owned playlists, liked playlists, liked albums and liked artists where exposed by the API | `make preflight`, `make ingest`, `make acceptance-real` | | No audio download or storage | metadata-only adapter, `.gitignore`, safety scripts | `scripts/check_no_audio_artifacts.py`, `scripts/check_no_local_sensitive_artifacts.py` | | Raw normalized outputs | `tracks`, `artists`, `albums`, `playlists`, `playlist_tracks`, `user_library_events` JSONL/Parquet writers | `make ingest-sample`, `make raw-contract` | diff --git a/docs/yandex_music_local.md b/docs/yandex_music_local.md index 049eda8..1a1a643 100644 --- a/docs/yandex_music_local.md +++ b/docs/yandex_music_local.md @@ -32,7 +32,7 @@ make dashboard Streamify does not ask for your Yandex password and does not fetch a token by itself. The installed `yandex-music` Python client accepts an existing OAuth token through `Client(token).init()`, but version 2.2.0 does not expose a `device_auth` helper. -Run `make token-help` for the short in-repo reminder. +Run `make token-help` for the in-repo token setup helper. It checks whether `.env` exists, whether a token is configured, the installed `yandex-music` client version and whether that client exposes a built-in device auth flow. It never prints token values. Use an external Yandex Music OAuth token helper, then paste only the resulting token into the local `.env` file: @@ -70,6 +70,12 @@ For an automated Docker smoke test that does not call a real account, run: make compose-smoke-local ``` +After `YANDEX_MUSIC_TOKEN` is configured, verify the same Docker Compose profile against real account metadata: + +```bash +make compose-smoke-real +``` + The local dbt command is: ```bash diff --git a/scripts/smoke_compose_local.py b/scripts/smoke_compose_local.py index 5829bd1..443f8f3 100644 --- a/scripts/smoke_compose_local.py +++ b/scripts/smoke_compose_local.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import argparse import os import socket import subprocess @@ -69,13 +70,33 @@ def run_host_check(args: list[str], env: dict[str, str]) -> None: raise RuntimeError(f"host validation failed for {' '.join(args)}:\n{output}") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Smoke test the local Docker Compose product profile.") + parser.add_argument( + "--use-env-token", + action="store_true", + help="Use YANDEX_MUSIC_TOKEN from the environment and require real-account readiness.", + ) + return parser.parse_args() + + def main() -> int: + args = parse_args() port = free_port() url = f"http://127.0.0.1:{port}" env = os.environ.copy() env["STREAMIFY_DASHBOARD_PORT"] = str(port) - # Compose smoke must be deterministic and must not call a real account. - env["YANDEX_MUSIC_TOKEN"] = "" + if args.use_env_token: + if not env.get("YANDEX_MUSIC_TOKEN"): + print("ERROR: --use-env-token requires YANDEX_MUSIC_TOKEN in the environment or .env.", file=sys.stderr) + return 2 + readiness_args = ["scripts/audit_yamusic_readiness.py", "--require-real"] + mode = "Yandex Music metadata" + else: + # Default compose smoke must be deterministic and must not call a real account. + env["YANDEX_MUSIC_TOKEN"] = "" + readiness_args = ["scripts/audit_yamusic_readiness.py"] + mode = "sample metadata" try: run_compose(["up", "--build", "-d", "dashboard"], env) @@ -85,7 +106,7 @@ def main() -> int: assert_no_runtime_failures(logs.stdout + logs.stderr) for check_args in [ ["scripts/validate_yamusic_raw_contract.py"], - ["scripts/audit_yamusic_readiness.py"], + readiness_args, ["scripts/smoke_product_answers.py"], ["scripts/smoke_dashboard_content.py"], ]: @@ -102,7 +123,7 @@ def main() -> int: finally: run_compose(["down", "--remove-orphans"], env, check=False) - print(f"OK: docker compose local profile returned HTTP 200 at {url} and produced valid local product artifacts.") + print(f"OK: docker compose local profile returned HTTP 200 at {url} and produced valid local product artifacts from {mode}.") return 0 diff --git a/scripts/smoke_dashboard_content.py b/scripts/smoke_dashboard_content.py index 82e7d89..a33e00a 100644 --- a/scripts/smoke_dashboard_content.py +++ b/scripts/smoke_dashboard_content.py @@ -47,55 +47,54 @@ def main() -> int: if app.exception: fail(f"dashboard emitted st.exception elements: {values(app.exception)}") - require_contains(values(app.title), ["Streamify Self-Analytics"], "title") - require_contains( - values(app.caption), - ["Local Yandex Music metadata analytics. Audio is not downloaded or stored."], - "caption", - ) require_contains( labels(app.metric), [ "Tracks", - "Liked", "Artists", "Playlists", - "Hours", + "Library hours", "Source", "Raw tracks", - "Known genres", - "Active months", - "Underrated tracks", - "Underrated playlists", - "Top artist concentration", + "Fetch failures", + "Duplicate skips", + "Stale", + "Filtered rediscovery tracks", + "Tracks in focus", + "Liked in focus", + "Zero-playlist", ], "metrics", ) require_contains( labels(app.tabs), - ["Overview", "Periods", "Artists", "Genres", "Playlists", "Tracks", "Actions", "Data Quality"], + ["Story", "Taste Map", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"], "tabs", ) require_contains( values(app.subheader), [ - "Library snapshot", - "Activity periods", - "Genre shifts", - "Artist affinity", + "What stands out", + "Activity timeline", + "Genre fingerprint", + "Artist gravity", "Genre diversity", - "Playlist coverage", + "Genre heatmap", + "Release-era mix", + "Focus genre mix", + "Rediscovery queue", + "Repeat signals", + "Playlist health", "Playlist overlap", - "Repeated and underrated tracks", + "Explorer", "Next actions", - "Rediscovery queue", - "Playlist cleanup candidates", + "Action previews", "Local data quality signals", ], "sections", ) - if len(app.dataframe) < 8: - fail(f"dashboard should expose multiple analytical dataframes, found {len(app.dataframe)}") + if len(app.dataframe) < 6: + fail(f"dashboard should keep audit dataframes available in expanders, found {len(app.dataframe)}") if not app.json: fail("dashboard Data Quality tab should expose a JSON quality block") diff --git a/scripts/validate_yamusic_local.py b/scripts/validate_yamusic_local.py index 342dd41..56523b1 100644 --- a/scripts/validate_yamusic_local.py +++ b/scripts/validate_yamusic_local.py @@ -58,6 +58,7 @@ def main() -> int: "scripts/export_yamusic_recommendations.py", "scripts/audit_yamusic_readiness.py", "scripts/build_pages_site.py", + "scripts/yamusic_token_help.py", ".github/workflows/pages.yml", ".github/workflows/release.yml", ".github/ISSUE_TEMPLATE/agent_task.yml", @@ -73,11 +74,11 @@ def main() -> int: require_markers( "README.md", - ["Yandex Music", "DuckDB", "make help", "make status", "make ingest-sample", "make acceptance-real", "make dashboard", "genre shifts", "`local` profile", "DBT_THREADS=1", "scripts/run_with_dotenv.py", "fresh checkout", "make clean-local", "dbt target/logs/packages", "make readiness-real", "make up-local", "make snapshot", "make recommendations", "make pages-site", "GitHub Pages", "tag-based releases", "streamify_snapshot.json", "data/recommendations"], + ["Yandex Music", "DuckDB", "make help", "make status", "make ingest-sample", "make acceptance-real", "make dashboard", "genre shifts", "`local` profile", "DBT_THREADS=1", "scripts/run_with_dotenv.py", "fresh checkout", "make clean-local", "dbt target/logs/packages", "make readiness-real", "make up-local", "make compose-smoke-real", "make snapshot", "make recommendations", "make pages-site", "GitHub Pages", "tag-based releases", "streamify_snapshot.json", "data/recommendations"], ) require_markers( "docs/yandex_music_local.md", - ["YANDEX_MUSIC_TOKEN", "make acceptance-real", "make status", "make token-help", "bounded retries", "dbt build --profiles-dir . --target local", "No audio", "underrated tracks", "Real Account Acceptance Check", "Empty/private accounts", "scripts/run_with_dotenv.py", "make dbt-build", "make up-local", "streamify_empty_smoke", "--require-real", "stale Parquet cleanup", "JSONL sha256 checksums", "ingestion diagnostics", "ingestion diagnostics consistency", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "streamify_snapshot.json", "data/recommendations", "latest manifest source", "Actions tab"], + ["YANDEX_MUSIC_TOKEN", "make acceptance-real", "make status", "make token-help", "make compose-smoke-real", "built-in device auth", "bounded retries", "dbt build --profiles-dir . --target local", "No audio", "underrated tracks", "Real Account Acceptance Check", "Empty/private accounts", "scripts/run_with_dotenv.py", "make dbt-build", "make up-local", "streamify_empty_smoke", "--require-real", "stale Parquet cleanup", "JSONL sha256 checksums", "ingestion diagnostics", "ingestion diagnostics consistency", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "streamify_snapshot.json", "data/recommendations", "latest manifest source", "Actions tab"], ) require_markers( "docs/yamusic_lineage.md", @@ -85,7 +86,7 @@ def main() -> int: ) require_markers( "docs/product_acceptance.md", - ["Requirement Matrix", "make acceptance-local", "make test", "make acceptance-real", "real_account_verified", "No audio", "Yandex Music metadata ingestion", "make readiness-real", "make product-answers-smoke", "stale Parquet cleanup", "Source provenance", "data/streamify_snapshot.json", "make snapshot", "data/recommendations/*.csv", "make recommendations", "dashboard Actions tab"], + ["Requirement Matrix", "make acceptance-local", "make test", "make acceptance-real", "real_account_verified", "No audio", "Yandex Music metadata ingestion", "make readiness-real", "make compose-smoke-real", "make product-answers-smoke", "stale Parquet cleanup", "Source provenance", "data/streamify_snapshot.json", "make snapshot", "data/recommendations/*.csv", "make recommendations", "dashboard Actions tab"], ) require_markers("dbt/profiles.yml", ["type: duckdb", "target: dev", "DBT_THREADS"]) require_markers(".env.example", ["YANDEX_MUSIC_TOKEN=", "STREAMIFY_REPORT_PATH", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "DBT_THREADS=1"]) @@ -93,10 +94,10 @@ def main() -> int: "dbt/models/yamusic/schema.yml", ["stg_yamusic_tracks", "stg_yamusic_manifest", "manifest_source", "adapter_name", "client_library", "yamusic_artist_affinity", "yamusic_library_profile", "yamusic_period_activity", "yamusic_genre_periods", "yamusic_track_signals", "yamusic_playlist_signals", "stale_ingestion_flag", "diagnostic_liked_shortcuts_fetch_failed", "diagnostic_liked_tracks_duplicate_skipped", "diagnostic_liked_albums_seen", "diagnostic_liked_artists_seen", "diagnostic_liked_playlists_seen", "diagnostic_playlist_tracks_fetch_failed", "diagnostic_playlist_tracks_missing_track_id", "diagnostic_playlist_tracks_duplicate_skipped", "raw_tracks_sha256", "raw_user_library_events_sha256"], ) - require_markers("dashboard/app.py", ["Local DuckDB database is missing", "Streamify Self-Analytics", "Periods", "Genre diversity", "Genre shifts", "Actions", "Next actions", "Rediscovery queue", "Playlist cleanup candidates", "Download snapshot", "Download action queues", "RECOMMENDATIONS_DIR", "No Yandex Music library metadata was returned", "manifest_source", "adapter_name", "raw_counts", "raw_checksums", "ingestion_diagnostics", "build_data_next_actions", "apply_track_filters", "st.sidebar.multiselect", "st.sidebar.selectbox", "st.sidebar.text_input"]) + require_markers("dashboard/app.py", ["Local DuckDB database is missing", "Streamify Self-Analytics", "Streamify Taste Console", "Focus controls", "Quick lens", "Story", "Taste Map", "Mix Shift", "Rediscovery", "Activity timeline", "Genre fingerprint", "Artist gravity", "Genre diversity", "Genre heatmap", "Release-era mix", "Playlist health", "Playlist overlap", "Actions", "Next actions", "Action previews", "Rediscovery queue", "Playlist cleanup candidates", "Download snapshot", "Download action queues", "RECOMMENDATIONS_DIR", "No Yandex Music library metadata was returned", "manifest_source", "adapter_name", "raw_counts", "raw_checksums", "ingestion_diagnostics", "build_data_next_actions", "apply_focus_filters", "apply_track_filters", "st.sidebar.radio", "st.sidebar.multiselect", "st.sidebar.selectbox", "st.sidebar.text_input", "st.sidebar.slider"]) require_markers("dashboard/actions.py", ["build_data_next_actions", "YANDEX_MUSIC_TOKEN", "stale_ingestion_flag", "liked shortcuts failed", "playlist shortcuts failed", "Data is ready for exploration"]) require_markers("docker-compose.local.yml", ['profiles: ["local"]', "YANDEX_MUSIC_TOKEN", "service_completed_successfully", "DBT_THREADS", "set -euo pipefail", "READINESS_ARGS", "--require-real", "validate_yamusic_raw_contract.py", "doctor_yamusic_local.py", "export_yamusic_summary.py", "export_yamusic_snapshot.py", "export_yamusic_recommendations.py", "audit_yamusic_readiness.py"]) - require_markers("Makefile", ["help:", "token-help:", "pages-site:", "Streamify local Yandex Music self-analytics", "scripts/run_with_dotenv.py", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local up --build", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local config --quiet", "dbt-build: dbt-deps", "status", "preflight", "dashboard-smoke", "compose-smoke-local", "acceptance-real", "raw-contract", "report", "snapshot", "recommendations", "readiness", "readiness-real", "real-gate-smoke", "product-answers-smoke", "check_no_local_sensitive_artifacts.py", "check_no_audio_artifacts.py", "smoke_empty_yamusic_dbt.py", "smoke_real_gate.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "acceptance-local", "doctor_yamusic_local.py", "streamify_empty", "dbt/dbt_packages", "streamify_snapshot.json", "data/recommendations", "build_pages_site.py"]) + require_markers("Makefile", ["help:", "token-help:", "yamusic_token_help.py", "pages-site:", "Streamify local Yandex Music self-analytics", "scripts/run_with_dotenv.py", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local up --build", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local config --quiet", "dbt-build: dbt-deps", "status", "preflight", "dashboard-smoke", "compose-smoke-local", "compose-smoke-real", "acceptance-real", "raw-contract", "report", "snapshot", "recommendations", "readiness", "readiness-real", "real-gate-smoke", "product-answers-smoke", "check_no_local_sensitive_artifacts.py", "check_no_audio_artifacts.py", "smoke_empty_yamusic_dbt.py", "smoke_real_gate.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "acceptance-local", "doctor_yamusic_local.py", "streamify_empty", "dbt/dbt_packages", "streamify_snapshot.json", "data/recommendations", "build_pages_site.py"]) reject_markers("Makefile", ["include .env"]) require_markers("scripts/run_with_dotenv.py", ["load_dotenv", "os.execvpe", "--cwd", "Make parsing secrets"]) require_markers(".github/workflows/data-quality.yml", ["make test", "YANDEX_MUSIC_TOKEN", "DBT_THREADS"]) @@ -107,13 +108,14 @@ def main() -> int: require_markers("docs/project_management.md", ["Agent Lanes", "Repo/Build", "Yandex Ingestion", "QA/Integration", "v0.1.0-local-mvp"]) require_markers("docs/release_process.md", ["Release Checklist", "GitHub Pages", "sample metadata", "git tag vX.Y.Z"]) require_markers("scripts/build_pages_site.py", ["PUBLIC_DIR", "Sample Summary", "streamify_summary.md", "index.html"]) + require_markers("scripts/yamusic_token_help.py", ["TOKEN_HELPER_URL", "supports_device_auth", "token_configured", "make preflight", "make acceptance-real"]) require_markers("scripts/check_no_local_sensitive_artifacts.py", ["FORBIDDEN_TRACKED_PATHS", "data/raw/yamusic", "DuckDB files", "audio artifacts are tracked"]) require_markers("scripts/check_no_audio_artifacts.py", ["AUDIO_EXTENSIONS", "must not store audio files"]) require_markers("scripts/validate_yamusic_raw_contract.py", ["SCHEMAS", "DIAGNOSTIC_FIELDS", "validate_diagnostic_consistency", "jsonl_sha256", "sha256 mismatch", "playlist_tracks_written", "playlist_tracks_fetch_failed", "liked_tracks_duplicate_skipped", "liked_playlists_written", "playlist_tracks_duplicate_skipped", "liked shortcut diagnostics must add up", "Yandex Music raw schema contract is valid", "user_library_events", "adapter_name", "client_library"]) require_markers("scripts/smoke_empty_yamusic_dbt.py", ["yamusic_empty_smoke", "--no-partial-parse", "dbt.cli.main", "deps_command", "empty Yandex Music raw datasets", "stale_ingestion_flag", "jsonl_sha256"]) require_markers("scripts/smoke_dashboard.py", ["dashboard returned HTTP 200", "STREAMIFY_DUCKDB_PATH", "server.headless=true"]) - require_markers("scripts/smoke_dashboard_content.py", ["AppTest", "Streamify Self-Analytics", "Data Quality", "Local data quality signals", "dashboard content exposes the expected self-analytics sections"]) - require_markers("scripts/smoke_compose_local.py", ["docker compose local profile returned HTTP 200", "produced valid local product artifacts", "YANDEX_MUSIC_TOKEN", "wait_for_http", "assert_no_runtime_failures", "run_host_check", "validate_yamusic_raw_contract.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "ModuleNotFoundError"]) + require_markers("scripts/smoke_dashboard_content.py", ["AppTest", "Story", "Taste Map", "Mix Shift", "Rediscovery", "Data Quality", "Local data quality signals", "dashboard content exposes the expected self-analytics sections"]) + require_markers("scripts/smoke_compose_local.py", ["--use-env-token", "docker compose local profile returned HTTP 200", "produced valid local product artifacts", "YANDEX_MUSIC_TOKEN", "wait_for_http", "assert_no_runtime_failures", "run_host_check", "validate_yamusic_raw_contract.py", "--require-real", "smoke_product_answers.py", "smoke_dashboard_content.py", "ModuleNotFoundError"]) require_markers("scripts/smoke_real_gate.py", ["sample metadata is rejected", "--require-real", "source=yandex_music", "YANDEX_MUSIC_TOKEN"]) require_markers("scripts/smoke_product_answers.py", ["favorite artists", "repeat signals", "genre shifts", "playlist overlap", "Data Quality", "manifest_source", "adapter_name", "Raw Ingestion Counts", "Raw File Checksums", "raw_checksums", "diagnostic_liked_shortcuts_seen", "JSON snapshot", "streamify_snapshot.json", "recommendations export", "rediscovery_tracks.csv"]) require_markers("scripts/doctor_yamusic_local.py", ["_manifest.json", "stg_yamusic_manifest", "adapter metadata", "yamusic_genre_periods", "raw counts", "local Yandex Music acceptance checks passed"]) diff --git a/scripts/yamusic_token_help.py b/scripts/yamusic_token_help.py new file mode 100644 index 0000000..b42ec46 --- /dev/null +++ b/scripts/yamusic_token_help.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +import sys +from typing import Any + + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import Settings +from yamusic_ingest.yandex_client import client_metadata + +TOKEN_HELPER_URL = "https://github.com/MarshalX/yandex-music-token" + + +def yandex_music_client_capabilities() -> dict[str, Any]: + try: + from yandex_music import Client + except ImportError: + metadata = client_metadata() + return { + **metadata, + "client_importable": False, + "supports_device_auth": False, + "supports_token_only_client": False, + } + + metadata = client_metadata() + return { + **metadata, + "client_importable": True, + "supports_device_auth": hasattr(Client, "device_auth"), + "supports_token_only_client": True, + } + + +def token_status() -> dict[str, Any]: + settings = Settings.from_env() + capabilities = yandex_music_client_capabilities() + return { + "env_file_present": (ROOT / ".env").exists(), + "token_configured": bool(settings.token), + "raw_dir": str(settings.raw_dir), + "recommended_helper": TOKEN_HELPER_URL, + "next_step": "make preflight" if settings.token else "get a Yandex Music OAuth token and save YANDEX_MUSIC_TOKEN in .env", + **capabilities, + } + + +def print_human_help(status: dict[str, Any]) -> None: + print("Streamify Yandex Music token setup") + print() + print("Current local status:") + print(f" .env present: {str(status['env_file_present']).lower()}") + print(f" token configured: {str(status['token_configured']).lower()}") + print(f" yandex-music importable: {str(status['client_importable']).lower()}") + print(f" yandex-music version: {status.get('client_library_version') or 'unknown'}") + print(f" built-in device auth helper: {str(status['supports_device_auth']).lower()}") + print() + print("This project does not ask for your Yandex password and must not print or store token values outside .env.") + if status["supports_device_auth"]: + print("The installed yandex-music client reports a built-in device_auth helper, but Streamify still expects the final OAuth token in .env.") + else: + print("The installed yandex-music client accepts an existing OAuth token but does not expose a built-in token acquisition flow.") + print() + print("Steps:") + print(" 1. Ensure local config exists: cp .env.example .env") + print(f" 2. Get a Yandex Music OAuth token with a trusted helper, for example: {TOKEN_HELPER_URL}") + print(" 3. Paste only the token into .env as: YANDEX_MUSIC_TOKEN=...") + print(" 4. Validate without writing raw data: make preflight") + print(" 5. Build real-account analytics: make acceptance-real") + print() + print(f"Next step: {status['next_step']}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Print safe Yandex Music token setup guidance for Streamify.") + parser.add_argument("--json", action="store_true", help="Print machine-readable status without token values.") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + status = token_status() + if args.json: + print(json.dumps(status, ensure_ascii=False, indent=2, sort_keys=True)) + else: + print_human_help(status) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_yamusic_ingest.py b/tests/test_yamusic_ingest.py index 8fa5d37..ccbfa97 100644 --- a/tests/test_yamusic_ingest.py +++ b/tests/test_yamusic_ingest.py @@ -275,6 +275,26 @@ def test_status_payload_reports_broken_manifest_without_failing(tmp_path, monkey assert payload["manifest_read_error"] == "JSONDecodeError" +def test_token_help_reports_configuration_without_printing_token() -> None: + env = os.environ.copy() + env["YANDEX_MUSIC_TOKEN"] = "secret-token-for-test" + result = subprocess.run( + [sys.executable, "scripts/yamusic_token_help.py", "--json"], + cwd=ROOT, + env=env, + text=True, + capture_output=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["token_configured"] is True + assert payload["supports_token_only_client"] is True + assert payload["recommended_helper"] == "https://github.com/MarshalX/yandex-music-token" + assert "secret-token-for-test" not in result.stdout + + @dataclass class FakeLikesResponse: tracks: list[object] From c9c98890366715842e9bdf73724f17247c3bb415 Mon Sep 17 00:00:00 2001 From: Denis Irinyakov Date: Wed, 17 Jun 2026 21:00:55 +0300 Subject: [PATCH 2/2] Add atlas visuals and location enrichment contract --- .env.example | 1 + dashboard/app.py | 337 ++++++++++++++++++++++++++++- docs/location_enrichment.md | 126 +++++++++++ docs/product_acceptance.md | 3 + docs/yandex_music_local.md | 6 +- scripts/smoke_dashboard_content.py | 13 +- scripts/validate_yamusic_local.py | 6 +- 7 files changed, 484 insertions(+), 8 deletions(-) create mode 100644 docs/location_enrichment.md diff --git a/.env.example b/.env.example index 9f5cfdc..275977b 100644 --- a/.env.example +++ b/.env.example @@ -5,6 +5,7 @@ STREAMIFY_DUCKDB_PATH=data/streamify.duckdb STREAMIFY_REPORT_PATH=data/streamify_summary.md STREAMIFY_SNAPSHOT_PATH=data/streamify_snapshot.json STREAMIFY_RECOMMENDATIONS_DIR=data/recommendations +STREAMIFY_ENRICHMENT_DIR=data/enrichment STREAMIFY_DBT_PROFILES_DIR=dbt STREAMIFY_DASHBOARD_PORT=8501 DBT_THREADS=1 diff --git a/dashboard/app.py b/dashboard/app.py index eeb821a..f6ecc57 100644 --- a/dashboard/app.py +++ b/dashboard/app.py @@ -1,6 +1,7 @@ from __future__ import annotations import html +import math import os import sys from pathlib import Path @@ -23,6 +24,7 @@ REPORT_PATH = Path(os.getenv("STREAMIFY_REPORT_PATH", "data/streamify_summary.md")) SNAPSHOT_PATH = Path(os.getenv("STREAMIFY_SNAPSHOT_PATH", "data/streamify_snapshot.json")) RECOMMENDATIONS_DIR = Path(os.getenv("STREAMIFY_RECOMMENDATIONS_DIR", "data/recommendations")) +ENRICHMENT_DIR = Path(os.getenv("STREAMIFY_ENRICHMENT_DIR", "data/enrichment")) def safe_int(value: object) -> int: @@ -77,6 +79,12 @@ def first_record(frame: pd.DataFrame) -> dict[str, object]: return {} if frame.empty else frame.iloc[0].to_dict() +def optional_csv(path: Path) -> pd.DataFrame: + if not path.exists(): + return pd.DataFrame() + return pd.read_csv(path) + + def style_app() -> None: st.markdown( """ @@ -292,6 +300,57 @@ def hbar_chart(frame: pd.DataFrame, x: str, y: str, title: str, color: str = "#0 st.altair_chart(polish_chart(chart), use_container_width=True, theme=None) +def playlist_subway_frames(overlap: pd.DataFrame, limit: int = 14) -> tuple[pd.DataFrame, pd.DataFrame]: + if overlap.empty: + return pd.DataFrame(), pd.DataFrame() + + edges = overlap.head(40).copy() + playlist_names = pd.concat([edges["playlist_a_title"], edges["playlist_b_title"]], ignore_index=True) + node_stats = ( + playlist_names.value_counts() + .rename_axis("playlist_title") + .reset_index(name="connection_count") + .head(limit) + ) + selected = set(node_stats["playlist_title"]) + edges = edges[ + edges["playlist_a_title"].isin(selected) + & edges["playlist_b_title"].isin(selected) + ].copy() + if edges.empty: + return pd.DataFrame(), pd.DataFrame() + + overlap_mentions = pd.concat( + [ + edges[["playlist_a_title", "jaccard_overlap", "overlap_track_count"]].rename( + columns={"playlist_a_title": "playlist_title"} + ), + edges[["playlist_b_title", "jaccard_overlap", "overlap_track_count"]].rename( + columns={"playlist_b_title": "playlist_title"} + ), + ], + ignore_index=True, + ) + node_stats = node_stats.merge( + overlap_mentions.groupby("playlist_title", as_index=False).agg( + max_jaccard=("jaccard_overlap", "max"), + overlap_tracks=("overlap_track_count", "sum"), + ), + on="playlist_title", + how="left", + ) + + lane_count = min(4, max(1, math.ceil(len(node_stats.index) / 4))) + node_stats["x"] = node_stats.index // lane_count + node_stats["y"] = node_stats.index % lane_count + node_lookup = node_stats.set_index("playlist_title")[["x", "y"]] + + edges = edges.merge(node_lookup, left_on="playlist_a_title", right_index=True) + edges = edges.merge(node_lookup, left_on="playlist_b_title", right_index=True, suffixes=("_a", "_b")) + edges["pair"] = edges["playlist_a_title"] + " / " + edges["playlist_b_title"] + return node_stats, edges + + def source_payload(row: pd.Series) -> dict[str, object]: return { "database": str(DB_PATH), @@ -505,6 +564,45 @@ def source_payload(row: pd.Series) -> dict[str, object]: end """ ) +playlist_dna = query( + """ + with top_playlists as ( + select playlist_id, playlist_title, actual_track_count + from yamusic_playlist_signals + order by actual_track_count desc, playlist_title + limit 14 + ) + select + tp.playlist_title, + coalesce(t.genre, 'unknown') as genre, + count(*) as track_count + from yamusic_fact_playlist_tracks pt + join top_playlists tp on pt.playlist_id = tp.playlist_id + left join yamusic_dim_tracks t on pt.track_id = t.track_id + group by 1, 2 + order by 1, 3 desc, 2 + """ +) +time_travel = query( + """ + select + title, + artist_display, + coalesce(genre, 'unknown') as genre, + release_year, + cast(date_trunc('month', first_event_ts) as date) as first_event_month, + repeat_signal, + playlist_count, + liked + from yamusic_track_signals + where release_year is not null + and first_event_ts is not null + order by repeat_signal desc, playlist_count desc, title + limit 1500 + """ +) +artist_locations = optional_csv(ENRICHMENT_DIR / "artist_locations.csv") +user_location_events = optional_csv(ENRICHMENT_DIR / "user_location_events.csv") top_artist = first_record(top_artists) top_genre = first_record(top_genres) @@ -619,8 +717,8 @@ def source_payload(row: pd.Series) -> dict[str, object]: st.warning("No Yandex Music library metadata was returned for this run.") st.code("make ingest\nmake dbt-build", language="bash") -tab_story, tab_taste, tab_mix, tab_discovery, tab_playlists, tab_tracks, tab_actions, tab_quality = st.tabs( - ["Story", "Taste Map", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"] +tab_story, tab_taste, tab_atlas, tab_mix, tab_discovery, tab_playlists, tab_tracks, tab_actions, tab_quality = st.tabs( + ["Story", "Taste Map", "Atlas", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"] ) with tab_story: @@ -748,6 +846,220 @@ def source_payload(row: pd.Series) -> dict[str, object]: st.dataframe(artist_map.head(50), use_container_width=True, hide_index=True) st.dataframe(genre_table, use_container_width=True, hide_index=True) +with tab_atlas: + st.subheader("Genre Atlas") + st.markdown( + "
Not a geographic map: each point is a genre positioned by catalog weight and liked coverage from local metadata.
", + unsafe_allow_html=True, + ) + atlas_genres = top_genres.copy() + if not atlas_genres.empty: + atlas_genres["liked_rate"] = atlas_genres["liked_track_count"] / atlas_genres["track_count"].clip(lower=1) + atlas_genres["label"] = atlas_genres["genre"].where(atlas_genres["track_count"].rank(method="first", ascending=False) <= 8, "") + genre_points = ( + alt.Chart(atlas_genres) + .mark_circle(opacity=0.72) + .encode( + x=alt.X("track_share:Q", title="share of known library", axis=alt.Axis(format="%")), + y=alt.Y("liked_rate:Q", title="liked coverage", axis=alt.Axis(format="%")), + size=alt.Size("library_hours:Q", title="library hours", scale=alt.Scale(range=[90, 1500])), + color=alt.Color("track_count:Q", title="tracks", scale=alt.Scale(scheme="goldgreen")), + tooltip=[ + "genre:N", + "track_count:Q", + "liked_track_count:Q", + alt.Tooltip("track_share:Q", format=".1%"), + alt.Tooltip("liked_rate:Q", format=".1%"), + alt.Tooltip("library_hours:Q", format=".1f"), + ], + ) + ) + genre_labels = ( + alt.Chart(atlas_genres) + .mark_text(align="left", baseline="middle", dx=8, color="#17201f", fontSize=12) + .encode( + x=alt.X("track_share:Q"), + y=alt.Y("liked_rate:Q"), + text="label:N", + ) + ) + st.altair_chart(polish_chart((genre_points + genre_labels).properties(height=430)), use_container_width=True, theme=None) + else: + st.info("No genre profile data for the atlas.") + + st.subheader("Monthly Rhythm") + st.markdown( + "
A compact rhythm grid from period activity; color intensity is activity volume, not inferred listening location.
", + unsafe_allow_html=True, + ) + if not periods.empty: + rhythm = periods.melt( + id_vars=["activity_month"], + value_vars=["event_count", "liked_events", "playlist_events", "active_tracks", "active_artists", "active_genres"], + var_name="signal", + value_name="value", + ) + rhythm["signal"] = rhythm["signal"].map( + { + "event_count": "all events", + "liked_events": "liked events", + "playlist_events": "playlist events", + "active_tracks": "active tracks", + "active_artists": "active artists", + "active_genres": "active genres", + } + ) + rhythm_heatmap = ( + alt.Chart(rhythm) + .mark_rect(cornerRadius=2) + .encode( + x=alt.X("yearmonth(activity_month):O", title=None), + y=alt.Y("signal:N", title=None, sort=["all events", "liked events", "playlist events", "active tracks", "active artists", "active genres"]), + color=alt.Color("value:Q", title="value", scale=alt.Scale(scheme="tealblues")), + tooltip=["activity_month:T", "signal:N", "value:Q"], + ) + .properties(height=260) + ) + st.altair_chart(polish_chart(rhythm_heatmap), use_container_width=True, theme=None) + else: + st.info("No monthly period activity for the rhythm view.") + + st.subheader("Music Time Travel") + st.markdown( + "
Release year versus first library event month: a way to see whether the library is discovering old catalog or tracking current releases.
", + unsafe_allow_html=True, + ) + if not time_travel.empty: + time_travel_chart = ( + alt.Chart(time_travel) + .mark_circle(opacity=0.55) + .encode( + x=alt.X("release_year:Q", title="release year", scale=alt.Scale(zero=False)), + y=alt.Y("first_event_month:T", title="first library event"), + size=alt.Size("repeat_signal:Q", title="repeat signal", scale=alt.Scale(range=[25, 700])), + color=alt.Color("genre:N", title="genre", legend=None), + tooltip=["title:N", "artist_display:N", "genre:N", "release_year:Q", "first_event_month:T", "repeat_signal:Q", "playlist_count:Q"], + ) + .properties(height=390) + ) + st.altair_chart(polish_chart(time_travel_chart), use_container_width=True, theme=None) + else: + st.info("No release-year and event-month pairs are available for time travel.") + + st.subheader("Playlist Subway") + st.markdown( + "
Lines connect playlists with shared tracks; thicker lines mean higher Jaccard overlap.
", + unsafe_allow_html=True, + ) + subway_nodes, subway_edges = playlist_subway_frames(playlist_overlap) + if not subway_nodes.empty and not subway_edges.empty: + edge_chart = ( + alt.Chart(subway_edges) + .mark_rule(opacity=0.58, color="#66736f") + .encode( + x=alt.X("x_a:Q", title=None, axis=None), + y=alt.Y("y_a:Q", title=None, axis=None, scale=alt.Scale(reverse=True)), + x2="x_b:Q", + y2="y_b:Q", + size=alt.Size("jaccard_overlap:Q", title="overlap", scale=alt.Scale(range=[1, 9])), + tooltip=[ + "pair:N", + "overlap_track_count:Q", + alt.Tooltip("jaccard_overlap:Q", format=".1%"), + ], + ) + ) + node_chart = ( + alt.Chart(subway_nodes) + .mark_circle(color="#0f766e", opacity=0.88) + .encode( + x=alt.X("x:Q", title=None, axis=None), + y=alt.Y("y:Q", title=None, axis=None, scale=alt.Scale(reverse=True)), + size=alt.Size("connection_count:Q", title="connections", scale=alt.Scale(range=[180, 900])), + tooltip=[ + "playlist_title:N", + "connection_count:Q", + "overlap_tracks:Q", + alt.Tooltip("max_jaccard:Q", format=".1%"), + ], + ) + ) + node_labels = ( + alt.Chart(subway_nodes) + .mark_text(align="left", baseline="middle", dx=12, color="#17201f", fontSize=12) + .encode(x="x:Q", y=alt.Y("y:Q", scale=alt.Scale(reverse=True)), text="playlist_title:N") + ) + st.altair_chart(polish_chart((edge_chart + node_chart + node_labels).properties(height=380)), use_container_width=True, theme=None) + else: + st.info("No playlist overlap edges are available for the subway view.") + + st.subheader("Playlist DNA") + st.markdown( + "
A matrix of playlist composition by genre. This is more useful than overlap when only a few playlists share tracks directly.
", + unsafe_allow_html=True, + ) + if not playlist_dna.empty: + dna_chart = ( + alt.Chart(playlist_dna) + .mark_rect() + .encode( + x=alt.X("genre:N", title=None, sort="-y"), + y=alt.Y("playlist_title:N", title=None, sort="-x"), + color=alt.Color("track_count:Q", title="tracks", scale=alt.Scale(scheme="goldgreen")), + tooltip=["playlist_title:N", "genre:N", "track_count:Q"], + ) + .properties(height=max(280, min(620, playlist_dna["playlist_title"].nunique() * 34))) + ) + st.altair_chart(polish_chart(dna_chart), use_container_width=True, theme=None) + else: + st.info("No playlist DNA rows are available.") + + st.subheader("Geo Atlas readiness") + st.markdown( + "
Maps stay opt-in. Current Yandex Music metadata has no listening location, so map layers need user-provided location timelines or artist-location enrichment.
", + unsafe_allow_html=True, + ) + geo_cols = st.columns(2) + geo_cols[0].metric("Artist location rows", compact_int(len(artist_locations.index))) + geo_cols[1].metric("User location rows", compact_int(len(user_location_events.index))) + if not artist_locations.empty and {"latitude", "longitude"}.issubset(artist_locations.columns): + st.caption("Artist-associated geography. This is not evidence of where you listened.") + artist_map = artist_locations.rename(columns={"latitude": "lat", "longitude": "lon"}).copy() + st.map(artist_map[["lat", "lon"]].dropna()) + elif not user_location_events.empty and {"latitude", "longitude"}.issubset(user_location_events.columns): + st.caption("User-provided location timeline. Events need timestamp matching before this becomes a listening map.") + user_map = user_location_events.rename(columns={"latitude": "lat", "longitude": "lon"}).copy() + st.map(user_map[["lat", "lon"]].dropna()) + else: + st.info( + "Add `data/enrichment/artist_locations.csv` or `data/enrichment/user_location_events.csv` " + "with `latitude` and `longitude` columns to unlock map previews." + ) + st.code( + "artist_name,city,country_code,latitude,longitude,confidence,source\n" + "Oxxxymiron,London,GB,51.5072,-0.1276,0.6,manual\n\n" + "started_at,ended_at,city,country_code,latitude,longitude,confidence,source\n" + "2024-08-01T00:00:00Z,2024-08-15T00:00:00Z,Tbilisi,GE,41.7151,44.8271,0.8,manual_city_timeline", + language="csv", + ) + + with st.expander("Atlas source rows"): + if not atlas_genres.empty: + atlas_table = atlas_genres.drop(columns=["label"], errors="ignore").copy() + atlas_table["track_share"] = atlas_table["track_share"].map(lambda value: f"{value * 100:.1f}%") + atlas_table["liked_rate"] = atlas_table["liked_rate"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(atlas_table, use_container_width=True, hide_index=True) + if not periods.empty: + st.dataframe(periods, use_container_width=True, hide_index=True) + if not playlist_overlap.empty: + overlap_table = playlist_overlap.copy() + overlap_table["jaccard_overlap"] = overlap_table["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(overlap_table, use_container_width=True, hide_index=True) + if not playlist_dna.empty: + st.dataframe(playlist_dna, use_container_width=True, hide_index=True) + if not time_travel.empty: + st.dataframe(time_travel.head(200), use_container_width=True, hide_index=True) + with tab_mix: st.subheader("Genre heatmap") st.markdown("
A Wrapped-style view of when genres entered the library metadata stream.
", unsafe_allow_html=True) @@ -811,6 +1123,27 @@ def source_payload(row: pd.Series) -> dict[str, object]: c1.metric("Filtered rediscovery tracks", compact_int(len(rediscovery.index))) c2.metric("Zero-playlist liked tracks", compact_int((rediscovery["playlist_count"] == 0).sum() if not rediscovery.empty else 0)) c3.metric("Top repeat in queue", compact_int(rediscovery["repeat_signal"].max() if not rediscovery.empty else 0)) + + st.subheader("Rediscovery quadrants") + st.markdown( + "
High repeat and low playlist coverage is the most actionable corner: tracks you seem to return to but have not organized.
", + unsafe_allow_html=True, + ) + if not filtered_tracks.empty: + quadrant_chart = ( + alt.Chart(filtered_tracks) + .mark_circle(opacity=0.56) + .encode( + x=alt.X("playlist_count:Q", title="playlist coverage"), + y=alt.Y("repeat_signal:Q", title="repeat signal"), + size=alt.Size("event_count:Q", title="library events", scale=alt.Scale(range=[20, 650])), + color=alt.Color("liked:N", title="liked", scale=alt.Scale(range=["#a16207", "#0f766e"])), + tooltip=["title:N", "artist_display:N", "genre:N", "playlist_count:Q", "repeat_signal:Q", "event_count:Q"], + ) + .properties(height=360) + ) + st.altair_chart(polish_chart(quadrant_chart), use_container_width=True, theme=None) + render_track_cards(rediscovery, limit=8) st.subheader("Repeat signals") diff --git a/docs/location_enrichment.md b/docs/location_enrichment.md new file mode 100644 index 0000000..e59f50b --- /dev/null +++ b/docs/location_enrichment.md @@ -0,0 +1,126 @@ +# Future Location Enrichment Contract + +This document sketches a future, optional geo/location data contract for Streamify. It is not part of the current Yandex Music local ingestion path and should not be treated as an implemented feature. + +## Why Yandex Music Metadata Is Not Enough + +The current Yandex Music adapter reads account-visible metadata through the `yandex-music` Python client: tracks, artists, albums, playlists, playlist membership, liked markers and derived library events. That metadata does not include a stable listening location field. + +Even when timestamps are present, they describe a music library action such as a like, playlist membership or account-visible history item. They do not prove where the user was when listening. Region, catalog availability, account locale, artist country or playlist language are not user location signals. + +## Safe User Location Sources + +Location enrichment must be user-supplied, optional and separable from music ingestion. Possible sources over time: + +- Google Timeline / Google Takeout Location History, if the user has it enabled and explicitly exports it. +- iOS Significant Locations, noted as a possible on-device source but not practically exportable for reliable Streamify ingestion. +- Photo EXIF GPS coordinates, only from user-selected photos and only with explicit consent. +- Calendar or travel exports, such as flight, hotel, event or trip records that the user explicitly provides. +- A manual city timeline maintained by the user, for example date ranges such as `2025-06-01` to `2025-06-14` in `Tbilisi, Georgia`. +- Network or IP logs only if the user explicitly provides them and understands their limits. Streamify should not collect IP logs implicitly. + +Sources that are not safe defaults: + +- Inferring user location from artist origin, track language, genre, playlist name or Yandex account region. +- Scraping device, browser or network location without a deliberate user import step. +- Treating coarse country or IP geolocation as precise movement history. + +## `user_location_events` + +`user_location_events` should represent where the user may have been during a time interval, with confidence and provenance. + +Suggested fields: + +| Field | Type | Notes | +| --- | --- | --- | +| `location_event_id` | string | Stable hash of source, source row id and normalized time interval. | +| `source` | string | `google_takeout`, `photo_exif`, `calendar_travel`, `manual_city_timeline`, `network_ip_log`, or another explicit import source. | +| `source_record_id` | string | Optional source-local identifier, redacted where needed. | +| `started_at` | timestamp | Inclusive UTC start time. | +| `ended_at` | timestamp | Exclusive UTC end time; may equal `started_at` for point observations. | +| `timezone` | string | IANA timezone when known. | +| `latitude` | double | Optional; omit or round when only coarse location is needed. | +| `longitude` | double | Optional; omit or round when only coarse location is needed. | +| `city` | string | Optional normalized city. | +| `region` | string | Optional state/province/region. | +| `country_code` | string | Optional ISO 3166-1 alpha-2 code. | +| `precision_meters` | integer | Approximate spatial precision or bucket size. | +| `confidence` | double | `0.0` to `1.0`; manual ranges and IP-derived locations should usually be lower confidence than direct GPS. | +| `is_inferred` | boolean | True when the row is inferred from an indirect source such as calendar travel or IP logs. | +| `consent_scope` | string | User-approved scope, such as `analytics_only` or `city_level_only`. | +| `imported_at` | timestamp | Time Streamify imported the row. | + +The table should allow overlapping rows because real-world sources conflict. Downstream joins must choose a deterministic tie-break rule instead of assuming one location per timestamp. + +## `artist_locations` + +`artist_locations` should describe artist-associated places, not user listening places. It can support questions such as geographic diversity of artists, but it must never be used as evidence of where the user listened. + +Suggested fields: + +| Field | Type | Notes | +| --- | --- | --- | +| `artist_location_id` | string | Stable hash of artist id, source and normalized location. | +| `artist_id` | string | Streamify/Yandex artist identifier when available. | +| `artist_name` | string | Display name for review and fallback matching. | +| `source` | string | Discogs, MusicBrainz, Wikidata, manual curation or another cited source. | +| `source_url` | string | Optional provenance URL. | +| `location_type` | string | `origin`, `formed_in`, `based_in`, `birthplace`, `scene`, or `label_location`. | +| `started_at` | date | Optional date when the association began. | +| `ended_at` | date | Optional date when the association ended. | +| `city` | string | Optional normalized city. | +| `region` | string | Optional region. | +| `country_code` | string | Optional ISO 3166-1 alpha-2 code. | +| `latitude` | double | Optional coarse coordinate for mapping. | +| `longitude` | double | Optional coarse coordinate for mapping. | +| `confidence` | double | `0.0` to `1.0`; biographies and crowd-sourced sources require care. | +| `notes` | string | Optional caveat for ambiguous or multi-location artists. | + +## Joining Location To Library Events + +The future join should be timestamp-based and explicit about uncertainty: + +1. Normalize all `user_library_events.event_at`, `user_location_events.started_at` and `user_location_events.ended_at` values to UTC. +2. For each library event with a usable timestamp, find location events where `started_at <= event_at < ended_at`. +3. If no interval matches, optionally search nearest point observations within a configured window, such as 30 minutes for GPS-like data or one day for manual city timelines. +4. Rank candidates by source trust, precision, confidence, non-inferred status and distance from the observation time. +5. Persist the selected match in a bridge table such as `user_library_event_locations`, including `location_event_id`, `match_method`, `match_confidence`, `time_delta_seconds` and `location_precision_meters`. +6. Keep unmatched music events. Missing location is expected and should not fail ingestion. + +Recommended bridge fields: + +| Field | Type | Notes | +| --- | --- | --- | +| `event_location_id` | string | Stable hash of library event and selected location event. | +| `library_event_id` | string | Existing music/library event id. | +| `location_event_id` | string | Selected user location event id. | +| `match_method` | string | `interval_exact`, `nearest_point`, `manual_range`, `calendar_range`, `ip_coarse`, or similar. | +| `match_confidence` | double | Combined confidence after tie-breaking. | +| `time_delta_seconds` | integer | `0` for interval matches; signed delta for nearest-point matches. | +| `location_precision_meters` | integer | Spatial precision used for the match. | + +## Privacy Constraints + +- Location imports must be opt-in and separate from `YANDEX_MUSIC_TOKEN` setup. +- Raw high-precision location files should remain local, ignored by git and excluded from reports by default. +- Default analytics should use city, region or country buckets instead of exact coordinates. +- Users must be able to delete imported location data without deleting music metadata. +- Reports, snapshots and dashboards should label location-derived metrics as optional and source-dependent. +- The manifest should store row counts, source names and checksums, not raw coordinates or sensitive source identifiers. +- Consent scope should travel with derived rows so a city-only import is not later used for exact maps. +- IP-derived rows must be marked inferred and coarse, and must never be collected implicitly. + +## Inference Caveats + +Location enrichment can answer "what music-library event happened while the user's provided location data suggests they were in this place?" It cannot prove the user listened there unless the source event itself is a trustworthy listening event and the location source is accurate for the same time. + +Important caveats: + +- Library likes and playlist edits can happen long after listening. +- Manual city timelines are useful for coarse trip context but poor for exact movement. +- Calendar travel can describe intended plans, not actual presence. +- GPS and photo EXIF can be sparse and biased toward moments when photos were taken. +- IP geolocation can be wrong because of VPNs, mobile carriers, corporate networks and provider databases. +- Artist location is artist metadata, not user location. + +Any product surface using this contract should show confidence and source labels, avoid precise claims, and prefer language such as "associated with your provided location timeline" over "listened in." diff --git a/docs/product_acceptance.md b/docs/product_acceptance.md index 4dab146..d79a647 100644 --- a/docs/product_acceptance.md +++ b/docs/product_acceptance.md @@ -17,6 +17,7 @@ This document maps the MVP requirements to concrete repository artifacts and ver | Idempotent local ingestion | overwrite-per-run raw writer, stale Parquet cleanup, and `_manifest.json` row counts | repeated `make ingest-sample`, `make raw-contract` | | Data quality checks | dbt schema tests, raw contract, doctor, safety checks, empty-account smoke | `make test` | | Practical self-analytics answers | `yamusic_artist_affinity`, `yamusic_genre_periods`, `yamusic_track_signals`, `yamusic_playlist_signals`, `yamusic_library_profile`, dashboard genre/liked/search filters, dashboard Actions/Data Quality tabs, dashboard content smoke, `data/streamify_summary.md`, `data/streamify_snapshot.json`, `data/recommendations/*.csv` | `make product-answers-smoke`, `make dashboard-smoke`, `make report`, `make snapshot`, `make recommendations`, `make dashboard` | +| Future location enrichment is documented but not implemented | `docs/location_enrichment.md`; current Yandex Music metadata has no listening-location field, and any future joins require explicit user-provided location timelines plus confidence/provenance labels | Documentation review only | | Empty/private account handling | typed empty raw files and empty dbt smoke | `scripts/smoke_empty_yamusic_dbt.py`, `make test` | | Token safety | `.env.example`, `.gitignore`, no token in manifest/report/status, preflight without raw writes | `make status`, `make preflight`, `scripts/check_no_local_sensitive_artifacts.py` | @@ -53,9 +54,11 @@ The readiness audit must report `"real_account_verified": true` before the real- | Can I open action queues in a spreadsheet? | `data/recommendations/*.csv`, `make recommendations` | | Can I reuse the answers outside the dashboard? | `data/streamify_snapshot.json`, `make snapshot` | | Is my local data trustworthy? | dashboard Data Quality tab, JSON snapshot quality block, `make doctor`, `make readiness` | +| Can Streamify analyze music by where I was? | Future-only contract in `docs/location_enrichment.md`; not part of current acceptance | ## Known Product Limits - Yandex Music availability depends on the unofficial `yandex-music` package and account-visible metadata. - Listening timestamps/history are used only when exposed by the account/API response; otherwise the product falls back to liked-track and playlist-membership events. +- Listening location is not present in current Yandex Music metadata. Future geo enrichment must be opt-in, source-labeled and joined to library events by timestamp with caveats around confidence and inference. - The dashboard and report are analytics over metadata and derived events, not audio playback or audio feature extraction. diff --git a/docs/yandex_music_local.md b/docs/yandex_music_local.md index 1a1a643..e84a41a 100644 --- a/docs/yandex_music_local.md +++ b/docs/yandex_music_local.md @@ -134,7 +134,9 @@ The local marts are designed around practical self-analytics questions: - underrated playlists: high-uniqueness, low-overlap playlists in `yamusic_playlist_signals`; - playlist overlap: pairwise Jaccard similarity in `yamusic_playlist_overlap`. -The dashboard includes sidebar filters for genre, liked state and track/artist/album search. These filters apply to track-level discovery views such as the library snapshot and repeated/underrated tracks. The Actions tab turns the marts into next-step queues: real-account/data-quality actions, rediscovery tracks, playlist cleanup candidates, standout playlists, and download buttons for the markdown summary, JSON snapshot and recommendations CSV files. +The dashboard includes sidebar focus controls for genre, liked state, text search, release years, repeat signal and playlist coverage. These filters apply to track-level discovery views such as repeated/underrated tracks and the visual Explorer. The Atlas tab adds chart-first views for genre shape, monthly rhythm, release-year time travel, playlist subway overlap and playlist DNA. The Actions tab turns the marts into next-step queues: real-account/data-quality actions, rediscovery tracks, playlist cleanup candidates, standout playlists, and download buttons for the markdown summary, JSON snapshot and recommendations CSV files. + +Location-aware analytics are intentionally out of scope for the current Yandex Music metadata adapter. Yandex Music library metadata does not provide a stable listening location, and account region, playlist language, genre or artist origin must not be treated as user location. The dashboard can show Geo Atlas readiness and optional map previews when user-supplied CSV enrichment exists under `STREAMIFY_ENRICHMENT_DIR`, but those maps must be labeled as artist-associated geography or user-provided location timeline data. A future opt-in contract for user-supplied location timelines and artist-associated places is documented in [Future Location Enrichment Contract](location_enrichment.md). `make report` exports the same marts into two portable artifacts: @@ -179,6 +181,8 @@ Yandex Music does not provide a stable public API for every analytics use case. If the real integration returns less data than expected, use `make ingest-sample` to verify the local pipeline and dashboard independently from account access. +The local product does not infer where listening happened. Future location enrichment must come from explicit user-provided sources, such as Google Takeout Location History when available, selected photo EXIF, calendar/travel exports, a manual city timeline, or network/IP logs only when the user deliberately supplies them. iOS Significant Locations may exist on-device but are not practically exportable for this product. See [Future Location Enrichment Contract](location_enrichment.md) for schema ideas, privacy constraints and timestamp-join caveats. + ## Reset ```bash diff --git a/scripts/smoke_dashboard_content.py b/scripts/smoke_dashboard_content.py index a33e00a..bc1b57e 100644 --- a/scripts/smoke_dashboard_content.py +++ b/scripts/smoke_dashboard_content.py @@ -63,12 +63,14 @@ def main() -> int: "Tracks in focus", "Liked in focus", "Zero-playlist", + "Artist location rows", + "User location rows", ], "metrics", ) require_contains( labels(app.tabs), - ["Story", "Taste Map", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"], + ["Story", "Taste Map", "Atlas", "Mix Shift", "Rediscovery", "Playlists", "Explorer", "Actions", "Data Quality"], "tabs", ) require_contains( @@ -79,10 +81,17 @@ def main() -> int: "Genre fingerprint", "Artist gravity", "Genre diversity", + "Genre Atlas", + "Monthly Rhythm", + "Music Time Travel", + "Playlist Subway", + "Playlist DNA", + "Geo Atlas readiness", "Genre heatmap", "Release-era mix", "Focus genre mix", "Rediscovery queue", + "Rediscovery quadrants", "Repeat signals", "Playlist health", "Playlist overlap", @@ -93,7 +102,7 @@ def main() -> int: ], "sections", ) - if len(app.dataframe) < 6: + if len(app.dataframe) < 8: fail(f"dashboard should keep audit dataframes available in expanders, found {len(app.dataframe)}") if not app.json: fail("dashboard Data Quality tab should expose a JSON quality block") diff --git a/scripts/validate_yamusic_local.py b/scripts/validate_yamusic_local.py index 56523b1..213f484 100644 --- a/scripts/validate_yamusic_local.py +++ b/scripts/validate_yamusic_local.py @@ -89,12 +89,12 @@ def main() -> int: ["Requirement Matrix", "make acceptance-local", "make test", "make acceptance-real", "real_account_verified", "No audio", "Yandex Music metadata ingestion", "make readiness-real", "make compose-smoke-real", "make product-answers-smoke", "stale Parquet cleanup", "Source provenance", "data/streamify_snapshot.json", "make snapshot", "data/recommendations/*.csv", "make recommendations", "dashboard Actions tab"], ) require_markers("dbt/profiles.yml", ["type: duckdb", "target: dev", "DBT_THREADS"]) - require_markers(".env.example", ["YANDEX_MUSIC_TOKEN=", "STREAMIFY_REPORT_PATH", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "DBT_THREADS=1"]) + require_markers(".env.example", ["YANDEX_MUSIC_TOKEN=", "STREAMIFY_REPORT_PATH", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "STREAMIFY_ENRICHMENT_DIR", "DBT_THREADS=1"]) require_markers( "dbt/models/yamusic/schema.yml", ["stg_yamusic_tracks", "stg_yamusic_manifest", "manifest_source", "adapter_name", "client_library", "yamusic_artist_affinity", "yamusic_library_profile", "yamusic_period_activity", "yamusic_genre_periods", "yamusic_track_signals", "yamusic_playlist_signals", "stale_ingestion_flag", "diagnostic_liked_shortcuts_fetch_failed", "diagnostic_liked_tracks_duplicate_skipped", "diagnostic_liked_albums_seen", "diagnostic_liked_artists_seen", "diagnostic_liked_playlists_seen", "diagnostic_playlist_tracks_fetch_failed", "diagnostic_playlist_tracks_missing_track_id", "diagnostic_playlist_tracks_duplicate_skipped", "raw_tracks_sha256", "raw_user_library_events_sha256"], ) - require_markers("dashboard/app.py", ["Local DuckDB database is missing", "Streamify Self-Analytics", "Streamify Taste Console", "Focus controls", "Quick lens", "Story", "Taste Map", "Mix Shift", "Rediscovery", "Activity timeline", "Genre fingerprint", "Artist gravity", "Genre diversity", "Genre heatmap", "Release-era mix", "Playlist health", "Playlist overlap", "Actions", "Next actions", "Action previews", "Rediscovery queue", "Playlist cleanup candidates", "Download snapshot", "Download action queues", "RECOMMENDATIONS_DIR", "No Yandex Music library metadata was returned", "manifest_source", "adapter_name", "raw_counts", "raw_checksums", "ingestion_diagnostics", "build_data_next_actions", "apply_focus_filters", "apply_track_filters", "st.sidebar.radio", "st.sidebar.multiselect", "st.sidebar.selectbox", "st.sidebar.text_input", "st.sidebar.slider"]) + require_markers("dashboard/app.py", ["Local DuckDB database is missing", "Streamify Self-Analytics", "Streamify Taste Console", "Focus controls", "Quick lens", "Story", "Taste Map", "Atlas", "Mix Shift", "Rediscovery", "Activity timeline", "Genre fingerprint", "Artist gravity", "Genre diversity", "Genre Atlas", "Monthly Rhythm", "Music Time Travel", "Playlist Subway", "Playlist DNA", "Geo Atlas readiness", "artist_locations.csv", "user_location_events.csv", "Genre heatmap", "Release-era mix", "Playlist health", "Playlist overlap", "Actions", "Next actions", "Action previews", "Rediscovery queue", "Rediscovery quadrants", "Playlist cleanup candidates", "Download snapshot", "Download action queues", "RECOMMENDATIONS_DIR", "ENRICHMENT_DIR", "No Yandex Music library metadata was returned", "manifest_source", "adapter_name", "raw_counts", "raw_checksums", "ingestion_diagnostics", "build_data_next_actions", "apply_focus_filters", "apply_track_filters", "st.sidebar.radio", "st.sidebar.multiselect", "st.sidebar.selectbox", "st.sidebar.text_input", "st.sidebar.slider"]) require_markers("dashboard/actions.py", ["build_data_next_actions", "YANDEX_MUSIC_TOKEN", "stale_ingestion_flag", "liked shortcuts failed", "playlist shortcuts failed", "Data is ready for exploration"]) require_markers("docker-compose.local.yml", ['profiles: ["local"]', "YANDEX_MUSIC_TOKEN", "service_completed_successfully", "DBT_THREADS", "set -euo pipefail", "READINESS_ARGS", "--require-real", "validate_yamusic_raw_contract.py", "doctor_yamusic_local.py", "export_yamusic_summary.py", "export_yamusic_snapshot.py", "export_yamusic_recommendations.py", "audit_yamusic_readiness.py"]) require_markers("Makefile", ["help:", "token-help:", "yamusic_token_help.py", "pages-site:", "Streamify local Yandex Music self-analytics", "scripts/run_with_dotenv.py", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local up --build", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local config --quiet", "dbt-build: dbt-deps", "status", "preflight", "dashboard-smoke", "compose-smoke-local", "compose-smoke-real", "acceptance-real", "raw-contract", "report", "snapshot", "recommendations", "readiness", "readiness-real", "real-gate-smoke", "product-answers-smoke", "check_no_local_sensitive_artifacts.py", "check_no_audio_artifacts.py", "smoke_empty_yamusic_dbt.py", "smoke_real_gate.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "acceptance-local", "doctor_yamusic_local.py", "streamify_empty", "dbt/dbt_packages", "streamify_snapshot.json", "data/recommendations", "build_pages_site.py"]) @@ -114,7 +114,7 @@ def main() -> int: require_markers("scripts/validate_yamusic_raw_contract.py", ["SCHEMAS", "DIAGNOSTIC_FIELDS", "validate_diagnostic_consistency", "jsonl_sha256", "sha256 mismatch", "playlist_tracks_written", "playlist_tracks_fetch_failed", "liked_tracks_duplicate_skipped", "liked_playlists_written", "playlist_tracks_duplicate_skipped", "liked shortcut diagnostics must add up", "Yandex Music raw schema contract is valid", "user_library_events", "adapter_name", "client_library"]) require_markers("scripts/smoke_empty_yamusic_dbt.py", ["yamusic_empty_smoke", "--no-partial-parse", "dbt.cli.main", "deps_command", "empty Yandex Music raw datasets", "stale_ingestion_flag", "jsonl_sha256"]) require_markers("scripts/smoke_dashboard.py", ["dashboard returned HTTP 200", "STREAMIFY_DUCKDB_PATH", "server.headless=true"]) - require_markers("scripts/smoke_dashboard_content.py", ["AppTest", "Story", "Taste Map", "Mix Shift", "Rediscovery", "Data Quality", "Local data quality signals", "dashboard content exposes the expected self-analytics sections"]) + require_markers("scripts/smoke_dashboard_content.py", ["AppTest", "Story", "Taste Map", "Atlas", "Mix Shift", "Rediscovery", "Data Quality", "Geo Atlas readiness", "dashboard content exposes the expected self-analytics sections"]) require_markers("scripts/smoke_compose_local.py", ["--use-env-token", "docker compose local profile returned HTTP 200", "produced valid local product artifacts", "YANDEX_MUSIC_TOKEN", "wait_for_http", "assert_no_runtime_failures", "run_host_check", "validate_yamusic_raw_contract.py", "--require-real", "smoke_product_answers.py", "smoke_dashboard_content.py", "ModuleNotFoundError"]) require_markers("scripts/smoke_real_gate.py", ["sample metadata is rejected", "--require-real", "source=yandex_music", "YANDEX_MUSIC_TOKEN"]) require_markers("scripts/smoke_product_answers.py", ["favorite artists", "repeat signals", "genre shifts", "playlist overlap", "Data Quality", "manifest_source", "adapter_name", "Raw Ingestion Counts", "Raw File Checksums", "raw_checksums", "diagnostic_liked_shortcuts_seen", "JSON snapshot", "streamify_snapshot.json", "recommendations export", "rediscovery_tracks.csv"])